zhifu gao
2023-05-05 e7c379d0a09b34b651b824863ca8e7972b515c71
Merge pull request #461 from alibaba-damo-academy/dev_sx

Update contextual finetune and bias_grad_times
3 files changed, 23 lines changed
egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/finetune.py (3 lines changed)
funasr/tasks/abs_task.py (6 lines changed)
funasr/train/trainer.py (14 lines changed)
egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/finetune.py
@@ -14,6 +14,7 @@
     ds_dict = MsDataset.load(params.data_path)
     kwargs = dict(
         model=params.model,
+        model_revision="v1.0.2",
         data_dir=ds_dict,
         dataset_type=params.dataset_type,
         work_dir=params.output_dir,
@@ -31,6 +32,6 @@
     params.dataset_type = "large"                   # finetuning the contextual paraformer model requires dataset_type="large"
     params.batch_bins = 200000                      # batch size; with dataset_type="small", batch_bins is in fbank feature frames; with dataset_type="large", batch_bins is in milliseconds
     params.max_epoch = 20                           # maximum number of training epochs
-    params.lr = 0.00005                             # set the learning rate
+    params.lr = 0.0002                              # set the learning rate
     modelscope_finetune(params)
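As a rough sanity check on the units described in the comments above, the snippet below works out how much audio a batch of batch_bins = 200000 corresponds to under each dataset_type; the 10 ms fbank frame shift is the usual default and is an assumption here, not something this diff sets.

# Rough sense of what batch_bins = 200000 means under each dataset_type
# (assuming the typical 10 ms fbank frame shift; adjust if your feature config differs).
batch_bins = 200000
seconds_per_batch_large = batch_bins / 1000        # "large": batch_bins counts milliseconds -> 200.0 s of audio
seconds_per_batch_small = batch_bins * 10 / 1000   # "small": batch_bins counts fbank frames -> 2000.0 s of audio
print(seconds_per_batch_large, seconds_per_batch_small)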
funasr/tasks/abs_task.py
@@ -549,6 +549,12 @@
             help="The number of gradient accumulation",
         )
+        group.add_argument(
+            "--bias_grad_times",
+            type=float,
+            default=1.0,
+            help="To scale the gradient of contextual related params",
+        )
         group.add_argument(
             "--no_forward_run",
             type=str2bool,
             default=False,
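A standalone sketch of the new option is below; the parser here is only for illustration (FunASR registers the argument through the trainer argument group in abs_task.py, and the parsed value is later read by the trainer as options.bias_grad_times):

import argparse

# Minimal stand-in for the trainer argument group; only the new flag is shown.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--bias_grad_times",
    type=float,
    default=1.0,
    help="To scale the gradient of contextual related params",
)

options = parser.parse_args(["--bias_grad_times", "2.0"])
print(options.bias_grad_times)  # 2.0; any value != 1.0 enables the gradient scaling added in trainer.py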
funasr/train/trainer.py
@@ -95,6 +95,7 @@
     use_pai: bool
     oss_bucket: Union[oss2.Bucket, None]
     batch_interval: int
+    bias_grad_times: float

 class Trainer:
     """Trainer having a optimizer.
@@ -546,8 +547,11 @@
         no_forward_run = options.no_forward_run
         ngpu = options.ngpu
         use_wandb = options.use_wandb
+        bias_grad_times = options.bias_grad_times
         distributed = distributed_option.distributed
+        if bias_grad_times != 1.0:
+            logging.warning("Using bias_grad_times: {} for gradient scaling".format(bias_grad_times))

         if log_interval is None:
             try:
                 log_interval = max(len(iterator) // 20, 10)
@@ -690,6 +694,16 @@
                         scale_factor=0.55,
                     )

+                # for contextual training
+                if bias_grad_times != 1.0:
+                    # contextual related parameter names
+                    cr_pnames = ["bias_encoder", "bias_embed", "decoder.bias_decoder", "decoder.bias_output"]
+                    for name, param in model.named_parameters():
+                        for cr_pname in cr_pnames:
+                            if cr_pname in name:
+                                param.grad *= bias_grad_times
+                                continue
+
                 # compute the gradient norm to check if it is normal or not
                 grad_norm = torch.nn.utils.clip_grad_norm_(
                     model.parameters(),
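To see the effect of the hunk above in isolation, here is a minimal, self-contained sketch of the same per-parameter gradient scaling; the toy modules exist only so that parameter names contain the fragments the trainer matches on (bias_encoder, decoder.bias_output, ...), and the loss is a placeholder, not FunASR code:

import torch

# Toy stand-in for the contextual model: only the parameter names matter,
# because the trainer selects parameters to rescale by substring match.
model = torch.nn.ModuleDict({
    "bias_encoder": torch.nn.Linear(4, 4),
    "decoder": torch.nn.ModuleDict({"bias_output": torch.nn.Linear(4, 4)}),
    "encoder": torch.nn.Linear(4, 4),  # not contextual; its gradients are left untouched
})

bias_grad_times = 2.0  # in the trainer this comes from options.bias_grad_times
cr_pnames = ["bias_encoder", "bias_embed", "decoder.bias_decoder", "decoder.bias_output"]

x = torch.randn(8, 4)
loss = (model["bias_encoder"](x).sum()
        + model["decoder"]["bias_output"](x).sum()
        + model["encoder"](x).sum())
loss.backward()

# Scale the gradients of contextual-related parameters, mirroring the hunk above,
# then clip as usual; clipping sees the already-rescaled gradients.
for name, param in model.named_parameters():
    if param.grad is not None and any(p in name for p in cr_pnames):
        param.grad *= bias_grad_times

grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)

The any(...) form scales each matching parameter exactly once; the committed loop gives the same result as long as no parameter name contains more than one of the listed fragments, i.e. the fragments identify disjoint modules.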