| New file |
| | |
| | | { |
| | | "train_micro_batch_size_per_gpu": 1, |
| | | "gradient_accumulation_steps": 1, |
| | | "steps_per_print": 100, |
| | | "gradient_clipping": 5, |
| | | "fp16": { |
| | | "enabled": false, |
| | | "auto_cast": false, |
| | | "loss_scale": 0, |
| | | "initial_scale_power": 16, |
| | | "loss_scale_window": 1000, |
| | | "hysteresis": 2, |
| | | "consecutive_hysteresis": false, |
| | | "min_loss_scale": 1 |
| | | }, |
| | | "bf16": { |
| | | "enabled": true |
| | | }, |
| | | "zero_force_ds_cpu_optimizer": false, |
| | | "zero_optimization": { |
| | | "stage": 1, |
| | | "offload_optimizer": { |
| | | "device": "none", |
| | | "pin_memory": true |
| | | }, |
| | | "allgather_partitions": true, |
| | | "allgather_bucket_size": 5e8, |
| | | "overlap_comm": true, |
| | | "reduce_scatter": true, |
| | | "reduce_bucket_size": 5e8, |
| | | "contiguous_gradients": true |
| | | } |
| | | } |
| New file |
| | |
| | | { |
| | | "train_micro_batch_size_per_gpu": 1, |
| | | "gradient_accumulation_steps": 1, |
| | | "steps_per_print": 100, |
| | | "gradient_clipping": 5, |
| | | "fp16": { |
| | | "enabled": false, |
| | | "auto_cast": false, |
| | | "loss_scale": 0, |
| | | "initial_scale_power": 16, |
| | | "loss_scale_window": 1000, |
| | | "hysteresis": 2, |
| | | "consecutive_hysteresis": false, |
| | | "min_loss_scale": 1 |
| | | }, |
| | | "bf16": { |
| | | "enabled": true |
| | | }, |
| | | "zero_force_ds_cpu_optimizer": false, |
| | | "zero_optimization": { |
| | | "stage": 2, |
| | | "offload_optimizer": { |
| | | "device": "none", |
| | | "pin_memory": true |
| | | }, |
| | | "allgather_partitions": true, |
| | | "allgather_bucket_size": 5e8, |
| | | "overlap_comm": false, |
| | | "reduce_scatter": true, |
| | | "reduce_bucket_size": 5e8, |
| | | "contiguous_gradients": true |
| | | } |
| | | } |
| New file |
| | |
| | | { |
| | | "train_micro_batch_size_per_gpu": 1, |
| | | "gradient_accumulation_steps": 1, |
| | | "steps_per_print": 100, |
| | | "gradient_clipping": 5, |
| | | "fp16": { |
| | | "enabled": false, |
| | | "auto_cast": false, |
| | | "loss_scale": 0, |
| | | "initial_scale_power": 16, |
| | | "loss_scale_window": 1000, |
| | | "hysteresis": 2, |
| | | "consecutive_hysteresis": false, |
| | | "min_loss_scale": 1 |
| | | }, |
| | | "bf16": { |
| | | "enabled": true |
| | | }, |
| | | "zero_force_ds_cpu_optimizer": false, |
| | | "zero_optimization": { |
| | | "stage": 3, |
| | | "offload_optimizer": { |
| | | "device": "none", |
| | | "pin_memory": true |
| | | }, |
| | | "offload_param": { |
| | | "device": "none", |
| | | "pin_memory": true |
| | | }, |
| | | "allgather_partitions": true, |
| | | "allgather_bucket_size": 5e8, |
| | | "overlap_comm": true, |
| | | "reduce_scatter": true, |
| | | "reduce_bucket_size": 5e8, |
| | | "contiguous_gradients": true, |
| | | "stage3_max_live_parameters": 1e9, |
| | | "stage3_max_reuse_distance": 1e9, |
| | | "stage3_prefetch_bucket_size": 5e8, |
| | | "stage3_param_persistence_threshold": 1e5 |
| | | } |
| | | } |
| New file |
| | |
| | | { |
| | | "train_batch_size": "auto", |
| | | "train_micro_batch_size_per_gpu": "auto", |
| | | "gradient_accumulation_steps": "auto", |
| | | "gradient_clipping": "auto", |
| | | "zero_allow_untested_optimizer": true, |
| | | "fp16": { |
| | | "enabled": "auto", |
| | | "loss_scale": 0, |
| | | "loss_scale_window": 1000, |
| | | "initial_scale_power": 16, |
| | | "hysteresis": 2, |
| | | "min_loss_scale": 1 |
| | | }, |
| | | "bf16": { |
| | | "enabled": "auto" |
| | | }, |
| | | "zero_optimization": { |
| | | "stage": 0, |
| | | "allgather_partitions": true, |
| | | "allgather_bucket_size": 5e8, |
| | | "overlap_comm": true, |
| | | "reduce_scatter": true, |
| | | "reduce_bucket_size": 5e8, |
| | | "contiguous_gradients": true, |
| | | "round_robin_gradients": true |
| | | } |
| | | } |
| New file |
| | |
| | | { |
| | | "train_batch_size": "auto", |
| | | "train_micro_batch_size_per_gpu": "auto", |
| | | "gradient_accumulation_steps": "auto", |
| | | "gradient_clipping": "auto", |
| | | "zero_allow_untested_optimizer": true, |
| | | "fp16": { |
| | | "enabled": "auto", |
| | | "loss_scale": 0, |
| | | "loss_scale_window": 1000, |
| | | "initial_scale_power": 16, |
| | | "hysteresis": 2, |
| | | "min_loss_scale": 1 |
| | | }, |
| | | "bf16": { |
| | | "enabled": "auto" |
| | | }, |
| | | "zero_optimization": { |
| | | "stage": 2, |
| | | "allgather_partitions": true, |
| | | "allgather_bucket_size": 5e8, |
| | | "overlap_comm": true, |
| | | "reduce_scatter": true, |
| | | "reduce_bucket_size": 5e8, |
| | | "contiguous_gradients": true, |
| | | "round_robin_gradients": true |
| | | } |
| | | } |
| New file |
| | |
| | | { |
| | | "train_batch_size": "auto", |
| | | "train_micro_batch_size_per_gpu": "auto", |
| | | "gradient_accumulation_steps": "auto", |
| | | "gradient_clipping": "auto", |
| | | "zero_allow_untested_optimizer": true, |
| | | "fp16": { |
| | | "enabled": "auto", |
| | | "loss_scale": 0, |
| | | "loss_scale_window": 1000, |
| | | "initial_scale_power": 16, |
| | | "hysteresis": 2, |
| | | "min_loss_scale": 1 |
| | | }, |
| | | "bf16": { |
| | | "enabled": "auto" |
| | | }, |
| | | "zero_optimization": { |
| | | "stage": 2, |
| | | "offload_optimizer": { |
| | | "device": "cpu", |
| | | "pin_memory": true |
| | | }, |
| | | "allgather_partitions": true, |
| | | "allgather_bucket_size": 5e8, |
| | | "overlap_comm": true, |
| | | "reduce_scatter": true, |
| | | "reduce_bucket_size": 5e8, |
| | | "contiguous_gradients": true, |
| | | "round_robin_gradients": true |
| | | } |
| | | } |
| New file |
| | |
| | | { |
| | | "train_batch_size": "auto", |
| | | "train_micro_batch_size_per_gpu": "auto", |
| | | "gradient_accumulation_steps": "auto", |
| | | "gradient_clipping": "auto", |
| | | "zero_allow_untested_optimizer": true, |
| | | "fp16": { |
| | | "enabled": "auto", |
| | | "loss_scale": 0, |
| | | "loss_scale_window": 1000, |
| | | "initial_scale_power": 16, |
| | | "hysteresis": 2, |
| | | "min_loss_scale": 1 |
| | | }, |
| | | "bf16": { |
| | | "enabled": "auto" |
| | | }, |
| | | "zero_optimization": { |
| | | "stage": 3, |
| | | "overlap_comm": true, |
| | | "contiguous_gradients": true, |
| | | "sub_group_size": 1e9, |
| | | "reduce_bucket_size": "auto", |
| | | "stage3_prefetch_bucket_size": "auto", |
| | | "stage3_param_persistence_threshold": "auto", |
| | | "stage3_max_live_parameters": 1e9, |
| | | "stage3_max_reuse_distance": 1e9, |
| | | "stage3_gather_16bit_weights_on_model_save": true |
| | | } |
| | | } |
| New file |
| | |
| | | { |
| | | "train_batch_size": "auto", |
| | | "train_micro_batch_size_per_gpu": "auto", |
| | | "gradient_accumulation_steps": "auto", |
| | | "gradient_clipping": "auto", |
| | | "zero_allow_untested_optimizer": true, |
| | | "fp16": { |
| | | "enabled": "auto", |
| | | "loss_scale": 0, |
| | | "loss_scale_window": 1000, |
| | | "initial_scale_power": 16, |
| | | "hysteresis": 2, |
| | | "min_loss_scale": 1 |
| | | }, |
| | | "bf16": { |
| | | "enabled": "auto" |
| | | }, |
| | | "zero_optimization": { |
| | | "stage": 3, |
| | | "offload_optimizer": { |
| | | "device": "cpu", |
| | | "pin_memory": true |
| | | }, |
| | | "offload_param": { |
| | | "device": "cpu", |
| | | "pin_memory": true |
| | | }, |
| | | "overlap_comm": true, |
| | | "contiguous_gradients": true, |
| | | "sub_group_size": 1e9, |
| | | "reduce_bucket_size": "auto", |
| | | "stage3_prefetch_bucket_size": "auto", |
| | | "stage3_param_persistence_threshold": "auto", |
| | | "stage3_max_live_parameters": 1e9, |
| | | "stage3_max_reuse_distance": 1e9, |
| | | "stage3_gather_16bit_weights_on_model_save": true |
| | | } |
| | | } |
| | |
| | | # audio_adaptor |
| | | encoder_out, encoder_out_lens = self.audio_adaptor(encoder_out, encoder_out_lens) |
| | | |
| | | input_ids[input_ids == -1] = 0 |
| | | input_ids[input_ids == -100] = 0 |
| | | if hasattr(self.llm.model, "embed_tokens"): |
| | | inputs_embeds = self.llm.model.embed_tokens(input_ids) |
| | | elif hasattr(self.llm.model.model, "embed_tokens"): |
| | | inputs_embeds = self.llm.model.model.embed_tokens(input_ids) |
| | | else: |
| | | inputs_embeds = self.llm.model.model.model.embed_tokens(input_ids) |
| | | input_ids[input_ids < 0] = 0 |
| | | inputs_embeds = self.llm.model.get_input_embeddings()(input_ids) |
| | | |
| | | batch_size, token_num, dims = inputs_embeds.shape |
| | | _, l, _ = encoder_out.shape |
| | |
| | | inputs_embeds[batch_idx, fbank_beg_idx : fbank_beg_idx + l, :] = encoder_out[ |
| | | batch_idx, :l, : |
| | | ] |
| | | |
| | | labels_ids[labels_ids == -1] = -100 |
| | | model_outputs = self.llm( |
| | | inputs_embeds=inputs_embeds, attention_mask=attention_mask, labels=labels_ids |
| | | ) |