python/FunASR-XL.git

			@@ -85,7 +85,12 @@
			self.batch_total = 0
			self.use_fp16 = use_fp16
			self.save_checkpoint_interval = kwargs.get("save_checkpoint_interval", 5000)
			self.validate_interval = kwargs.get("validate_interval", 5000)
			self.validate_interval = kwargs.get("validate_interval", -1)
			if self.validate_interval < 0:
			self.validate_interval = self.save_checkpoint_interval
			assert (
			self.save_checkpoint_interval == self.validate_interval
			), f"save_checkpoint_interval must equal to validate_interval"
			self.keep_nbest_models = kwargs.get("keep_nbest_models", 500)
			self.avg_keep_nbest_models_type = kwargs.get("avg_keep_nbest_models_type", "acc")
			self.avg_nbest_model = kwargs.get("avg_nbest_model", 10)
			@@ -357,10 +362,10 @@
			time_beg = time.perf_counter()
			time5 = time_beg
			for batch_idx, batch in enumerate(dataloader_train):
			if self.use_ddp or self.use_fsdp:
			dist.all_reduce(iterator_stop, dist.ReduceOp.SUM)
			if iterator_stop > 0:
			break
			# if self.use_ddp or self.use_fsdp:
			# dist.all_reduce(iterator_stop, dist.ReduceOp.SUM)
			# if iterator_stop > 0:
			# break
			self.batch_total += 1
			self.step_in_epoch += 1
			time1 = time.perf_counter()
			@@ -376,11 +381,11 @@
			with maybe_autocast(self.use_fp16):
			retval = model(**batch)

			if (
			self.reset_gpu_cache
			and (torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024) > 70
			):
			torch.cuda.empty_cache()
			# if (
			# self.reset_gpu_cache
			# and (torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024) > 70
			# ):
			# torch.cuda.empty_cache()

			loss, stats, weight = retval
			stats = {k: v for k, v in stats.items() if v is not None}
			@@ -410,24 +415,14 @@
			speed_stats["backward_and_AllReaduce_time"] = f"{time4 - time3:0.3f}"

			self.train_loss_avg = (
			self.train_loss_avg * (self.step_in_epoch - 1) + loss.detach().cpu().item()
			) / self.step_in_epoch
			self.train_loss_avg * (batch_idx + kwargs.get("start_step", 0))
			+ loss.detach().cpu().item()
			) / (batch_idx + kwargs.get("start_step", 0) + 1)
			if "acc" in stats:
			self.train_acc_avg = (
			self.train_acc_avg * (self.step_in_epoch - 1)
			self.train_acc_avg * (batch_idx + kwargs.get("start_step", 0))
			+ stats["acc"].detach().cpu().item()
			) / self.step_in_epoch
			if self.use_ddp or self.use_fsdp:
			train_loss_avg = torch.tensor(self.train_loss_avg, dtype=torch.float32).to(
			self.device
			)
			train_acc_avg = torch.tensor(self.train_acc_avg, dtype=torch.float32).to(
			self.device
			)
			dist.all_reduce(train_loss_avg, op=dist.ReduceOp.SUM)
			dist.all_reduce(train_acc_avg, op=dist.ReduceOp.SUM)
			self.train_loss_avg = train_loss_avg.detach().cpu().item() / self.world_size
			self.train_acc_avg = train_acc_avg.detach().cpu().item() / self.world_size
			) / (batch_idx + kwargs.get("start_step", 0) + 1)

			# Perform an optimizer step only after accumulating enough gradients
			if (batch_idx + 1) % accum_grad == 0:
			@@ -456,6 +451,19 @@
			scheduler.step()
			# Clear gradients for the next accumulation stage
			optim.zero_grad(set_to_none=True)

			if self.use_ddp or self.use_fsdp:
			train_loss_avg = torch.tensor(self.train_loss_avg, dtype=torch.float32).to(
			self.device
			)
			train_acc_avg = torch.tensor(self.train_acc_avg, dtype=torch.float32).to(
			self.device
			)
			dist.all_reduce(train_loss_avg, op=dist.ReduceOp.SUM)
			dist.all_reduce(train_acc_avg, op=dist.ReduceOp.SUM)
			self.train_loss_avg = train_loss_avg.detach().cpu().item() / self.world_size
			self.train_acc_avg = train_acc_avg.detach().cpu().item() / self.world_size

			total_time = f"{(time.perf_counter() - time5)/accum_grad:0.3f}"
			time5 = time.perf_counter()

			@@ -473,7 +481,7 @@
			step_in_epoch=self.step_in_epoch,
			batch_num_epoch=batch_num_epoch,
			lr=lr,
			loss=loss.detach().cpu().item(),
			loss=accum_grad * loss.detach().cpu().item(),
			speed_stats=speed_stats,
			stats=stats,
			writer=writer,
			@@ -508,14 +516,14 @@
			)

			time_beg = time.perf_counter()
			else:
			if self.use_ddp or self.use_fsdp:
			iterator_stop.fill_(1)
			dist.all_reduce(iterator_stop, dist.ReduceOp.SUM)
			# else:
			# if self.use_ddp or self.use_fsdp:
			# iterator_stop.fill_(1)
			# dist.all_reduce(iterator_stop, dist.ReduceOp.SUM)

			if self.use_ddp or self.use_fsdp:
			dist.barrier()
			iterator_stop = torch.tensor(0).to(self.device)
			# iterator_stop = torch.tensor(0).to(self.device)

			def validate_epoch(
			self,