python/FunASR-XL.git

			@@ -198,6 +198,8 @@
			for k in dst_state.keys():
			if not k.startswith("module.") and "module."+k in src_state.keys():
			k_ddp = "module."+k
			elif k.startswith("module.") and "module."+k not in src_state.keys():
			k_ddp = k.replace("module.", "", 1)
			else:
			k_ddp = k
			if k_ddp in src_state.keys():
			@@ -246,8 +248,15 @@
			optim.zero_grad()
			speed_stats = {}
			time5 = time.perf_counter()

			iterator_stop = torch.tensor(0).to(self.device)
			dist.barrier()
			print(f"before iter, iterator_stop: {iterator_stop}\n")
			dataloader_train.batch_sampler.set_epoch(epoch)
			for batch_idx, batch in enumerate(dataloader_train):
			if self.use_ddp or self.use_fsdp:
			dist.all_reduce(iterator_stop, dist.ReduceOp.SUM)
			if iterator_stop > 0:
			break
			self.batch_total += 1
			time1 = time.perf_counter()
			speed_stats["data_load"] = f"{time1-time5:0.3f}"
			@@ -288,13 +297,13 @@
			self.train_loss_avg = (self.train_loss_avg*batch_idx + loss.detach().cpu().item())/(batch_idx+1)
			if "acc" in stats:
			self.train_acc_avg = (self.train_acc_avg * batch_idx + stats["acc"].detach().cpu().item()) / (batch_idx + 1)
			if self.use_ddp or self.use_fsdp:
			train_loss_avg = torch.tensor(self.train_loss_avg, dtype=torch.float32).to(self.device)
			train_acc_avg = torch.tensor(self.train_acc_avg, dtype=torch.float32).to(self.device)
			dist.all_reduce(train_loss_avg, op=dist.ReduceOp.SUM)
			dist.all_reduce(train_acc_avg, op=dist.ReduceOp.SUM)
			self.train_loss_avg = train_loss_avg.detach().cpu().item() / self.world_size
			self.train_acc_avg = train_acc_avg.detach().cpu().item() / self.world_size
			# if self.use_ddp or self.use_fsdp:
			# train_loss_avg = torch.tensor(self.train_loss_avg, dtype=torch.float32).to(self.device)
			# train_acc_avg = torch.tensor(self.train_acc_avg, dtype=torch.float32).to(self.device)
			# dist.all_reduce(train_loss_avg, op=dist.ReduceOp.SUM)
			# dist.all_reduce(train_acc_avg, op=dist.ReduceOp.SUM)
			# self.train_loss_avg = train_loss_avg.detach().cpu().item() / self.world_size
			# self.train_acc_avg = train_acc_avg.detach().cpu().item() / self.world_size


			# Perform an optimizer step only after accumulating enough gradients
			@@ -354,7 +363,11 @@
			if (batch_idx+1) % self.save_checkpoint_interval == 0:
			self.save_checkpoint(epoch, model=model, optim=optim, scheduler=scheduler, scaler=scaler, step=batch_idx+1)


			else:
			if self.use_ddp or self.use_fsdp:
			iterator_stop.fill_(1)
			dist.all_reduce(iterator_stop, dist.ReduceOp.SUM)

			if self.use_ddp or self.use_fsdp:
			dist.barrier()

			@@ -381,7 +394,16 @@

			speed_stats = {}
			time5 = time.perf_counter()
			iterator_stop = torch.tensor(0).to(self.device)
			dist.barrier()
			print(f"before iter, iterator_stop: {iterator_stop}\n")
			for batch_idx, batch in enumerate(dataloader_val):
			if self.use_ddp or self.use_fsdp:
			dist.all_reduce(iterator_stop, dist.ReduceOp.SUM)
			if epoch >= 1:
			print(f"iterator_stop: {iterator_stop}\n")
			if iterator_stop > 0:
			break
			time1 = time.perf_counter()
			speed_stats["data_load"] = f"{time1 - time5:0.3f}"
			batch = to_device(batch, self.device)
			@@ -410,13 +432,13 @@
			self.val_loss_avg = (self.val_loss_avg*batch_idx + loss.detach().cpu().item())/(batch_idx+1)
			if "acc" in stats:
			self.val_acc_avg = (self.val_acc_avg * batch_idx + stats["acc"].detach().cpu().item()) / (batch_idx + 1)
			if self.use_ddp or self.use_fsdp:
			val_loss_avg = torch.tensor(self.val_loss_avg, dtype=torch.float32).to(self.device)
			val_acc_avg = torch.tensor(self.val_acc_avg, dtype=torch.float32).to(self.device)
			dist.all_reduce(val_loss_avg, op=dist.ReduceOp.SUM)
			dist.all_reduce(val_acc_avg, op=dist.ReduceOp.SUM)
			self.val_loss_avg = val_loss_avg.detach().cpu().item() / self.world_size
			self.val_acc_avg = val_acc_avg.detach().cpu().item() / self.world_size
			# if self.use_ddp or self.use_fsdp:
			# val_loss_avg = torch.tensor(self.val_loss_avg, dtype=torch.float32).to(self.device)
			# val_acc_avg = torch.tensor(self.val_acc_avg, dtype=torch.float32).to(self.device)
			# dist.all_reduce(val_loss_avg, op=dist.ReduceOp.SUM)
			# dist.all_reduce(val_acc_avg, op=dist.ReduceOp.SUM)
			# self.val_loss_avg = val_loss_avg.detach().cpu().item() / self.world_size
			# self.val_acc_avg = val_acc_avg.detach().cpu().item() / self.world_size

			batch_num_epoch = -1
			if hasattr(dataloader_val, "__len__"):
			@@ -431,9 +453,16 @@
			tag="val",
			)

			else:
			if self.use_ddp or self.use_fsdp:
			iterator_stop.fill_(1)
			dist.all_reduce(iterator_stop, dist.ReduceOp.SUM)

			self.val_acc_list.append(self.val_acc_avg)
			model.train()




			if self.use_ddp or self.use_fsdp:
			dist.barrier()