| | |
| | | echo "stage 4: ASR Training" |
| | | |
| | | mkdir -p ${exp_dir}/exp/${model_dir} |
| | | log_file="${exp_dir}/exp/${model_dir}/train.log.txt" |
| | | current_time=$(date "+%Y-%m-%d_%H-%M") |
| | | log_file="${exp_dir}/exp/${model_dir}/train.log.txt.${current_time}" |
| | | echo "log_file: ${log_file}" |
| | | |
| | | gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') |
| | |
| | | echo "stage 4: ASR Training" |
| | | |
| | | mkdir -p ${exp_dir}/exp/${model_dir} |
| | | log_file="${exp_dir}/exp/${model_dir}/train.log.txt" |
| | | current_time=$(date "+%Y-%m-%d_%H-%M") |
| | | log_file="${exp_dir}/exp/${model_dir}/train.log.txt.${current_time}" |
| | | echo "log_file: ${log_file}" |
| | | |
| | | gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') |
| | |
| | | echo "stage 4: ASR Training" |
| | | |
| | | mkdir -p ${exp_dir}/exp/${model_dir} |
| | | log_file="${exp_dir}/exp/${model_dir}/train.log.txt" |
| | | current_time=$(date "+%Y-%m-%d_%H-%M") |
| | | log_file="${exp_dir}/exp/${model_dir}/train.log.txt.${current_time}" |
| | | echo "log_file: ${log_file}" |
| | | |
| | | gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') |
| | |
| | | echo "stage 4: ASR Training" |
| | | |
| | | mkdir -p ${exp_dir}/exp/${model_dir} |
| | | log_file="${exp_dir}/exp/${model_dir}/train.log.txt" |
| | | current_time=$(date "+%Y-%m-%d_%H-%M") |
| | | log_file="${exp_dir}/exp/${model_dir}/train.log.txt.${current_time}" |
| | | echo "log_file: ${log_file}" |
| | | |
| | | gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') |
| | |
| | | epoch (int): The current epoch number. |
| | | """ |
| | | self.model.train() |
| | | pbar = tqdm(colour="blue", desc=f"Training Epoch: {epoch + 1}", total=len(self.dataloader_train), |
| | | pbar = tqdm(colour="blue", desc=f"rank: {self.local_rank}, Training Epoch: {epoch + 1}", total=len(self.dataloader_train), |
| | | dynamic_ncols=True) |
| | | |
| | | # Set the number of steps for gradient accumulation |
| | |
| | | f"epoch: {epoch}/{self.max_epoch}, " |
| | | f"step: {batch_idx}/{len(self.dataloader_train)}, total: {self.batch_total}, " |
| | | f"(loss: {loss.detach().cpu().item():.3f}), " |
| | | f"{[(k, round(v.cpu().item(), 3)) for k, v in stats.items()]}" |
| | | f"{[(k, round(v.cpu().item(), 3)) for k, v in stats.items()]}, " |
| | | f"{speed_stats}, " |
| | | f"{gpu_info}" |
| | | ) |
| | |
| | | """ |
| | | self.model.eval() |
| | | with torch.no_grad(): |
| | | pbar = tqdm(colour="red", desc=f"Training Epoch: {epoch + 1}", total=len(self.dataloader_val), |
| | | pbar = tqdm(colour="red", desc=f"rank: {self.local_rank}, Validation Epoch: {epoch + 1}", total=len(self.dataloader_val), |
| | | dynamic_ncols=True) |
| | | speed_stats = {} |
| | | time5 = time.perf_counter() |
| | |
| | | f"validation epoch: {epoch}/{self.max_epoch}, " |
| | | f"step: {batch_idx}/{len(self.dataloader_val)}, " |
| | | f"(loss: {loss.detach().cpu().item():.3f}), " |
| | | f"{[(k, round(v.cpu().item(), 3)) for k, v in stats.items()]}" |
| | | f"{[(k, round(v.cpu().item(), 3)) for k, v in stats.items()]}, " |
| | | f"{speed_stats}, " |
| | | ) |
| | | pbar.set_description(description) |