| | |
| | | # https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group |
| | | os.environ.setdefault("NCCL_BLOCKING_WAIT", "1") |
| | | |
| | | torch.distributed.init_process_group(backend='nccl', |
| | | torch.distributed.init_process_group(backend=self.dist_backend, |
| | | init_method=self.dist_init_method, |
| | | world_size=args.dist_world_size, |
| | | rank=args.dist_rank) |
| | | self.dist_rank = torch.distributed.get_rank() |
| | | self.dist_world_size = torch.distributed.get_world_size() |
| | | self.local_rank = args.local_rank |
| | | logging.info("world size: {}, rank: {}, local_rank: {}".format(self.dist_world_size, self.dist_rank, |
| | | self.local_rank)) |
| | | |
def init_options_pai(self):
    """Initialise the default process group from environment variables.

    Does nothing when ``self.distributed`` is falsy.  Otherwise the
    default ``torch.distributed`` process group is created with the
    backend named by ``self.dist_backend`` and the ``env://`` init
    method, after which ``self.dist_rank``, ``self.dist_world_size`` and
    ``self.local_rank`` are populated and logged.
    """
    if not self.distributed:
        return

    # https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group
    # setdefault keeps any value already exported by the launcher.
    os.environ.setdefault("NCCL_BLOCKING_WAIT", "1")

    # Initialise exactly once with the configured backend.  The earlier
    # hard-coded ``backend='nccl'`` call is dropped: initialising the
    # default process group a second time raises, and it ignored
    # ``self.dist_backend``.
    torch.distributed.init_process_group(backend=self.dist_backend,
                                         init_method='env://')

    self.dist_rank = torch.distributed.get_rank()
    self.dist_world_size = torch.distributed.get_world_size()
    # NOTE(review): ``args`` is not a parameter of this method; presumably
    # it is a module-level object defined elsewhere in the file - confirm.
    self.local_rank = args.local_rank
    logging.info("world size: {}, rank: {}, local_rank: {}".format(
        self.dist_world_size, self.dist_rank, self.local_rank))
| | | |
| | | |
| | | def resolve_distributed_mode(args): |