PyTorch 使用 Horovod 进行多 GPU 训练的实现
程序员文章站
2022-04-25 16:46:40
PyTorch 在 Horovod 上的训练步骤分为以下几步:`import torch`;`import horovod.torch as hvd`;初始化 Horovod(`hvd.init()`)……
PyTorch 在 Horovod 上的训练步骤分为以下几步:
# Minimal template for data-parallel PyTorch training with Horovod.
# `...` marks placeholders (dataset, batch size, model) and `args.log_interval`
# is expected to come from the surrounding script's argument parser.
import torch
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data.distributed
import horovod.torch as hvd

# Initialize Horovod.
hvd.init()

# Pin the GPU used by this process to its local rank (one GPU per process).
torch.cuda.set_device(hvd.local_rank())

# Define the dataset (placeholder).
train_dataset = ...

# Partition the dataset among workers using DistributedSampler.
train_sampler = torch.utils.data.distributed.DistributedSampler(
    train_dataset, num_replicas=hvd.size(), rank=hvd.rank())

train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=..., sampler=train_sampler)

# Build the model (placeholder) and move it to the pinned GPU.
model = ...
model.cuda()

optimizer = optim.SGD(model.parameters())

# Wrap the original optimizer with Horovod's distributed optimizer.
optimizer = hvd.DistributedOptimizer(
    optimizer, named_parameters=model.named_parameters())

# Broadcast parameters from rank 0 to all other processes.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)

for epoch in range(100):
    # Give the sampler the epoch number so each epoch uses a different shuffle.
    train_sampler.set_epoch(epoch)
    for batch_idx, (data, target) in enumerate(train_loader):
        # The model lives on the GPU, so the batch has to be moved there too.
        data, target = data.cuda(), target.cuda()
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % args.log_interval == 0:
            print('train epoch: {} [{}/{}]\tloss: {}'.format(
                epoch, batch_idx * len(data), len(train_sampler), loss.item()))
完整示例代码如下,在 ImageNet 上采用 ResNet-50 进行训练:
from __future__ import print_function import torch import argparse import torch.backends.cudnn as cudnn import torch.nn.functional as f import torch.optim as optim import torch.utils.data.distributed from torchvision import datasets, transforms, models import horovod.torch as hvd import os import math from tqdm import tqdm from distutils.version import looseversion # training settings parser = argparse.argumentparser(description='pytorch imagenet example', formatter_class=argparse.argumentdefaultshelpformatter) parser.add_argument('--train-dir', default=os.path.expanduser('~/imagenet/train'), help='path to training data') parser.add_argument('--val-dir', default=os.path.expanduser('~/imagenet/validation'), help='path to validation data') parser.add_argument('--log-dir', default='./logs', help='tensorboard log directory') parser.add_argument('--checkpoint-format', default='./checkpoint-{epoch}.pth.tar', help='checkpoint file format') parser.add_argument('--fp-allreduce', action='store_true', default=false, help='use fp compression during allreduce') parser.add_argument('--batches-per-allreduce', type=int, default=, help='number of batches processed locally before ' 'executing allreduce across workers; it multiplies ' 'total batch size.') parser.add_argument('--use-adasum', action='store_true', default=false, help='use adasum algorithm to do reduction') # default settings from https://arxiv.org/abs/1706.02677. 
parser.add_argument('--batch-size', type=int, default=32, help='input batch size for training') parser.add_argument('--val-batch-size', type=int, default=32, help='input batch size for validation') parser.add_argument('--epochs', type=int, default=90, help='number of epochs to train') parser.add_argument('--base-lr', type=float, default=0.0125, 44 help='learning rate for a single gpu') 45 parser.add_argument('--warmup-epochs', type=float, default=5, help='number of warmup epochs') parser.add_argument('--momentum', type=float, default=0.9, help='sgd momentum') parser.add_argument('--wd', type=float, default=0.00005, help='weight decay') parser.add_argument('--no-cuda', action='store_true', default=false, help='disables cuda training') parser.add_argument('--seed', type=int, default=42, help='random seed') args = parser.parse_args() args.cuda = not args.no_cuda and torch.cuda.is_available() allreduce_batch_size = args.batch_size * args.batches_per_allreduce hvd.init() torch.manual_seed(args.seed) if args.cuda: # horovod: pin gpu to local rank. torch.cuda.set_device(hvd.local_rank()) torch.cuda.manual_seed(args.seed) cudnn.benchmark = true # if set > 0, will resume training from a given checkpoint. resume_from_epoch = 0 for try_epoch in range(args.epochs, 0, -1): if os.path.exists(args.checkpoint_format.format(epoch=try_epoch)): resume_from_epoch = try_epoch break # horovod: broadcast resume_from_epoch from rank 0 (which will have # checkpoints) to other ranks. resume_from_epoch = hvd.broadcast(torch.tensor(resume_from_epoch), root_rank=0, name='resume_from_epoch').item() # horovod: print logs on the first worker. verbose = 1 if hvd.rank() == 0 else 0 # horovod: write tensorboard logs on first worker. 
try: if looseversion(torch.__version__) >= looseversion('1.2.0'): from torch.utils.tensorboard import summarywriter else: from tensorboardx import summarywriter log_writer = summarywriter(args.log_dir) if hvd.rank() == 0 else none except importerror: log_writer = none # horovod: limit # of cpu threads to be used per worker. torch.set_num_threads(4) kwargs = {'num_workers': 4, 'pin_memory': true} if args.cuda else {} train_dataset = \ datasets.imagefolder(args.train_dir, transform=transforms.compose([ transforms.randomresizedcrop(224), transforms.randomhorizontalflip(), transforms.totensor(), transforms.normalize(mean=[., ., .], std=[0.229, 0.224, 0.225]) ])) # horovod: use distributedsampler to partition data among workers. manually specify # `num_replicas=hvd.size()` and `rank=hvd.rank()`. train_sampler = torch.utils.data.distributed.distributedsampler( train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) train_loader = torch.utils.data.dataloader( train_dataset, batch_size=allreduce_batch_size, sampler=train_sampler, **kwargs) val_dataset = \ datasets.imagefolder(args.val_dir, transform=transforms.compose([ transforms.resize(256), transforms.centercrop(224), transforms.totensor(), transforms.normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) ])) val_sampler = torch.utils.data.distributed.distributedsampler( val_dataset, num_replicas=hvd.size(), rank=hvd.rank()) val_loader = torch.utils.data.dataloader(val_dataset, batch_size=args.val_batch_size, sampler=val_sampler, **kwargs) # set up standard resnet-50 model. model = models.resnet50() # by default, adasum doesn't need scaling up learning rate. # for sum/average with gradient accumulation: scale learning rate by batches_per_allreduce lr_scaler = args.batches_per_allreduce * hvd.size() if not args.use_adasum else 1 if args.cuda: # move model to gpu. model.cuda() # if using gpu adasum allreduce, scale learning rate by local_size. 
if args.use_adasum and hvd.nccl_built(): lr_scaler = args.batches_per_allreduce * hvd.local_size() # horovod: scale learning rate by the number of gpus. optimizer = optim.sgd(model.parameters(), lr=(args.base_lr * lr_scaler), momentum=args.momentum, weight_decay=args.wd) # horovod: (optional) compression algorithm. compression = hvd.compression.fp16 if args.fp16_allreduce else hvd.compression.none # horovod: wrap optimizer with distributedoptimizer. optimizer = hvd.distributedoptimizer( optimizer, named_parameters=model.named_parameters(), compression=compression, backward_passes_per_step=args.batches_per_allreduce, op=hvd.adasum if args.use_adasum else hvd.average) # restore from a previous checkpoint, if initial_epoch is specified. # horovod: restore on the first worker which will broadcast weights to other workers. if resume_from_epoch > 0 and hvd.rank() == 0: filepath = args.checkpoint_format.format(epoch=resume_from_epoch) checkpoint = torch.load(filepath) model.load_state_dict(checkpoint['model']) optimizer.load_state_dict(checkpoint['optimizer']) # horovod: broadcast parameters & optimizer state. 
hvd.broadcast_parameters(model.state_dict(), root_rank=) hvd.broadcast_optimizer_state(optimizer, root_rank=) def train(epoch): model.train() train_sampler.set_epoch(epoch) train_loss = metric('train_loss') train_accuracy = metric('train_accuracy') with tqdm(total=len(train_loader), desc='train epoch #{}'.format(epoch + 1), disable=not verbose) as t: for batch_idx, (data, target) in enumerate(train_loader): adjust_learning_rate(epoch, batch_idx) if args.cuda: data, target = data.cuda(), target.cuda() optimizer.zero_grad() # split data into sub-batches of size batch_size for i in range(0, len(data), args.batch_size): data_batch = data[i:i + args.batch_size] target_batch = target[i:i + args.batch_size] output = model(data_batch) train_accuracy.update(accuracy(output, target_batch)) loss = f.cross_entropy(output, target_batch) train_loss.update(loss) # average gradients among sub-batches loss.div_(math.ceil(float(len(data)) / args.batch_size)) loss.backward() # gradient is applied across all ranks optimizer.step() t.set_postfix({'loss': train_loss.avg.item(), 'accuracy': 100. * train_accuracy.avg.item()}) t.update(1) if log_writer: log_writer.add_scalar('train/loss', train_loss.avg, epoch) log_writer.add_scalar('train/accuracy', train_accuracy.avg, epoch) def validate(epoch): model.eval() val_loss = metric('val_loss') val_accuracy = metric('val_accuracy') with tqdm(total=len(val_loader), desc='validate epoch #{}'.format(epoch + ), disable=not verbose) as t: with torch.no_grad(): for data, target in val_loader: if args.cuda: data, target = data.cuda(), target.cuda() output = model(data) val_loss.update(f.cross_entropy(output, target)) val_accuracy.update(accuracy(output, target)) t.set_postfix({'loss': val_loss.avg.item(), 'accuracy': 100. 
* val_accuracy.avg.item()}) t.update(1) if log_writer: log_writer.add_scalar('val/loss', val_loss.avg, epoch) log_writer.add_scalar('val/accuracy', val_accuracy.avg, epoch) # horovod: using `lr = base_lr * hvd.size()` from the very beginning leads to worse final # accuracy. scale the learning rate `lr = base_lr` ---> `lr = base_lr * hvd.size()` during # the first five epochs. see https://arxiv.org/abs/1706.02677 for details. # after the warmup reduce learning rate by 10 on the 30th, 60th and 80th epochs. def adjust_learning_rate(epoch, batch_idx): if epoch < args.warmup_epochs: epoch += float(batch_idx + 1) / len(train_loader) lr_adj = 1. / hvd.size() * (epoch * (hvd.size() - 1) / args.warmup_epochs + 1) elif epoch < 30: lr_adj = 1. elif epoch < 60: lr_adj = 1e-1 elif epoch < 80: lr_adj = 1e-2 else: lr_adj = 1e-3 for param_group in optimizer.param_groups: param_group['lr'] = args.base_lr * hvd.size() * args.batches_per_allreduce * lr_adj def accuracy(output, target): # get the index of the max log-probability pred = output.max(1, keepdim=true)[1] return pred.eq(target.view_as(pred)).cpu().float().mean() def save_checkpoint(epoch): if hvd.rank() == 0: filepath = args.checkpoint_format.format(epoch=epoch + 1) state = { 'model': model.state_dict(), 'optimizer': optimizer.state_dict(), } torch.save(state, filepath) # horovod: average metrics from distributed training. class metric(object): def __init__(self, name): self.name = name self.sum = torch.tensor(0.) self.n = torch.tensor(0.) def update(self, val): self.sum += hvd.allreduce(val.detach().cpu(), name=self.name) self.n += 1 @property def avg(self): return self.sum / self.n for epoch in range(resume_from_epoch, args.epochs): train(epoch) validate(epoch) save_checkpoint(epoch)
到此,这篇关于 PyTorch 使用 Horovod 进行多 GPU 训练的实现的文章就介绍到这了。更多 PyTorch、Horovod 多 GPU 训练的相关内容,请搜索以前的文章或继续浏览下面的相关文章,希望大家以后多多支持!
上一篇: 浅谈BlueHost主机CDN加速原理