mirror of
https://github.com/wassname/ray.git
synced 2026-07-01 18:39:41 +08:00
[sgd] Add benchmarks (#7454)
* Init fp16 * fp16 and schedulers * scheduler linking and fp16 * to fp16 * loss scaling and documentation * more documentation * add tests, refactor config * moredocs * more docs * fix logo, add test mode, add fp16 flag * fix tests * fix scheduler * fix apex * improve safety * fix tests * fix tests * remove pin memory default * rm * fix * Update doc/examples/doc_code/raysgd_torch_signatures.py * fix * migrate changes from other PR * ok thanks * pass * signatures * lint' * Update python/ray/experimental/sgd/pytorch/utils.py * Apply suggestions from code review Co-Authored-By: Edward Oakes <ed.nmi.oakes@gmail.com> * should address most comments * comments * fix this ci * first_pass * add overrides * override * fixing up operators * format * sgd * constants * rm * revert * save * failures * fixes * trainer * run test * operator * code * op * ok done * operator * sgd test fixes * ok * trainer * format * Apply suggestions from code review Co-Authored-By: Edward Oakes <ed.nmi.oakes@gmail.com> * Update doc/source/raysgd/raysgd_pytorch.rst * docstring * dcgan * doc * commits * nit * testing * revert * Start renaming pytorch to torch * Rename PyTorchTrainer to TorchTrainer * Rename PyTorch runners to Torch runners * Finish renaming API * Rename to torch in tests * Finish renaming docs + tests * Run format + fix DeprecationWarning * fix * move tests up * benchmarks * rename * remove some args * better metrics output * fix up the benchmark * benchmark-yaml * horovod-benchmark * benchmarks * Remove benchmark code for cleanups * benchmark-code * nits * benchmark yamls * benchmark yaml * ok * ok * ok * benchmark * nit * finish_bench * makedatacreator * relax * metrics * autosetsampler * profile * movements * OK * smoothen * fix * nitdocs * loss * envflag * comments * nit * format * visible * images * move_images * fix * rernder * rrender * rest * multgpu * fix * nit * finish * extrra * setup * revert Co-authored-by: Edward Oakes <ed.nmi.oakes@gmail.com> Co-authored-by: Maksim 
Smolin <maximsmol@gmail.com>
This commit is contained in:
@@ -18,13 +18,15 @@ class DistributedTorchRunner(TorchRunner):
|
||||
|
||||
Args:
|
||||
args: Arguments for TorchRunner.
|
||||
backend (string): backend used by distributed PyTorch.
|
||||
backend (string): Backend used by distributed PyTorch.
|
||||
kwargs: Keyword arguments for TorchRunner.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, *args, backend="gloo", **kwargs):
    """Initialize the runner and remember the distributed backend.

    Positional and keyword arguments are forwarded unchanged to
    TorchRunner. ``backend`` must be either "gloo" or "nccl"; any other
    value raises ValueError once the parent initializer has run.
    """
    super(DistributedTorchRunner, self).__init__(*args, **kwargs)
    supported_backends = ("gloo", "nccl")
    if backend in supported_backends:
        self.backend = backend
    else:
        raise ValueError("Backend must be one of 'gloo' or 'nccl'.")
|
||||
|
||||
def setup(self, url, world_rank, world_size):
|
||||
|
||||
@@ -0,0 +1,162 @@
|
||||
Running benchmarks
|
||||
==================
|
||||
|
||||
RaySGD provides comparable or better performance than other existing solutions for parallel or distributed training.
|
||||
|
||||
You can run ``ray/python/ray/util/sgd/torch/examples/benchmarks/benchmark.py`` for benchmarking the RaySGD TorchTrainer implementation. To benchmark training on a multi-node multi-gpu cluster, you can use the `Ray Autoscaler <https://ray.readthedocs.io/en/latest/autoscaling.html#aws>`_.
|
||||
|
||||
DISCLAIMER: RaySGD does not provide any custom communication primitives. If you see any performance issues, you may need to file them on the PyTorch GitHub repository.
|
||||
|
||||
Single Node Results
|
||||
-------------------
|
||||
|
||||
Here are benchmarking results comparing the following:
|
||||
|
||||
* torch.nn.DataParallel
|
||||
* torch.nn.DataParallel with ``apex.amp`` enabled (``O1``)
|
||||
* Ray (wrapping Pytorch DistributedDataParallel)
|
||||
* Ray (wrapping Pytorch DistributedDataParallel) with ``apex.amp`` enabled (``O1``)
|
||||
|
||||
on synthetic ImageNet data (via ``benchmark.py`` and ``dp_benchmark.py``) as of 03/04/2020.
|
||||
|
||||
Framework versions used:
|
||||
|
||||
* PyTorch Version: torch-1.4.0-cp36-cp36m
|
||||
* Torchvision Version: torchvision-0.5.0-cp36-cp36m
|
||||
* Apex Version: commit hash 5633f6d
|
||||
|
||||
.. code-block::
|
||||
|
||||
# Images per second for ResNet50
|
||||
# Batch size per worker = 128
|
||||
# GPU Type = V100
|
||||
# Run on AWS us-east-1c, p3dn.24xlarge instance.
|
||||
|
||||
|
||||
Number DataParallel Ray (PyTorch) DataParallel Ray (PyTorch)
|
||||
of GPUs + Apex + Apex
|
||||
======= ============ ============= ============ ==============
|
||||
1 2769.7 5143 2962.7 6172
|
||||
2 5492.2 9463 5886.1 10052.8
|
||||
4 10733.4 18807 11705.9 20319.5
|
||||
8 21872.5 36911.8 23317.9 38642
|
||||
|
||||
|
||||
.. image:: raysgd_multigpu_benchmark.png
|
||||
:scale: 30%
|
||||
:align: center
|
||||
|
||||
|
||||
Multi Node Results
|
||||
------------------
|
||||
|
||||
Here are benchmarking results comparing the following:
|
||||
|
||||
* Horovod
|
||||
* Horovod with ``apex.amp`` enabled (``O1``)
|
||||
* Pytorch DistributedDataParallel
|
||||
* Pytorch DistributedDataParallel with ``apex.amp`` enabled (``O1``)
|
||||
|
||||
on synthetic ImageNet data (via ``benchmark.py`` and ``horovod_benchmark_apex.py``) as of 03/04/2020.
|
||||
|
||||
|
||||
Framework versions used:
|
||||
|
||||
* PyTorch Version: torch-1.4.0-cp36-cp36m
|
||||
* Torchvision Version: torchvision-0.5.0-cp36-cp36m
|
||||
* Apex Version: commit hash 5633f6d
|
||||
* Horovod Version: horovod-0.19.0
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
# Images per second for ResNet50
|
||||
# Batch size per worker = 128
|
||||
# GPU Type = V100
|
||||
# Run on AWS us-east-1c, p3dn.24xlarge instances.
|
||||
|
||||
Number Horovod Ray (PyTorch) Horovod Ray (PyTorch)
|
||||
of GPUs + Apex + Apex
|
||||
======= ======= ============= ======= ==============
|
||||
1 * 8 2769.7 5143 2962.7 6172
|
||||
2 * 8 5492.2 9463 5886.1 10052.8
|
||||
4 * 8 10733.4 18807 11705.9 20319.5
|
||||
8 * 8 21872.5 36911.8 23317.9 38642
|
||||
|
||||
|
||||
.. image:: raysgd_multinode_benchmark.png
|
||||
:scale: 30%
|
||||
:align: center
|
||||
|
||||
|
||||
Simple Instructions
|
||||
-------------------
|
||||
|
||||
Note that these instructions are not maintained and may require a bit of wrangling to get working.
|
||||
|
||||
First, ``git clone https://github.com/ray-project/ray && cd ray/python/ray/util/sgd/torch/examples/``.
|
||||
|
||||
You can use ``sgd-development.yaml`` to set up your cluster configuration and ``ray up sgd-development.yaml`` to launch the cluster.
|
||||
|
||||
You can specify the number of nodes you want to use with the following configuration:
|
||||
|
||||
.. code-block::
|
||||
|
||||
# The maximum number of worker nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers. min_workers defaults to 0.
|
||||
min_workers: <NUMBER_OF_NODES> # Change this to a custom quantity
|
||||
initial_workers: <NUMBER_OF_NODES> # same as above
|
||||
max_workers: <NUMBER_OF_NODES> # same as above
|
||||
|
||||
You may want to install FP16 support for PyTorch with the following configuration in the YAML file:
|
||||
|
||||
.. code-block:: yaml
|
||||
|
||||
setup_commands:
|
||||
- ray || pip install -U ray[rllib]
|
||||
- pip install -U ipdb torch torchvision
|
||||
# Install apex, but continue if this command fails.
|
||||
# For faster installation purposes, we do not install the apex cpp bindings
|
||||
# The cpp bindings can improve your benchmarked performance.
|
||||
- git clone https://github.com/NVIDIA/apex && cd apex && pip install -v --no-cache-dir ./ || true
|
||||
|
||||
You should then run ``ray monitor sgd-development.yaml`` to monitor the progress of the cluster setup. When the cluster is done setting up, you should see something like the following:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
2020-03-05 01:24:53,613 INFO log_timer.py:17 -- AWSNodeProvider: Set tag ray-node-status=up-to-date on ['i-07ba946522fcb1d3d'] [LogTimer=134ms]
|
||||
2020-03-05 01:24:53,734 INFO log_timer.py:17 -- AWSNodeProvider: Set tag ray-runtime-config=c12bae3df69d4d6a207e90948dc4bf763319d7ed on ['i-07ba946522fcb1d3d'] [LogTimer=121ms]
|
||||
2020-03-05 01:24:58,475 INFO autoscaler.py:733 -- StandardAutoscaler: 7/7 target nodes (0 pending)
|
||||
2020-03-05 01:24:58,476 INFO autoscaler.py:734 -- LoadMetrics: MostDelayedHeartbeats={'172.31.38.189': 0.21588897705078125, '172.31.38.95': 0.21587467193603516, '172.31.42.196': 0.21586227416992188, '172.31.34.227': 0.2158496379852295, '172.31.42.101': 0.2158372402191162}, NodeIdleSeconds=Min=6 Mean=27 Max=40, NumNodesConnected=8, NumNodesUsed=0.0, ResourceUsage=0.0/512.0 CPU, 0.0/64.0 GPU, 0.0 GiB/4098.67 GiB memory, 0.0/1.0 node:172.31.34.227, 0.0/1.0 node:172.31.36.8, 0.0/1.0 node:172.31.36.82, 0.0/1.0 node:172.31.38.189, 0.0/1.0 node:172.31.38.95, 0.0/1.0 node:172.31.42.101, 0.0/1.0 node:172.31.42.196, 0.0/1.0 node:172.31.45.185, 0.0 GiB/5.45 GiB object_store_memory, TimeSinceLastHeartbeat=Min=0 Mean=0 Max=0
|
||||
|
||||
You can then launch a synthetic benchmark run with the following command:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
$ ray submit sgd-development.yaml benchmarks/benchmark.py --args="--batch-size 128"
|
||||
|
||||
# Or with apex fp16
|
||||
$ ray submit sgd-development.yaml benchmarks/benchmark.py --args="--batch-size 128 --fp16"
|
||||
|
||||
You should see something like:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
Model: resnet50
|
||||
Batch size: 128
|
||||
Number of GPUs: 16
|
||||
Iter #0: 354.2 img/sec per GPU
|
||||
Iter #1: 354.0 img/sec per GPU
|
||||
Iter #2: 353.0 img/sec per GPU
|
||||
Iter #3: 353.3 img/sec per GPU
|
||||
Iter #4: 352.8 img/sec per GPU
|
||||
Iter #5: 348.5 img/sec per GPU
|
||||
Iter #6: 352.5 img/sec per GPU
|
||||
Iter #7: 352.5 img/sec per GPU
|
||||
Iter #8: 352.1 img/sec per GPU
|
||||
Iter #9: 352.2 img/sec per GPU
|
||||
Img/sec per GPU: 352.5 +-3.0
|
||||
Total img/sec on 16 GPU(s): 5640.2 +-47.2
|
||||
|
||||
|
||||
You can run ``ray up benchmarks/horovod-benchmark.yaml`` to launch an AWS cluster that sets up Horovod on each machine.
|
||||
See ``https://github.com/horovod/horovod`` for launching Horovod training. ``horovod_benchmark_apex.py`` can be used with ``horovodrun`` to obtain benchmarking results.
|
||||
@@ -0,0 +1,126 @@
|
||||
from __future__ import print_function
|
||||
|
||||
import argparse
|
||||
import torch.nn.functional as F
|
||||
import torch.optim as optim
|
||||
import torch.utils.data.distributed
|
||||
from torchvision import models
|
||||
import timeit
|
||||
import numpy as np
|
||||
|
||||
import ray
|
||||
from ray.util.sgd import TorchTrainer
|
||||
from ray.util.sgd.torch import TrainingOperator
|
||||
|
||||
# Benchmark settings
|
||||
# Command-line options for the RaySGD synthetic benchmark.
parser = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    description="PyTorch Synthetic Benchmark")

# Mixed-precision switch; forwarded to TorchTrainer as ``use_fp16``.
parser.add_argument("--fp16", action="store_true", help="use fp16 training")

# Model and batch configuration.
parser.add_argument(
    "--model", default="resnet50", type=str, help="model to benchmark")
parser.add_argument(
    "--batch-size", default=32, type=int, help="input batch size")

# Timing configuration: warm-up steps are executed but not reported.
parser.add_argument(
    "--num-warmup-batches",
    default=10,
    type=int,
    help="number of warm-up batches that don't count towards benchmark")
parser.add_argument(
    "--num-batches-per-iter",
    default=10,
    type=int,
    help="number of batches per benchmark iteration")
parser.add_argument(
    "--num-iters", default=10, type=int, help="number of benchmark iterations")

# Execution environment switches.
parser.add_argument(
    "--no-cuda", action="store_true", help="Disables CUDA training")
parser.add_argument(
    "--local", action="store_true", help="Disables cluster training")

args = parser.parse_args()

# CUDA is used only when it was not disabled and is actually available.
args.cuda = (not args.no_cuda) and torch.cuda.is_available()

# Label used in printed output and as the Ray resource key for sizing.
if args.cuda:
    device = "GPU"
else:
    device = "CPU"
|
||||
|
||||
|
||||
def init_hook():
    """Turn on the cuDNN benchmark flag in the calling process.

    Passed to TorchTrainer as ``initialization_hook``.
    """
    from torch.backends import cudnn

    cudnn.benchmark = True
|
||||
|
||||
|
||||
class Training(TrainingOperator):
    """Training operator that times optimizer steps on one fixed batch.

    The data loader handed to the operator is ignored; every step reuses
    the random batch built in ``setup``.
    """

    def setup(self, config):
        """Build one random image batch and matching random labels.

        The batch has shape (args.batch_size, 3, 224, 224) and the labels
        are integers in [0, 1000). Both are moved to the GPU when
        ``args.cuda`` is set.
        """
        batch = torch.randn(args.batch_size, 3, 224, 224)
        labels = torch.LongTensor(args.batch_size).random_() % 1000
        if args.cuda:
            batch = batch.cuda()
            labels = labels.cuda()

        self.data = batch
        self.target = labels

    def train_epoch(self, *pargs, **kwargs):
        """Time ``args.num_batches_per_iter`` steps and report throughput.

        On the first call (``self.global_step == 0``) a warm-up of
        ``args.num_warmup_batches`` untimed steps runs first.

        Returns:
            dict: ``{"img_sec": images processed per second}``.
        """

        # NOTE(review): backward() is called on the raw loss even when
        # fp16 is requested; confirm whether loss scaling is intended here.
        def run_step():
            self.optimizer.zero_grad()
            logits = self.model(self.data)
            step_loss = F.cross_entropy(logits, self.target)
            step_loss.backward()
            self.optimizer.step()

        # presumably global_step is initialised by TrainingOperator -
        # TODO confirm against the base class.
        if self.global_step == 0:
            timeit.timeit(run_step, number=args.num_warmup_batches)
            self.global_step += 1

        elapsed = timeit.timeit(run_step, number=args.num_batches_per_iter)
        throughput = args.batch_size * args.num_batches_per_iter / elapsed
        return {"img_sec": throughput}
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Connect to an existing cluster unless --local was given, in which
    # case a fresh local Ray instance is started.
    if args.local:
        ray.init(address=None)
    else:
        ray.init(address="auto")

    # Two workers locally; otherwise one worker per unit of the selected
    # resource ("GPU" or "CPU") reported by the cluster.
    if args.local:
        num_workers = 2
    else:
        num_workers = int(ray.cluster_resources().get(device))

    from ray.util.sgd.torch.examples.train_example import LinearDataset

    print("Model: %s" % args.model)
    print("Batch size: %d" % args.batch_size)
    print("Number of %ss: %d" % (device, num_workers))

    # The data creator only satisfies the TorchTrainer signature; the
    # Training operator feeds its own synthetic batch. The learning rate
    # is scaled linearly by the number of workers through the config.
    trainer_config = dict(lr_scaler=num_workers)
    trainer = TorchTrainer(
        model_creator=lambda cfg: getattr(models, args.model)(),
        optimizer_creator=lambda model, cfg: optim.SGD(
            model.parameters(), lr=0.01 * cfg.get("lr_scaler")),
        data_creator=lambda cfg: LinearDataset(4, 2),
        initialization_hook=init_hook,
        config=trainer_config,
        training_operator_cls=Training,
        num_workers=num_workers,
        use_gpu=args.cuda,
        use_fp16=args.fp16,
    )

    # Each trainer.train() call is one timed benchmark iteration.
    img_secs = []
    for iteration in range(args.num_iters):
        stats = trainer.train()
        per_device_rate = stats["img_sec"]
        print("Iter #%d: %.1f img/sec per %s" % (iteration, per_device_rate,
                                                 device))
        img_secs.append(per_device_rate)

    # Results: mean and 1.96 * standard deviation over the iterations,
    # reported per device and scaled by the worker count for the total.
    img_sec_mean = np.mean(img_secs)
    img_sec_conf = 1.96 * np.std(img_secs)
    print("Img/sec per %s: %.1f +-%.1f" % (device, img_sec_mean, img_sec_conf))
    total_mean = num_workers * img_sec_mean
    total_conf = num_workers * img_sec_conf
    print("Total img/sec on %d %s(s): %.1f +-%.1f" %
          (num_workers, device, total_mean, total_conf))
|
||||
@@ -0,0 +1,106 @@
|
||||
from __future__ import print_function
|
||||
|
||||
import argparse
|
||||
import timeit
|
||||
import torch.backends.cudnn as cudnn
|
||||
import torch.nn.functional as F
|
||||
import torch.optim as optim
|
||||
import torch.utils.data.distributed
|
||||
from torch.nn import DataParallel
|
||||
from torchvision import models
|
||||
import numpy as np
|
||||
import os
|
||||
# Apex
|
||||
from apex import amp
|
||||
|
||||
# Benchmark settings
|
||||
# Command-line options for the torch.nn.DataParallel synthetic benchmark.
parser = argparse.ArgumentParser(
    description="PyTorch DP Synthetic Benchmark",
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)

# Kept for command-line compatibility; the visible part of this script
# does not read args.fp16_allreduce.
parser.add_argument(
    "--fp16-allreduce",
    action="store_true",
    default=False,
    help="use fp16 compression during allreduce")

parser.add_argument(
    "--model", type=str, default="resnet50", help="model to benchmark")
parser.add_argument(
    "--batch-size", type=int, default=32, help="input batch size")
parser.add_argument("--num-gpus", type=int, default=1, help="number of gpus")

# Warm-up steps are executed but excluded from the reported throughput.
# The help text previously contained a stray double quote ('don"t').
parser.add_argument(
    "--num-warmup-batches",
    type=int,
    default=10,
    help="number of warm-up batches that don't count towards benchmark")
parser.add_argument(
    "--num-batches-per-iter",
    type=int,
    default=10,
    help="number of batches per benchmark iteration")
parser.add_argument(
    "--num-iters", type=int, default=10, help="number of benchmark iterations")
parser.add_argument(
    "--amp-fp16",
    action="store_true",
    default=False,
    help="Enables FP16 training with Apex.")

args = parser.parse_args()
|
||||
# Restrict the process to the first --num-gpus devices.
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
    str(i) for i in range(args.num_gpus))

cudnn.benchmark = True

# Set up standard model.
model = getattr(models, args.model)().cuda()

optimizer = optim.SGD(model.parameters(), lr=0.01)

# Apex: amp.initialize must receive the bare model. Apex expects parallel
# wrappers to be applied only to the model it returns, so this has to run
# before the DataParallel wrap below.
if args.amp_fp16:
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

# Wrap last so both the fp32 and the amp path end up data-parallel.
model = DataParallel(model)

# Set up fixed fake data: one random image batch with labels in [0, 1000).
data = torch.randn(args.batch_size, 3, 224, 224)
target = torch.LongTensor(args.batch_size).random_() % 1000
data, target = data.cuda(), target.cuda()
|
||||
|
||||
|
||||
def benchmark_step():
    """Run one optimizer step on the fixed synthetic batch.

    Uses the module-level ``model``, ``optimizer``, ``data`` and
    ``target``. When ``--amp-fp16`` is set, the backward pass goes through
    ``amp.scale_loss`` so Apex loss scaling is applied, matching the
    Horovod benchmark's step.
    """
    optimizer.zero_grad()
    output = model(data)
    loss = F.cross_entropy(output, target)
    if args.amp_fp16:
        # Apex: backward on the scaled loss; gradients are unscaled when
        # the context manager exits.
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
    else:
        loss.backward()
    optimizer.step()
|
||||
|
||||
|
||||
# Report the run configuration.
print("Model: %s" % args.model)
print("Batch size: %d" % args.batch_size)
device = "GPU"
print("Number of %ss: %d" % (device, args.num_gpus))

# Warm-up: executed but not measured.
print("Running warmup...")
timeit.timeit(benchmark_step, number=args.num_warmup_batches)

# Benchmark: each iteration times num_batches_per_iter steps.
print("Running benchmark...")
img_secs = []
for x in range(args.num_iters):
    elapsed = timeit.timeit(benchmark_step, number=args.num_batches_per_iter)
    # Computed from the full args.batch_size, so this is already the
    # throughput of the whole step rather than of a single device.
    img_sec = args.batch_size * args.num_batches_per_iter / elapsed
    print("Iter #%d: %.1f img/sec per %s" % (x, img_sec, device))
    img_secs.append(img_sec)

# Results: mean and 1.96 * standard deviation over the iterations.
img_sec_mean = np.mean(img_secs)
img_sec_conf = 1.96 * np.std(img_secs)
print("Img/sec per %s: %.1f +-%.1f" % (device, img_sec_mean, img_sec_conf))
# Neither the mean nor its interval is scaled by the number of GPUs: the
# interval was previously multiplied by num_gpus while the mean was not,
# which overstated the spread of the reported total.
print("Total img/sec on %d %s(s): %.1f +-%.1f" % (
    args.num_gpus,
    device,
    img_sec_mean,
    img_sec_conf))
|
||||
@@ -0,0 +1,85 @@
|
||||
# A unique identifier for the head node and workers of this cluster.
|
||||
cluster_name: horovod-pytorch
|
||||
|
||||
# The maximum number of worker nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers. min_workers defaults to 0.
|
||||
min_workers: 1
|
||||
initial_workers: 1
|
||||
max_workers: 1
|
||||
|
||||
target_utilization_fraction: 0.9
|
||||
|
||||
# If a node is idle for this many minutes, it will be removed.
|
||||
idle_timeout_minutes: 50
|
||||
# docker:
|
||||
# image: tensorflow/tensorflow:1.5.0-py3
|
||||
# container_name: ray_docker
|
||||
|
||||
# Cloud-provider specific configuration.
|
||||
provider:
|
||||
type: aws
|
||||
region: us-east-1
|
||||
availability_zone: us-east-1c
|
||||
|
||||
# How Ray will authenticate with newly launched nodes.
|
||||
auth:
|
||||
ssh_user: ubuntu
|
||||
|
||||
|
||||
head_node:
|
||||
InstanceType: p3dn.24xlarge
|
||||
ImageId: ami-0698bcaf8bd9ef56d
|
||||
InstanceMarketOptions:
|
||||
MarketType: spot
|
||||
BlockDeviceMappings:
|
||||
- DeviceName: /dev/sda1
|
||||
Ebs:
|
||||
VolumeSize: 250
|
||||
# SpotOptions:
|
||||
# MaxPrice: "9.0"
|
||||
|
||||
|
||||
worker_nodes:
|
||||
InstanceType: p3dn.24xlarge
|
||||
ImageId: ami-0698bcaf8bd9ef56d
|
||||
InstanceMarketOptions:
|
||||
MarketType: spot
|
||||
BlockDeviceMappings:
|
||||
- DeviceName: /dev/sda1
|
||||
Ebs:
|
||||
VolumeSize: 250
|
||||
# SpotOptions:
|
||||
# MaxPrice: "9.0"
|
||||
# # Run workers on spot by default. Comment this out to use on-demand.
|
||||
# InstanceMarketOptions:
|
||||
# MarketType: spot
|
||||
|
||||
setup_commands:
|
||||
- pip install torch torchvision ipdb
|
||||
- pip install ray[rllib] # enable autoscaling
|
||||
- git clone https://github.com/horovod/horovod || true
|
||||
- git clone https://github.com/NVIDIA/apex && cd apex && pip install -v --no-cache-dir ./ || true
|
||||
- tmux new -d -s my-session "HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL pip install horovod"
|
||||
|
||||
|
||||
file_mounts: {}
|
||||
|
||||
# Custom commands that will be run on the head node after common setup.
|
||||
head_setup_commands:
|
||||
- cat ~/ray_bootstrap_key.pem > ~/.ssh/id_rsa
|
||||
|
||||
# Custom commands that will be run on worker nodes after common setup.
|
||||
worker_setup_commands:
|
||||
- pip install horovod
|
||||
|
||||
# # Command to start ray on the head node. You don't need to change this.
|
||||
head_start_ray_commands:
|
||||
- ray stop
|
||||
- ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --object-store-memory=1000000000
|
||||
|
||||
# Command to start ray on worker nodes. You don't need to change this.
|
||||
worker_start_ray_commands:
|
||||
- ray stop
|
||||
- ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076
|
||||
# - nvidia-docker run -it --network=host -d --rm -p 4321:22 horovod:latest bash -c "pip install Pillow==6.1; sleep infinity"
|
||||
|
||||
@@ -0,0 +1,144 @@
|
||||
from __future__ import print_function
|
||||
|
||||
import argparse
|
||||
import torch.backends.cudnn as cudnn
|
||||
import torch.nn.functional as F
|
||||
import torch.optim as optim
|
||||
import torch.utils.data.distributed
|
||||
from torchvision import models
|
||||
import horovod.torch as hvd
|
||||
import timeit
|
||||
import numpy as np
|
||||
# Apex
|
||||
from apex import amp
|
||||
|
||||
# Benchmark settings
|
||||
# Command-line options for the Horovod (+ optional Apex) synthetic benchmark.
parser = argparse.ArgumentParser(
    description="PyTorch Synthetic Benchmark",
    formatter_class=argparse.ArgumentDefaultsHelpFormatter)

# Selects hvd.Compression.fp16 for gradient allreduce when set.
parser.add_argument(
    "--fp16-allreduce",
    action="store_true",
    default=False,
    help="use fp16 compression during allreduce")

parser.add_argument(
    "--model", type=str, default="resnet50", help="model to benchmark")
parser.add_argument(
    "--batch-size", type=int, default=32, help="input batch size")

# Warm-up steps are executed but excluded from the reported throughput.
# The help text previously contained a stray double quote ('don"t').
parser.add_argument(
    "--num-warmup-batches",
    type=int,
    default=10,
    help="number of warm-up batches that don't count towards benchmark")
parser.add_argument(
    "--num-batches-per-iter",
    type=int,
    default=10,
    help="number of batches per benchmark iteration")
parser.add_argument(
    "--num-iters", type=int, default=10, help="number of benchmark iterations")

parser.add_argument(
    "--no-cuda",
    action="store_true",
    default=False,
    help="disables CUDA training")
parser.add_argument(
    "--amp-fp16",
    action="store_true",
    default=False,
    help="Enables FP16 training with Apex.")

args = parser.parse_args()
# CUDA is used only when it was not disabled and is actually available.
args.cuda = not args.no_cuda and torch.cuda.is_available()
|
||||
|
||||
# Start Horovod before touching any device state.
hvd.init()

# Horovod: each process drives the GPU matching its local rank.
if args.cuda:
    torch.cuda.set_device(hvd.local_rank())

cudnn.benchmark = True

# Build the requested torchvision model and move it to the GPU if used.
model = getattr(models, args.model)()
if args.cuda:
    model.cuda()

optimizer = optim.SGD(model.parameters(), lr=0.01)

# Horovod: optionally compress gradients to fp16 during allreduce.
if args.fp16_allreduce:
    compression = hvd.Compression.fp16
else:
    compression = hvd.Compression.none

# Horovod: the distributed optimizer wraps the plain SGD optimizer.
optimizer = hvd.DistributedOptimizer(
    optimizer,
    compression=compression,
    named_parameters=model.named_parameters())

# Horovod: start every rank from rank 0's parameters and optimizer state.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)

# Apex: enable mixed precision (O1) on the already-wrapped optimizer.
if args.amp_fp16:
    model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

# Fixed fake data: one random image batch with labels in [0, 1000).
data = torch.randn(args.batch_size, 3, 224, 224)
target = torch.LongTensor(args.batch_size).random_() % 1000
if args.cuda:
    data = data.cuda()
    target = target.cuda()
|
||||
|
||||
|
||||
def benchmark_step():
    """Run one optimizer step on the fixed synthetic batch.

    Uses the module-level ``model``, ``optimizer``, ``data`` and
    ``target``. With ``--amp-fp16`` the backward pass runs on the Apex
    scaled loss and the Horovod allreduce is synchronized explicitly
    before stepping.
    """
    optimizer.zero_grad()
    prediction = model(data)
    loss = F.cross_entropy(prediction, target)

    if not args.amp_fp16:
        # Plain fp32 path.
        loss.backward()
        optimizer.step()
        return

    # Apex: finish the allreduce while the loss scale is still active,
    # then step without synchronizing a second time.
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
        optimizer.synchronize()
    with optimizer.skip_synchronize():
        optimizer.step()
|
||||
|
||||
|
||||
def log(s, nl=True):
    """Print ``s`` on Horovod rank 0 only.

    Args:
        s: Text to print.
        nl: When true, end the output with a newline; otherwise print
            nothing after ``s``.
    """
    if hvd.rank() == 0:
        terminator = "\n" if nl else ""
        print(s, end=terminator)
|
||||
|
||||
|
||||
# Report the run configuration (rank 0 only, via log()).
log("Model: %s" % args.model)
log("Batch size: %d" % args.batch_size)
if args.cuda:
    device = "GPU"
else:
    device = "CPU"
log("Number of %ss: %d" % (device, hvd.size()))

# Warm-up: executed but not measured.
log("Running warmup...")
timeit.timeit(benchmark_step, number=args.num_warmup_batches)

# Benchmark: each iteration times num_batches_per_iter steps.
log("Running benchmark...")
img_secs = []
for iteration in range(args.num_iters):
    elapsed = timeit.timeit(benchmark_step, number=args.num_batches_per_iter)
    rate = args.batch_size * args.num_batches_per_iter / elapsed
    log("Iter #%d: %.1f img/sec per %s" % (iteration, rate, device))
    img_secs.append(rate)

# Results: per-process mean and 1.96 * standard deviation, then the same
# figures multiplied by the number of Horovod processes for the total.
img_sec_mean = np.mean(img_secs)
img_sec_conf = 1.96 * np.std(img_secs)
log("Img/sec per %s: %.1f +-%.1f" % (device, img_sec_mean, img_sec_conf))
summary = (hvd.size(), device, hvd.size() * img_sec_mean,
           hvd.size() * img_sec_conf)
log("Total img/sec on %d %s(s): %.1f +-%.1f" % summary)
|
||||
Binary file not shown.
|
After Width: | Height: | Size: 15 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 15 KiB |
@@ -0,0 +1,94 @@
|
||||
# A unique identifier for the head node and workers of this cluster.
|
||||
cluster_name: sgd-pytorch
|
||||
|
||||
# The maximum number of worker nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers. min_workers defaults to 0.
|
||||
min_workers: 0
|
||||
initial_workers: 0
|
||||
max_workers: 0
|
||||
|
||||
target_utilization_fraction: 0.9
|
||||
|
||||
# If a node is idle for this many minutes, it will be removed.
|
||||
idle_timeout_minutes: 10
|
||||
# docker:
|
||||
# image: tensorflow/tensorflow:1.5.0-py3
|
||||
# container_name: ray_docker
|
||||
|
||||
# Cloud-provider specific configuration.
|
||||
provider:
|
||||
type: aws
|
||||
region: us-east-1
|
||||
availability_zone: us-east-1c
|
||||
|
||||
# How Ray will authenticate with newly launched nodes.
|
||||
auth:
|
||||
ssh_user: ubuntu
|
||||
# ssh_private_key: ...
|
||||
|
||||
head_node:
|
||||
InstanceType: p3dn.24xlarge
|
||||
ImageId: ami-0698bcaf8bd9ef56d
|
||||
# KeyName: ...
|
||||
InstanceMarketOptions:
|
||||
MarketType: spot
|
||||
BlockDeviceMappings:
|
||||
- DeviceName: /dev/sda1
|
||||
Ebs:
|
||||
VolumeSize: 300
|
||||
# SpotOptions:
|
||||
# MaxPrice: "9.0"
|
||||
|
||||
|
||||
worker_nodes:
|
||||
InstanceType: p3.16xlarge
|
||||
ImageId: ami-0698bcaf8bd9ef56d
|
||||
# KeyName: ...
|
||||
InstanceMarketOptions:
|
||||
MarketType: spot
|
||||
BlockDeviceMappings:
|
||||
- DeviceName: /dev/sda1
|
||||
Ebs:
|
||||
VolumeSize: 300
|
||||
# SpotOptions:
|
||||
# MaxPrice: "9.0"
|
||||
# # Run workers on spot by default. Comment this out to use on-demand.
|
||||
# InstanceMarketOptions:
|
||||
# MarketType: spot
|
||||
|
||||
setup_commands:
|
||||
# This replaces the standard anaconda Ray installation
|
||||
- ray || pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.9.0.dev0-cp36-cp36m-manylinux1_x86_64.whl
|
||||
# Uncomment this and the filemount to update the Ray installation with your local Ray code
|
||||
# - rm -rf ./anaconda3/lib/python3.6/site-packages/ray/util/sgd/
|
||||
# - cp -rf ~/sgd ./anaconda3/lib/python3.6/site-packages/ray/util/
|
||||
|
||||
# Installing this without -U to make sure we don't replace the existing Ray installation
|
||||
- pip install ray[rllib]
|
||||
- pip install -U ipdb torch torchvision
|
||||
# Install Apex
|
||||
- rm -rf apex || true
|
||||
- git clone https://github.com/NVIDIA/apex && cd apex && pip install -v --no-cache-dir ./ || true
|
||||
|
||||
|
||||
file_mounts: {
|
||||
# This should point to ray/python/ray/util/sgd.
|
||||
# ~/sgd: ../../../sgd,
|
||||
}
|
||||
|
||||
# Custom commands that will be run on the head node after common setup.
|
||||
head_setup_commands: []
|
||||
|
||||
# Custom commands that will be run on worker nodes after common setup.
|
||||
worker_setup_commands: []
|
||||
|
||||
# # Command to start ray on the head node. You don't need to change this.
|
||||
head_start_ray_commands:
|
||||
- ray stop
|
||||
- ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --object-store-memory=1000000000
|
||||
|
||||
# Command to start ray on worker nodes. You don't need to change this.
|
||||
worker_start_ray_commands:
|
||||
- ray stop
|
||||
- ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --object-store-memory=1000000000
|
||||
|
||||
@@ -114,7 +114,7 @@ class TorchRunner:
|
||||
else:
|
||||
self.criterion = self.loss_creator(self.config)
|
||||
|
||||
if torch.cuda.is_available() and hasattr("cuda", self.criterion):
|
||||
if torch.cuda.is_available() and hasattr(self.criterion, "cuda"):
|
||||
self.criterion = self.criterion.cuda()
|
||||
|
||||
def _create_schedulers_if_available(self):
|
||||
|
||||
@@ -525,7 +525,6 @@ class TorchTrainer:
|
||||
return
|
||||
else:
|
||||
delay = 2**i
|
||||
logger.info("Resources: {}".format(resources))
|
||||
logger.warning(
|
||||
"No new workers found. Retrying in %d sec." % delay)
|
||||
time.sleep(delay)
|
||||
@@ -562,7 +561,6 @@ class TorchTrainable(Trainable):
|
||||
validation_stats = self._trainer.validate()
|
||||
|
||||
train_stats.update(validation_stats)
|
||||
# output {"mean_loss": test_loss, "mean_accuracy": accuracy}
|
||||
return train_stats
|
||||
|
||||
def _save(self, checkpoint_dir):
|
||||
|
||||
Reference in New Issue
Block a user