diff --git a/doc/source/raysgd/raysgd_pytorch.rst b/doc/source/raysgd/raysgd_pytorch.rst index d6865a8fb..a7cc457aa 100644 --- a/doc/source/raysgd/raysgd_pytorch.rst +++ b/doc/source/raysgd/raysgd_pytorch.rst @@ -525,6 +525,53 @@ You can see the `DCGAN script `_. + +DISCLAIMER: RaySGD does not provide any custom communication primitives. If you see any performance issues, you may need to file them on the PyTorch github repository. + Feature Requests ---------------- diff --git a/python/ray/util/sgd/torch/distributed_torch_runner.py b/python/ray/util/sgd/torch/distributed_torch_runner.py index 58b01c48f..5c588ed80 100644 --- a/python/ray/util/sgd/torch/distributed_torch_runner.py +++ b/python/ray/util/sgd/torch/distributed_torch_runner.py @@ -18,13 +18,15 @@ class DistributedTorchRunner(TorchRunner): Args: args: Arguments for TorchRunner. - backend (string): backend used by distributed PyTorch. + backend (string): Backend used by distributed PyTorch. kwargs: Keyword arguments for TorchRunner. """ def __init__(self, *args, backend="gloo", **kwargs): super(DistributedTorchRunner, self).__init__(*args, **kwargs) + if backend not in ("gloo", "nccl"): + raise ValueError("Backend must be one of 'gloo' or 'nccl'.") self.backend = backend def setup(self, url, world_rank, world_size): diff --git a/python/ray/util/sgd/torch/examples/benchmarks/README.rst b/python/ray/util/sgd/torch/examples/benchmarks/README.rst new file mode 100644 index 000000000..42aa31f96 --- /dev/null +++ b/python/ray/util/sgd/torch/examples/benchmarks/README.rst @@ -0,0 +1,162 @@ +Running benchmarks +================== + +RaySGD provides comparable or better performance than other existing solutions for parallel or distributed training. + +You can run ``ray/python/ray/util/sgd/torch/examples/benchmarks/benchmark.py`` for benchmarking the RaySGD TorchTrainer implementation. To benchmark training on a multi-node multi-gpu cluster, you can use the `Ray Autoscaler `_. 
+ +DISCLAIMER: RaySGD does not provide any custom communication primitives. If you see any performance issues, you may need to file them on the PyTorch GitHub repository. + +Single Node Results +------------------- + +Here are benchmarking results comparing the following: + +* torch.nn.DataParallel +* torch.nn.DataParallel with ``apex.amp`` enabled (``O1``) +* Ray (wrapping PyTorch DistributedDataParallel) +* Ray (wrapping PyTorch DistributedDataParallel) with ``apex.amp`` enabled (``O1``) + +on synthetic ImageNet data (via ``benchmark.py`` and ``dp_benchmark.py``) as of 03/04/2020. + +Framework versions used: + +* PyTorch Version: torch-1.4.0-cp36-cp36m +* Torchvision Version: torchvision-0.5.0-cp36-cp36m +* Apex Version: commit hash 5633f6d + +.. code-block:: + + # Images per second for ResNet50 + # Batch size per worker = 128 + # GPU Type = V100 + # Run on AWS us-east-1c, p3dn.24xlarge instance. + + + Number DataParallel Ray (PyTorch) DataParallel Ray (PyTorch) + of GPUs + Apex + Apex + ======= ============ ============= ============ ============== + 1 2769.7 5143 2962.7 6172 + 2 5492.2 9463 5886.1 10052.8 + 4 10733.4 18807 11705.9 20319.5 + 8 21872.5 36911.8 23317.9 38642 + + +.. image:: raysgd_multigpu_benchmark.png + :scale: 30% + :align: center + + +Multi Node Results +------------------ + +Here are benchmarking results comparing the following: + +* Horovod +* Horovod with ``apex.amp`` enabled (``O1``) +* PyTorch DistributedDataParallel +* PyTorch DistributedDataParallel with ``apex.amp`` enabled (``O1``) + +on synthetic ImageNet data (via ``benchmark.py`` and ``horovod_benchmark_apex.py``) as of 03/04/2020. + + +Framework versions used: + +* PyTorch Version: torch-1.4.0-cp36-cp36m +* Torchvision Version: torchvision-0.5.0-cp36-cp36m +* Apex Version: commit hash 5633f6d +* Horovod Version: horovod-0.19.0 + +..
code-block:: bash + + # Images per second for ResNet50 + # Batch size per worker = 128 + # GPU Type = V100 + # Run on AWS us-east-1c, p3dn.24xlarge instances. + + Number Horovod Ray (PyTorch) Horovod Ray (PyTorch) + of GPUs + Apex + Apex + ======= ======= ============= ======= ============== + 1 * 8 2769.7 5143 2962.7 6172 + 2 * 8 5492.2 9463 5886.1 10052.8 + 4 * 8 10733.4 18807 11705.9 20319.5 + 8 * 8 21872.5 36911.8 23317.9 38642 + + +.. image:: raysgd_multinode_benchmark.png + :scale: 30% + :align: center + + +Simple Instructions +------------------- + +Note that these instructions are not maintained and may require a bit of wrangling to get working. + +First, ``git clone https://github.com/ray-project/ray && cd ray/python/ray/util/sgd/torch/examples/``. + +You can use ``sgd-development.yaml`` to setup your cluster configuration and ``ray up sgd-development.yaml`` to launch the cluster. + +You can specify the number of nodes you want to use with the following configuration: + +.. code-block:: + + # The maximum number of workers nodes to launch in addition to the head + # node. This takes precedence over min_workers. min_workers default to 0. + min_workers: # Change this to a custom quantity + initial_workers: # same as above + max_workers: # same as above + +You may want to install FP16 support for PyTorch with the following configuration in the YAML file: + +.. code-block:: yaml + + setup_commands: + - ray || pip install -U ray[rllib] + - pip install -U ipdb torch torchvision + # Install apex, but continue if this command fails. + # For faster installation purposes, we do not install the apex cpp bindings + # The cpp bindings can improve your benchmarked performance. + - git clone https://github.com/NVIDIA/apex && cd apex && pip install -v --no-cache-dir ./ || true + +You should then run ``ray monitor sgd-development.yaml`` to monitor the progress of the cluster setup. When the cluster is done setting up, you should see something like the following: + +.. 
code-block:: bash + + 2020-03-05 01:24:53,613 INFO log_timer.py:17 -- AWSNodeProvider: Set tag ray-node-status=up-to-date on ['i-07ba946522fcb1d3d'] [LogTimer=134ms] + 2020-03-05 01:24:53,734 INFO log_timer.py:17 -- AWSNodeProvider: Set tag ray-runtime-config=c12bae3df69d4d6a207e90948dc4bf763319d7ed on ['i-07ba946522fcb1d3d'] [LogTimer=121ms] + 2020-03-05 01:24:58,475 INFO autoscaler.py:733 -- StandardAutoscaler: 7/7 target nodes (0 pending) + 2020-03-05 01:24:58,476 INFO autoscaler.py:734 -- LoadMetrics: MostDelayedHeartbeats={'172.31.38.189': 0.21588897705078125, '172.31.38.95': 0.21587467193603516, '172.31.42.196': 0.21586227416992188, '172.31.34.227': 0.2158496379852295, '172.31.42.101': 0.2158372402191162}, NodeIdleSeconds=Min=6 Mean=27 Max=40, NumNodesConnected=8, NumNodesUsed=0.0, ResourceUsage=0.0/512.0 CPU, 0.0/64.0 GPU, 0.0 GiB/4098.67 GiB memory, 0.0/1.0 node:172.31.34.227, 0.0/1.0 node:172.31.36.8, 0.0/1.0 node:172.31.36.82, 0.0/1.0 node:172.31.38.189, 0.0/1.0 node:172.31.38.95, 0.0/1.0 node:172.31.42.101, 0.0/1.0 node:172.31.42.196, 0.0/1.0 node:172.31.45.185, 0.0 GiB/5.45 GiB object_store_memory, TimeSinceLastHeartbeat=Min=0 Mean=0 Max=0 + +You can then launch a synthetic benchmark run with the following command: + +.. code-block:: bash + + $ ray submit sgd-development.yaml benchmarks/benchmark.py --args="--batch-size 128" + + # Or with apex fp16 + $ ray submit sgd-development.yaml benchmarks/benchmark.py --args="--batch-size 128 --fp16" + +You should see something like: + +..
code-block:: bash + + Model: resnet50 + Batch size: 128 + Number of GPUs: 16 + Iter #0: 354.2 img/sec per GPU + Iter #1: 354.0 img/sec per GPU + Iter #2: 353.0 img/sec per GPU + Iter #3: 353.3 img/sec per GPU + Iter #4: 352.8 img/sec per GPU + Iter #5: 348.5 img/sec per GPU + Iter #6: 352.5 img/sec per GPU + Iter #7: 352.5 img/sec per GPU + Iter #8: 352.1 img/sec per GPU + Iter #9: 352.2 img/sec per GPU + Img/sec per GPU: 352.5 +-3.0 + Total img/sec on 16 GPU(s): 5640.2 +-47.2 + + +You can run ``ray up benchmarks/horovod-benchmark.yaml`` to launch an AWS cluster that sets up Horovod on each machine. +See ``https://github.com/horovod/horovod`` for launching Horovod training. ``horovod_benchmark_apex.py`` can be used with ``horovodrun`` to obtain benchmarking results. diff --git a/python/ray/util/sgd/torch/examples/benchmarks/benchmark.py b/python/ray/util/sgd/torch/examples/benchmarks/benchmark.py new file mode 100644 index 000000000..6155d9e21 --- /dev/null +++ b/python/ray/util/sgd/torch/examples/benchmarks/benchmark.py @@ -0,0 +1,126 @@ +from __future__ import print_function + +import argparse +import torch.nn.functional as F +import torch.optim as optim +import torch.utils.data.distributed +from torchvision import models +import timeit +import numpy as np + +import ray +from ray.util.sgd import TorchTrainer +from ray.util.sgd.torch import TrainingOperator + +# Benchmark settings +parser = argparse.ArgumentParser( + description="PyTorch Synthetic Benchmark", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument( + "--fp16", action="store_true", default=False, help="use fp16 training") + +parser.add_argument( + "--model", type=str, default="resnet50", help="model to benchmark") +parser.add_argument( + "--batch-size", type=int, default=32, help="input batch size") + +parser.add_argument( + "--num-warmup-batches", + type=int, + default=10, + help="number of warm-up batches that don't count towards benchmark") +parser.add_argument( + 
"--num-batches-per-iter", + type=int, + default=10, + help="number of batches per benchmark iteration") +parser.add_argument( + "--num-iters", type=int, default=10, help="number of benchmark iterations") + +parser.add_argument( + "--no-cuda", + action="store_true", + default=False, + help="Disables CUDA training") +parser.add_argument( + "--local", + action="store_true", + default=False, + help="Disables cluster training") + +args = parser.parse_args() +args.cuda = not args.no_cuda and torch.cuda.is_available() +device = "GPU" if args.cuda else "CPU" + + +def init_hook(): + import torch.backends.cudnn as cudnn + cudnn.benchmark = True + + +class Training(TrainingOperator): + def setup(self, config): + data = torch.randn(args.batch_size, 3, 224, 224) + target = torch.LongTensor(args.batch_size).random_() % 1000 + if args.cuda: + data, target = data.cuda(), target.cuda() + + self.data, self.target = data, target + + def train_epoch(self, *pargs, **kwargs): + # print(self.model) + def benchmark(): + self.optimizer.zero_grad() + output = self.model(self.data) + loss = F.cross_entropy(output, self.target) + loss.backward() + self.optimizer.step() + + # print("Running warmup...") + if self.global_step == 0: + timeit.timeit(benchmark, number=args.num_warmup_batches) + self.global_step += 1 + # print("Running benchmark...") + time = timeit.timeit(benchmark, number=args.num_batches_per_iter) + img_sec = args.batch_size * args.num_batches_per_iter / time + return {"img_sec": img_sec} + + +if __name__ == "__main__": + ray.init(address=None if args.local else "auto") + num_workers = 2 if args.local else int(ray.cluster_resources().get(device)) + from ray.util.sgd.torch.examples.train_example import LinearDataset + + print("Model: %s" % args.model) + print("Batch size: %d" % args.batch_size) + print("Number of %ss: %d" % (device, num_workers)) + + trainer = TorchTrainer( + model_creator=lambda cfg: getattr(models, args.model)(), + optimizer_creator=lambda model, cfg: optim.SGD( 
+ model.parameters(), lr=0.01 * cfg.get("lr_scaler")), + data_creator=lambda cfg: LinearDataset(4, 2), + initialization_hook=init_hook, + config=dict( + lr_scaler=num_workers), + training_operator_cls=Training, + num_workers=num_workers, + use_gpu=args.cuda, + use_fp16=args.fp16, + ) + + img_secs = [] + for x in range(args.num_iters): + result = trainer.train() + # print(result) + img_sec = result["img_sec"] + print("Iter #%d: %.1f img/sec per %s" % (x, img_sec, device)) + img_secs.append(img_sec) + + # Results + img_sec_mean = np.mean(img_secs) + img_sec_conf = 1.96 * np.std(img_secs) + print("Img/sec per %s: %.1f +-%.1f" % (device, img_sec_mean, img_sec_conf)) + print("Total img/sec on %d %s(s): %.1f +-%.1f" % + (num_workers, device, num_workers * img_sec_mean, + num_workers * img_sec_conf)) diff --git a/python/ray/util/sgd/torch/examples/benchmarks/dp_benchmark.py b/python/ray/util/sgd/torch/examples/benchmarks/dp_benchmark.py new file mode 100644 index 000000000..80fd80a5b --- /dev/null +++ b/python/ray/util/sgd/torch/examples/benchmarks/dp_benchmark.py @@ -0,0 +1,106 @@ +from __future__ import print_function + +import argparse +import timeit +import torch.backends.cudnn as cudnn +import torch.nn.functional as F +import torch.optim as optim +import torch.utils.data.distributed +from torch.nn import DataParallel +from torchvision import models +import numpy as np +import os +# Apex +from apex import amp + +# Benchmark settings +parser = argparse.ArgumentParser( + description="PyTorch DP Synthetic Benchmark", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument( + "--fp16-allreduce", + action="store_true", + default=False, + help="use fp16 compression during allreduce") + +parser.add_argument( + "--model", type=str, default="resnet50", help="model to benchmark") +parser.add_argument( + "--batch-size", type=int, default=32, help="input batch size") +parser.add_argument("--num-gpus", type=int, default=1, help="number of gpus") + 
+parser.add_argument( + "--num-warmup-batches", + type=int, + default=10, + help="number of warm-up batches that don't count towards benchmark") +parser.add_argument( + "--num-batches-per-iter", + type=int, + default=10, + help="number of batches per benchmark iteration") +parser.add_argument( + "--num-iters", type=int, default=10, help="number of benchmark iterations") +parser.add_argument( + "--amp-fp16", + action="store_true", + default=False, + help="Enables FP16 training with Apex.") + +args = parser.parse_args() +os.environ["CUDA_VISIBLE_DEVICES"] = ",".join( + str(i) for i in range(args.num_gpus)) + +cudnn.benchmark = True + +# Set up standard model. +model = getattr(models, args.model)().cuda() + +optimizer = optim.SGD(model.parameters(), lr=0.01) + +# Apex: amp.initialize must see the bare model, before any parallel wrapper. +if args.amp_fp16: + model, optimizer = amp.initialize(model, optimizer, opt_level="O1") +model = DataParallel(model) + +# Set up fixed fake data +data = torch.randn(args.batch_size, 3, 224, 224) +target = torch.LongTensor(args.batch_size).random_() % 1000 +data, target = data.cuda(), target.cuda() + + +def benchmark_step(): + optimizer.zero_grad() + output = model(data) + loss = F.cross_entropy(output, target) + loss.backward() + optimizer.step() + + +print("Model: %s" % args.model) +print("Batch size: %d" % args.batch_size) +device = "GPU" +print("Number of %ss: %d" % (device, args.num_gpus)) + +# Warm-up +print("Running warmup...") +timeit.timeit(benchmark_step, number=args.num_warmup_batches) + +# Benchmark +print("Running benchmark...") +img_secs = [] +for x in range(args.num_iters): + time = timeit.timeit(benchmark_step, number=args.num_batches_per_iter) + img_sec = args.batch_size * args.num_batches_per_iter / time + print("Iter #%d: %.1f img/sec per %s" % (x, img_sec, device)) + img_secs.append(img_sec) + +# Results +img_sec_mean = np.mean(img_secs) +img_sec_conf = 1.96 * np.std(img_secs) +print("Img/sec per %s: %.1f +-%.1f" % (device, img_sec_mean, img_sec_conf)) +print("Total img/sec on %d
%s(s): %.1f +-%.1f" % ( + args.num_gpus, + device, + img_sec_mean, # DataParallel total: do NOT scale mean or interval + img_sec_conf)) diff --git a/python/ray/util/sgd/torch/examples/benchmarks/horovod-benchmark.yaml b/python/ray/util/sgd/torch/examples/benchmarks/horovod-benchmark.yaml new file mode 100644 index 000000000..b72651500 --- /dev/null +++ b/python/ray/util/sgd/torch/examples/benchmarks/horovod-benchmark.yaml @@ -0,0 +1,85 @@ +# An unique identifier for the head node and workers of this cluster. +cluster_name: horovod-pytorch + +# The maximum number of workers nodes to launch in addition to the head +# node. This takes precedence over min_workers. min_workers default to 0. +min_workers: 1 +initial_workers: 1 +max_workers: 1 + +target_utilization_fraction: 0.9 + +# If a node is idle for this many minutes, it will be removed. +idle_timeout_minutes: 50 +# docker: +# image: tensorflow/tensorflow:1.5.0-py3 +# container_name: ray_docker + +# Cloud-provider specific configuration. +provider: + type: aws + region: us-east-1 + availability_zone: us-east-1c + +# How Ray will authenticate with newly launched nodes. +auth: + ssh_user: ubuntu + + +head_node: + InstanceType: p3dn.24xlarge + ImageId: ami-0698bcaf8bd9ef56d + InstanceMarketOptions: + MarketType: spot + BlockDeviceMappings: + - DeviceName: /dev/sda1 + Ebs: + VolumeSize: 250 + # SpotOptions: + # MaxPrice: "9.0" + + +worker_nodes: + InstanceType: p3dn.24xlarge + ImageId: ami-0698bcaf8bd9ef56d + InstanceMarketOptions: + MarketType: spot + BlockDeviceMappings: + - DeviceName: /dev/sda1 + Ebs: + VolumeSize: 250 + # SpotOptions: + # MaxPrice: "9.0" + # # Run workers on spot by default. Comment this out to use on-demand.
+ # InstanceMarketOptions: + # MarketType: spot + +setup_commands: + - pip install torch torchvision ipdb + - pip install ray[rllib] # enable autoscaling + - git clone https://github.com/horovod/horovod || true + - git clone https://github.com/NVIDIA/apex && cd apex && pip install -v --no-cache-dir ./ || true + - tmux new -d -s my-session "HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL pip install horovod" + + +file_mounts: {} + +# Custom commands that will be run on the head node after common setup. +head_setup_commands: + - cat ~/ray_bootstrap_key.pem > ~/.ssh/id_rsa + +# Custom commands that will be run on worker nodes after common setup. +worker_setup_commands: + - pip install horovod + +# # Command to start ray on the head node. You don't need to change this. +head_start_ray_commands: + - ray stop + - ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --object-store-memory=1000000000 + +# Command to start ray on worker nodes. You don't need to change this. 
+worker_start_ray_commands: + - ray stop + - ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 + # - nvidia-docker run -it --network=host -d --rm -p 4321:22 horovod:latest bash -c "pip install Pillow==6.1; sleep infinity" + diff --git a/python/ray/util/sgd/torch/examples/benchmarks/horovod_benchmark_apex.py b/python/ray/util/sgd/torch/examples/benchmarks/horovod_benchmark_apex.py new file mode 100644 index 000000000..251511da2 --- /dev/null +++ b/python/ray/util/sgd/torch/examples/benchmarks/horovod_benchmark_apex.py @@ -0,0 +1,144 @@ +from __future__ import print_function + +import argparse +import torch.backends.cudnn as cudnn +import torch.nn.functional as F +import torch.optim as optim +import torch.utils.data.distributed +from torchvision import models +import horovod.torch as hvd +import timeit +import numpy as np +# Apex +from apex import amp + +# Benchmark settings +parser = argparse.ArgumentParser( + description="PyTorch Synthetic Benchmark", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) +parser.add_argument( + "--fp16-allreduce", + action="store_true", + default=False, + help="use fp16 compression during allreduce") + +parser.add_argument( + "--model", type=str, default="resnet50", help="model to benchmark") +parser.add_argument( + "--batch-size", type=int, default=32, help="input batch size") + +parser.add_argument( + "--num-warmup-batches", + type=int, + default=10, + help="number of warm-up batches that don't count towards benchmark") +parser.add_argument( + "--num-batches-per-iter", + type=int, + default=10, + help="number of batches per benchmark iteration") +parser.add_argument( + "--num-iters", type=int, default=10, help="number of benchmark iterations") + +parser.add_argument( + "--no-cuda", + action="store_true", + default=False, + help="disables CUDA training") +parser.add_argument( + "--amp-fp16", + action="store_true", + default=False, + help="Enables FP16 training with Apex.") + +args = parser.parse_args()
+args.cuda = not args.no_cuda and torch.cuda.is_available() + +hvd.init() + +if args.cuda: + # Horovod: pin GPU to local rank. + torch.cuda.set_device(hvd.local_rank()) + +cudnn.benchmark = True + +# Set up standard model. +model = getattr(models, args.model)() + +if args.cuda: + # Move model to GPU. + model.cuda() + +optimizer = optim.SGD(model.parameters(), lr=0.01) + +# Horovod: (optional) compression algorithm. +compression = (hvd.Compression.fp16 + if args.fp16_allreduce else hvd.Compression.none) + +# Horovod: wrap optimizer with DistributedOptimizer. +optimizer = hvd.DistributedOptimizer( + optimizer, + named_parameters=model.named_parameters(), + compression=compression) + +# Horovod: broadcast parameters & optimizer state. +hvd.broadcast_parameters(model.state_dict(), root_rank=0) +hvd.broadcast_optimizer_state(optimizer, root_rank=0) + +# Apex +if args.amp_fp16: + model, optimizer = amp.initialize(model, optimizer, opt_level="O1") + +# Set up fixed fake data +data = torch.randn(args.batch_size, 3, 224, 224) +target = torch.LongTensor(args.batch_size).random_() % 1000 +if args.cuda: + data, target = data.cuda(), target.cuda() + + +def benchmark_step(): + optimizer.zero_grad() + output = model(data) + loss = F.cross_entropy(output, target) + # Apex + if args.amp_fp16: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + optimizer.synchronize() + with optimizer.skip_synchronize(): + optimizer.step() + else: + loss.backward() + optimizer.step() + + +def log(s, nl=True): + if hvd.rank() != 0: + return + print(s, end="\n" if nl else "") + + +log("Model: %s" % args.model) +log("Batch size: %d" % args.batch_size) +device = "GPU" if args.cuda else "CPU" +log("Number of %ss: %d" % (device, hvd.size())) + +# Warm-up +log("Running warmup...") +timeit.timeit(benchmark_step, number=args.num_warmup_batches) + +# Benchmark +log("Running benchmark...") +img_secs = [] +for x in range(args.num_iters): + time = timeit.timeit(benchmark_step, 
number=args.num_batches_per_iter) + img_sec = args.batch_size * args.num_batches_per_iter / time + log("Iter #%d: %.1f img/sec per %s" % (x, img_sec, device)) + img_secs.append(img_sec) + +# Results +img_sec_mean = np.mean(img_secs) +img_sec_conf = 1.96 * np.std(img_secs) +log("Img/sec per %s: %.1f +-%.1f" % (device, img_sec_mean, img_sec_conf)) +log("Total img/sec on %d %s(s): %.1f +-%.1f" % + (hvd.size(), device, hvd.size() * img_sec_mean, hvd.size() * img_sec_conf)) diff --git a/python/ray/util/sgd/torch/examples/benchmarks/raysgd_multigpu_benchmark.png b/python/ray/util/sgd/torch/examples/benchmarks/raysgd_multigpu_benchmark.png new file mode 100644 index 000000000..325891d95 Binary files /dev/null and b/python/ray/util/sgd/torch/examples/benchmarks/raysgd_multigpu_benchmark.png differ diff --git a/python/ray/util/sgd/torch/examples/benchmarks/raysgd_multinode_benchmark.png b/python/ray/util/sgd/torch/examples/benchmarks/raysgd_multinode_benchmark.png new file mode 100644 index 000000000..f25785225 Binary files /dev/null and b/python/ray/util/sgd/torch/examples/benchmarks/raysgd_multinode_benchmark.png differ diff --git a/python/ray/util/sgd/torch/examples/sgd-development.yaml b/python/ray/util/sgd/torch/examples/sgd-development.yaml new file mode 100644 index 000000000..e6697a272 --- /dev/null +++ b/python/ray/util/sgd/torch/examples/sgd-development.yaml @@ -0,0 +1,94 @@ +# An unique identifier for the head node and workers of this cluster. +cluster_name: sgd-pytorch + +# The maximum number of workers nodes to launch in addition to the head +# node. This takes precedence over min_workers. min_workers default to 0. +min_workers: 0 +initial_workers: 0 +max_workers: 0 + +target_utilization_fraction: 0.9 + +# If a node is idle for this many minutes, it will be removed. +idle_timeout_minutes: 10 +# docker: +# image: tensorflow/tensorflow:1.5.0-py3 +# container_name: ray_docker + +# Cloud-provider specific configuration. 
+provider: + type: aws + region: us-east-1 + availability_zone: us-east-1c + +# How Ray will authenticate with newly launched nodes. +auth: + ssh_user: ubuntu + # ssh_private_key: ... + +head_node: + InstanceType: p3dn.24xlarge + ImageId: ami-0698bcaf8bd9ef56d + # KeyName: ... + InstanceMarketOptions: + MarketType: spot + BlockDeviceMappings: + - DeviceName: /dev/sda1 + Ebs: + VolumeSize: 300 + # SpotOptions: + # MaxPrice: "9.0" + + +worker_nodes: + InstanceType: p3.16xlarge + ImageId: ami-0698bcaf8bd9ef56d + # KeyName: ... + InstanceMarketOptions: + MarketType: spot + BlockDeviceMappings: + - DeviceName: /dev/sda1 + Ebs: + VolumeSize: 300 + # SpotOptions: + # MaxPrice: "9.0" + # # Run workers on spot by default. Comment this out to use on-demand. + # InstanceMarketOptions: + # MarketType: spot + +setup_commands: + # This replaces the standard anaconda Ray installation + - ray || pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.9.0.dev0-cp36-cp36m-manylinux1_x86_64.whl + # Uncomment this and the filemount to update the Ray installation with your local Ray code + # - rm -rf ./anaconda3/lib/python3.6/site-packages/ray/util/sgd/ + # - cp -rf ~/sgd ./anaconda3/lib/python3.6/site-packages/ray/util/ + + # Installing this without -U to make sure we don't replace the existing Ray installation + - pip install ray[rllib] + - pip install -U ipdb torch torchvision + # Install Apex + - rm -rf apex || true + - git clone https://github.com/NVIDIA/apex && cd apex && pip install -v --no-cache-dir ./ || true + + +file_mounts: { + # This should point to ray/python/ray/util/sgd. + # ~/sgd: ../../../sgd, +} + +# Custom commands that will be run on the head node after common setup. +head_setup_commands: [] + +# Custom commands that will be run on worker nodes after common setup. +worker_setup_commands: [] + +# # Command to start ray on the head node. You don't need to change this. 
+head_start_ray_commands: + - ray stop + - ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml --object-store-memory=1000000000 + +# Command to start ray on worker nodes. You don't need to change this. +worker_start_ray_commands: + - ray stop + - ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 --object-store-memory=1000000000 + diff --git a/python/ray/util/sgd/torch/torch_runner.py b/python/ray/util/sgd/torch/torch_runner.py index bf812dfb7..022801e7d 100644 --- a/python/ray/util/sgd/torch/torch_runner.py +++ b/python/ray/util/sgd/torch/torch_runner.py @@ -114,7 +114,7 @@ class TorchRunner: else: self.criterion = self.loss_creator(self.config) - if torch.cuda.is_available() and hasattr("cuda", self.criterion): + if torch.cuda.is_available() and hasattr(self.criterion, "cuda"): self.criterion = self.criterion.cuda() def _create_schedulers_if_available(self): diff --git a/python/ray/util/sgd/torch/torch_trainer.py b/python/ray/util/sgd/torch/torch_trainer.py index 4a638b1f7..269cab493 100644 --- a/python/ray/util/sgd/torch/torch_trainer.py +++ b/python/ray/util/sgd/torch/torch_trainer.py @@ -525,7 +525,6 @@ class TorchTrainer: return else: delay = 2**i - logger.info("Resources: {}".format(resources)) logger.warning( "No new workers found. Retrying in %d sec." % delay) time.sleep(delay) @@ -562,7 +561,6 @@ class TorchTrainable(Trainable): validation_stats = self._trainer.validate() train_stats.update(validation_stats) - # output {"mean_loss": test_loss, "mean_accuracy": accuracy} return train_stats def _save(self, checkpoint_dir):