mirror of
https://github.com/wassname/ray.git
synced 2026-06-29 05:34:49 +08:00
[tune] horovod trainable (#10304)
This commit is contained in:
@@ -327,6 +327,14 @@ py_test(
|
||||
deps = [":tune_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_horovod",
|
||||
size = "medium",
|
||||
srcs = ["tests/test_horovod.py"],
|
||||
tags = ["exclusive", "example", "py37"],
|
||||
deps = [":tune_lib"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "ddp_mnist_torch",
|
||||
size = "small",
|
||||
@@ -364,6 +372,15 @@ py_test(
|
||||
args = ["--smoke-test"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "horovod_simple",
|
||||
size = "medium",
|
||||
srcs = ["examples/horovod_simple.py"],
|
||||
tags = ["exclusive", "example", "py37"],
|
||||
deps = [":tune_lib"],
|
||||
args = ["--smoke-test"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "hyperband_example",
|
||||
size = "medium",
|
||||
|
||||
@@ -0,0 +1,115 @@
|
||||
import torch
|
||||
import numpy as np
|
||||
from ray import tune
|
||||
from ray.tune.integration.horovod import DistributedTrainableCreator
|
||||
import time
|
||||
|
||||
|
||||
def sq(x):
|
||||
m2 = 1.
|
||||
m1 = -20.
|
||||
m0 = 50.
|
||||
return m2 * x * x + m1 * x + m0
|
||||
|
||||
|
||||
def qu(x):
|
||||
m3 = 10.
|
||||
m2 = 5.
|
||||
m1 = -20.
|
||||
m0 = -5.
|
||||
return m3 * x * x * x + m2 * x * x + m1 * x + m0
|
||||
|
||||
|
||||
class Net(torch.nn.Module):
|
||||
def __init__(self, mode="sq"):
|
||||
super(Net, self).__init__()
|
||||
|
||||
if mode == "square":
|
||||
self.mode = 0
|
||||
self.param = torch.nn.Parameter(torch.FloatTensor([1., -1.]))
|
||||
else:
|
||||
self.mode = 1
|
||||
self.param = torch.nn.Parameter(torch.FloatTensor([1., -1., 1.]))
|
||||
|
||||
def forward(self, x):
|
||||
if ~self.mode:
|
||||
return x * x + self.param[0] * x + self.param[1]
|
||||
else:
|
||||
return_val = 10 * x * x * x
|
||||
return_val += self.param[0] * x * x
|
||||
return_val += self.param[1] * x + self.param[2]
|
||||
return return_val
|
||||
|
||||
|
||||
def train(config):
|
||||
import torch
|
||||
import horovod.torch as hvd
|
||||
hvd.init()
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
net = Net(args.mode).to(device)
|
||||
optimizer = torch.optim.SGD(
|
||||
net.parameters(),
|
||||
lr=config["lr"],
|
||||
)
|
||||
optimizer = hvd.DistributedOptimizer(optimizer)
|
||||
|
||||
num_steps = 5
|
||||
print(hvd.size())
|
||||
np.random.seed(1 + hvd.rank())
|
||||
torch.manual_seed(1234)
|
||||
# To ensure consistent initialization across slots,
|
||||
hvd.broadcast_parameters(net.state_dict(), root_rank=0)
|
||||
hvd.broadcast_optimizer_state(optimizer, root_rank=0)
|
||||
|
||||
start = time.time()
|
||||
for step in range(1, num_steps + 1):
|
||||
features = torch.Tensor(
|
||||
np.random.rand(1) * 2 * args.x_max - args.x_max).to(device)
|
||||
if args.mode == "square":
|
||||
labels = sq(features)
|
||||
else:
|
||||
labels = qu(features)
|
||||
optimizer.zero_grad()
|
||||
outputs = net(features)
|
||||
loss = torch.nn.MSELoss()(outputs, labels)
|
||||
loss.backward()
|
||||
|
||||
optimizer.step()
|
||||
time.sleep(0.1)
|
||||
tune.report(loss=loss.item())
|
||||
total = time.time() - start
|
||||
print(f"Took {total:0.3f} s. Avg: {total / num_steps:0.3f} s.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--mode", type=str, default="square", choices=["square", "cubic"])
|
||||
parser.add_argument(
|
||||
"--learning_rate", type=float, default=0.1, dest="learning_rate")
|
||||
parser.add_argument("--x_max", type=float, default=1., dest="x_max")
|
||||
parser.add_argument("--gpu", action="store_true")
|
||||
parser.add_argument(
|
||||
"--smoke-test",
|
||||
action="store_true",
|
||||
help=("Finish quickly for testing."))
|
||||
parser.add_argument("--hosts-per-trial", type=int, default=1)
|
||||
parser.add_argument("--slots-per-host", type=int, default=2)
|
||||
args = parser.parse_args()
|
||||
|
||||
# import ray
|
||||
# ray.init(address="auto") # assumes ray is started with ray up
|
||||
|
||||
horovod_trainable = DistributedTrainableCreator(
|
||||
train,
|
||||
use_gpu=args.gpu,
|
||||
num_hosts=args.hosts_per_trial,
|
||||
num_slots=args.slots_per_host,
|
||||
replicate_pem=False)
|
||||
analysis = tune.run(
|
||||
horovod_trainable,
|
||||
config={"lr": tune.uniform(0.1, 1)},
|
||||
num_samples=2 if args.smoke_test else 10,
|
||||
fail_fast=True)
|
||||
config = analysis.get_best_config(metric="loss", mode="min")
|
||||
@@ -0,0 +1,226 @@
|
||||
import os
|
||||
import logging
|
||||
from filelock import FileLock
|
||||
|
||||
import ray
|
||||
from ray import tune
|
||||
from ray.tune.resources import Resources
|
||||
from ray.tune.trainable import TrainableUtil
|
||||
from ray.tune.result import RESULT_DUPLICATE
|
||||
from ray.tune.logger import NoopLogger
|
||||
|
||||
from ray.tune.function_runner import wrap_function
|
||||
from horovod.ray import RayExecutor
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def get_rank():
|
||||
return os.environ["HOROVOD_RANK"]
|
||||
|
||||
|
||||
def logger_creator(log_config, logdir):
|
||||
"""Simple NOOP logger for worker trainables."""
|
||||
index = get_rank()
|
||||
worker_dir = os.path.join(logdir, "worker_{}".format(index))
|
||||
os.makedirs(worker_dir, exist_ok=True)
|
||||
return NoopLogger(log_config, worker_dir)
|
||||
|
||||
|
||||
class _HorovodTrainable(tune.Trainable):
|
||||
"""Abstract Trainable class for Horovod."""
|
||||
# Callable function for training.
|
||||
_function = None
|
||||
# Number of hosts (nodes) to allocate per trial
|
||||
_num_hosts: int = 1
|
||||
# Number of workers (slots) to place on each host.
|
||||
_num_slots: int = 1
|
||||
# Number of CPU resources to reserve for each worker.
|
||||
_num_cpus_per_slot: int = 1
|
||||
# Whether to reserve and pass GPU resources through.
|
||||
_use_gpu: bool = False
|
||||
# bool: Whether a the function has completed training
|
||||
_finished: bool = False
|
||||
|
||||
# Horovod settings
|
||||
_ssh_str: str = None
|
||||
_ssh_identity_file: str = None
|
||||
_timeout_s: int = 30
|
||||
|
||||
@property
|
||||
def num_workers(self):
|
||||
return self._num_hosts * self._num_slots
|
||||
|
||||
def setup(self, config):
|
||||
trainable = wrap_function(self.__class__._function)
|
||||
# We use a filelock here to ensure that the file-writing
|
||||
# process is safe across different trainables.
|
||||
if self._ssh_identity_file:
|
||||
with FileLock(self._ssh_identity_file + ".lock"):
|
||||
settings = RayExecutor.create_settings(
|
||||
self._timeout_s, self._ssh_identity_file, self._ssh_str)
|
||||
else:
|
||||
settings = RayExecutor.create_settings(
|
||||
self._timeout_s, self._ssh_identity_file, self._ssh_str)
|
||||
|
||||
self.executor = RayExecutor(
|
||||
settings,
|
||||
cpus_per_slot=self._num_cpus_per_slot,
|
||||
use_gpu=self._use_gpu,
|
||||
num_hosts=self._num_hosts,
|
||||
num_slots=self._num_slots)
|
||||
|
||||
# We can't put `self` in the lambda closure, so we
|
||||
# resolve the variable ahead of time.
|
||||
logdir_ = str(self.logdir)
|
||||
|
||||
# Starts the workers as specified by the resources above.
|
||||
self.executor.start(
|
||||
executable_cls=trainable,
|
||||
executable_kwargs={
|
||||
"config": config,
|
||||
"logger_creator": lambda cfg: logger_creator(cfg, logdir_)
|
||||
})
|
||||
|
||||
def step(self):
|
||||
if self._finished:
|
||||
raise RuntimeError("Training has already finished.")
|
||||
result = self.executor.execute(lambda w: w.step())[0]
|
||||
if RESULT_DUPLICATE in result:
|
||||
self._finished = True
|
||||
return result
|
||||
|
||||
def save_checkpoint(self, checkpoint_dir):
|
||||
# TODO: optimize if colocated
|
||||
save_obj = self.executor.execute_single(lambda w: w.save_to_object())
|
||||
checkpoint_path = TrainableUtil.create_from_pickle(
|
||||
save_obj, checkpoint_dir)
|
||||
return checkpoint_path
|
||||
|
||||
def load_checkpoint(self, checkpoint_dir):
|
||||
checkpoint_obj = TrainableUtil.checkpoint_to_object(checkpoint_dir)
|
||||
x_id = ray.put(checkpoint_obj)
|
||||
return self.executor.execute(lambda w: w.restore_from_object(x_id))
|
||||
|
||||
def stop(self):
|
||||
self.executor.execute(lambda w: w.stop())
|
||||
self.executor.shutdown()
|
||||
|
||||
|
||||
def DistributedTrainableCreator(func,
|
||||
use_gpu=False,
|
||||
num_hosts=1,
|
||||
num_slots=1,
|
||||
num_cpus_per_slot=1,
|
||||
timeout_s=30,
|
||||
replicate_pem=False):
|
||||
"""Converts Horovod functions to be executable by Tune.
|
||||
|
||||
Requires horovod > 0.19 to work.
|
||||
|
||||
This function wraps and sets the resources for a given Horovod
|
||||
function to be used with Tune. It generates a Horovod Trainable (trial)
|
||||
which can itself be a distributed training job. One basic assumption of
|
||||
this implementation is that all sub-workers
|
||||
of a trial will be placed evenly across different machines.
|
||||
|
||||
It is recommended that if `num_hosts` per trial > 1, you set
|
||||
num_slots == the size (or number of GPUs) of a single host.
|
||||
If num_hosts == 1, then you can set num_slots to be <=
|
||||
the size (number of GPUs) of a single host.
|
||||
|
||||
This above assumption can be relaxed - please file a feature request
|
||||
on Github to inform the maintainers.
|
||||
|
||||
Another assumption is that this API requires gloo as the underlying
|
||||
communication primitive. You will need to install Horovod with
|
||||
`HOROVOD_WITH_GLOO` enabled.
|
||||
|
||||
*Fault Tolerance:* The trial workers themselves are not fault tolerant.
|
||||
When a host of a trial fails, all workers of a trial are expected to
|
||||
die, and the trial is expected to restart. This currently does not
|
||||
support function checkpointing.
|
||||
|
||||
Args:
|
||||
func (Callable[[dict], None]): A training function that takes in
|
||||
a config dict for hyperparameters and should initialize
|
||||
horovod via horovod.init.
|
||||
use_gpu (bool); Whether to allocate a GPU per worker.
|
||||
num_cpus_per_slot (int): Number of CPUs to request
|
||||
from Ray per worker.
|
||||
num_hosts (int): Number of hosts that each trial is expected
|
||||
to use.
|
||||
num_slots (int): Number of slots (workers) to start on each host.
|
||||
timeout_s (int): Seconds for Horovod rendezvous to timeout.
|
||||
replicate_pem (bool): THIS MAY BE INSECURE. If true, this will
|
||||
replicate the underlying Ray cluster ssh key across all hosts.
|
||||
This may be useful if using the Ray Autoscaler.
|
||||
|
||||
|
||||
Returns:
|
||||
Trainable class that can be passed into `tune.run`.
|
||||
|
||||
Example:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
def train(config):
|
||||
horovod.init()
|
||||
horovod.allreduce()
|
||||
|
||||
from ray.tune.integration.horovod import DistributedTrainableCreator
|
||||
trainable_cls = DistributedTrainableCreator(
|
||||
train, num_hosts=1, num_slots=2, use_gpu=True)
|
||||
|
||||
tune.run(trainable_cls)
|
||||
|
||||
.. versionadded:: 1.0.0
|
||||
"""
|
||||
ssh_identity_file = None
|
||||
sshkeystr = None
|
||||
|
||||
if replicate_pem:
|
||||
from ray.tune.cluster_info import get_ssh_key
|
||||
ssh_identity_file = get_ssh_key()
|
||||
if os.path.exists(ssh_identity_file):
|
||||
# For now, we assume that you're on a Ray cluster.
|
||||
with open(ssh_identity_file) as f:
|
||||
sshkeystr = f.read()
|
||||
|
||||
class WrappedHorovodTrainable(_HorovodTrainable):
|
||||
_function = func
|
||||
_num_hosts = num_hosts
|
||||
_num_slots = num_slots
|
||||
_num_cpus_per_slot = num_cpus_per_slot
|
||||
_use_gpu = use_gpu
|
||||
_ssh_identity_file = ssh_identity_file
|
||||
_ssh_str = sshkeystr
|
||||
_timeout_s = timeout_s
|
||||
|
||||
@classmethod
|
||||
def default_resource_request(cls, config):
|
||||
extra_gpu = int(num_hosts * num_slots) * int(use_gpu)
|
||||
extra_cpu = int(num_hosts * num_slots * num_cpus_per_slot)
|
||||
|
||||
return Resources(
|
||||
cpu=0,
|
||||
gpu=0,
|
||||
extra_cpu=extra_cpu,
|
||||
extra_gpu=extra_gpu,
|
||||
)
|
||||
|
||||
return WrappedHorovodTrainable
|
||||
|
||||
|
||||
# pytest presents a bunch of serialization problems
|
||||
# that force us to include mocks as part of the module.
|
||||
|
||||
|
||||
def _train_simple(config):
|
||||
import horovod.torch as hvd
|
||||
hvd.init()
|
||||
from ray import tune
|
||||
for i in range(config.get("epochs", 2)):
|
||||
import time
|
||||
time.sleep(1)
|
||||
tune.report(test=1, rank=hvd.rank())
|
||||
@@ -235,8 +235,8 @@ class TuneReportCheckpointCallback(TuneCallback):
|
||||
.. code-block:: python
|
||||
|
||||
import pytorch_lightning as pl
|
||||
from ray.tune.integration.pytorch_lightning import \
|
||||
TuneReportCheckpointCallback
|
||||
from ray.tune.integration.pytorch_lightning import (
|
||||
TuneReportCheckpointCallback)
|
||||
|
||||
# Save checkpoint after each training batch and after each
|
||||
# validation epoch.
|
||||
|
||||
@@ -0,0 +1,97 @@
|
||||
import pytest
|
||||
|
||||
import ray
|
||||
from ray import tune
|
||||
|
||||
pytest.importorskip("horovod")
|
||||
|
||||
try:
|
||||
from ray.tune.integration.horovod import (DistributedTrainableCreator,
|
||||
_train_simple)
|
||||
except ImportError:
|
||||
pass # This shouldn't be reached - the test should be skipped.
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def ray_start_2_cpus():
|
||||
address_info = ray.init(num_cpus=2)
|
||||
yield address_info
|
||||
# The code after the yield will run as teardown code.
|
||||
ray.shutdown()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def ray_start_4_cpus():
|
||||
address_info = ray.init(num_cpus=4)
|
||||
yield address_info
|
||||
# The code after the yield will run as teardown code.
|
||||
ray.shutdown()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def ray_connect_cluster():
|
||||
try:
|
||||
address_info = ray.init(address="auto")
|
||||
except Exception as e:
|
||||
pytest.skip(str(e))
|
||||
yield address_info
|
||||
# The code after the yield will run as teardown code.
|
||||
ray.shutdown()
|
||||
|
||||
|
||||
def test_single_step(ray_start_2_cpus):
|
||||
trainable_cls = DistributedTrainableCreator(
|
||||
_train_simple, num_hosts=1, num_slots=2)
|
||||
trainer = trainable_cls()
|
||||
trainer.train()
|
||||
trainer.stop()
|
||||
|
||||
|
||||
def test_step_after_completion(ray_start_2_cpus):
|
||||
trainable_cls = DistributedTrainableCreator(
|
||||
_train_simple, num_hosts=1, num_slots=2)
|
||||
trainer = trainable_cls(config={"epochs": 1})
|
||||
with pytest.raises(RuntimeError):
|
||||
for i in range(10):
|
||||
trainer.train()
|
||||
|
||||
|
||||
def test_validation(ray_start_2_cpus):
|
||||
def bad_func(a, b, c):
|
||||
return 1
|
||||
|
||||
t_cls = DistributedTrainableCreator(bad_func, num_slots=2)
|
||||
with pytest.raises(ValueError):
|
||||
t_cls()
|
||||
|
||||
|
||||
def test_set_global(ray_start_2_cpus):
|
||||
trainable_cls = DistributedTrainableCreator(_train_simple, num_slots=2)
|
||||
trainable = trainable_cls()
|
||||
result = trainable.train()
|
||||
trainable.stop()
|
||||
assert result["rank"] == 0
|
||||
|
||||
|
||||
def test_simple_tune(ray_start_4_cpus):
|
||||
trainable_cls = DistributedTrainableCreator(_train_simple, num_slots=2)
|
||||
analysis = tune.run(
|
||||
trainable_cls, num_samples=2, stop={"training_iteration": 2})
|
||||
assert analysis.trials[0].last_result["training_iteration"] == 2
|
||||
|
||||
|
||||
@pytest.mark.parametrize("use_gpu", [True, False])
|
||||
def test_resource_tune(ray_connect_cluster, use_gpu):
|
||||
if use_gpu and ray.cluster_resources().get("GPU", 0) == 0:
|
||||
pytest.skip("No GPU available.")
|
||||
trainable_cls = DistributedTrainableCreator(
|
||||
_train_simple, num_slots=2, use_gpu=use_gpu)
|
||||
analysis = tune.run(
|
||||
trainable_cls, num_samples=2, stop={"training_iteration": 2})
|
||||
assert analysis.trials[0].last_result["training_iteration"] == 2
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
Reference in New Issue
Block a user