[tune] Allow resources to not all be assigned to the driver (#1150)

* dgpu

* update

* update

* update

* also support cmdline

* limit

* Update README.rst

* documentation

* typo

* small coverage for driver_gpu_limit

* lint

* fix lint
This commit is contained in:
Eric Liang
2017-10-28 22:16:05 -07:00
committed by Richard Liaw
parent f59867850e
commit 3b157ab933
10 changed files with 44 additions and 6 deletions
@@ -7,6 +7,7 @@ cartpole-ppo:
time_total_s: 180
resources:
cpu: 2
driver_cpu_limit: 1
config:
num_workers: 2
num_sgd_iter:
@@ -5,4 +5,6 @@ hopper-ppo:
resources:
cpu: 64
gpu: 4
driver_cpu_limit: 4
driver_gpu_limit: 4
config: {"gamma": 0.995, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": .0001, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 160000, "num_workers": 64}
@@ -3,6 +3,7 @@ humanoid-es:
alg: ES
resources:
cpu: 100
driver_cpu_limit: 4
stop:
episode_reward_mean: 6000
config:
@@ -7,5 +7,6 @@ humanoid-ppo-gae:
resources:
cpu: 64
gpu: 4
driver_cpu_limit: 4
config: {"lambda": 0.95, "clip_param": 0.2, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": .0001, "sgd_batchsize": 32768, "horizon": 5000, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64, "model": {"free_log_std": true}, "write_logs": false}
@@ -7,4 +7,5 @@ humanoid-ppo:
resources:
cpu: 64
gpu: 4
driver_cpu_limit: 4
config: {"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": .0001, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64, "model": {"free_log_std": true}, "use_gae": false}
@@ -3,6 +3,7 @@ pong-a3c:
alg: A3C
resources:
cpu: 16
driver_cpu_limit: 1
config:
num_workers: 16
num_batches_per_iteration: 1000
@@ -5,4 +5,5 @@ walker2d-v1-ppo:
resources:
cpu: 64
gpu: 4
driver_cpu_limit: 4
config: {"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": .0001, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64}
+2 -1
View File
@@ -130,7 +130,8 @@ expression.
episode_reward_mean: 200
time_total_s: 180
resources:
cpu: 4
cpu: 5
driver_cpu_limit: 1 # of the 5 CPUs, only 1 is used by the driver
config:
num_workers: 4
num_sgd_iter:
+3 -1
View File
@@ -14,7 +14,9 @@ from ray.tune.trial import Trial, Resources
def _resource_json(data):
    """Parse a JSON resource specification into a ``Resources`` tuple.

    Args:
        data (str): JSON text encoding an object, for example
            '{"cpu": 5, "driver_cpu_limit": 1}'.

    Returns:
        Resources: Built from the "cpu" and "gpu" keys (0 when a key is
            missing) plus the optional "driver_cpu_limit" and
            "driver_gpu_limit" keys.
    """
    values = json.loads(data)
    # The driver limits are looked up without a default, so a missing key
    # is passed on as None and Resources applies its own default.
    return Resources(
        values.get('cpu', 0), values.get('gpu', 0),
        values.get('driver_cpu_limit'), values.get('driver_gpu_limit'))
def make_parser(description):
+31 -4
View File
@@ -10,8 +10,32 @@ from collections import namedtuple
from ray.rllib.agent import get_agent_class
# Ray resources required to schedule a Trial
Resources = namedtuple("Resources", ["cpu", "gpu"])
class Resources(
        namedtuple("Resources", [
            "cpu", "gpu", "driver_cpu_limit", "driver_gpu_limit"])):
    """Ray resources required to schedule a trial.

    Attributes:
        cpu (int): Number of CPUs required for the trial total.
        gpu (int): Number of GPUs required for the trial total.
        driver_cpu_limit (int): Max CPUs allocated to the driver.
            Defaults to all of the required CPUs.
        driver_gpu_limit (int): Max GPUs allocated to the driver.
            Defaults to all of the required GPUs.

    Raises:
        AssertionError: If an explicitly given driver limit is greater
            than the corresponding trial total.
    """

    # No per-instance __dict__; instances stay plain tuples.
    __slots__ = ()

    def __new__(cls, cpu, gpu, driver_cpu_limit=None, driver_gpu_limit=None):
        # The checks raise explicitly instead of using `assert` so they are
        # still enforced when Python runs with -O. AssertionError is kept
        # so the exception type seen by existing callers does not change.
        if driver_cpu_limit is None:
            driver_cpu_limit = cpu
        elif not (driver_cpu_limit <= cpu):
            raise AssertionError(
                "driver_cpu_limit ({}) must not exceed cpu ({})".format(
                    driver_cpu_limit, cpu))

        if driver_gpu_limit is None:
            driver_gpu_limit = gpu
        elif not (driver_gpu_limit <= gpu):
            raise AssertionError(
                "driver_gpu_limit ({}) must not exceed gpu ({})".format(
                    driver_gpu_limit, gpu))

        return super(Resources, cls).__new__(
            cls, cpu, gpu, driver_cpu_limit, driver_gpu_limit)
class Trial(object):
@@ -22,6 +46,9 @@ class Trial(object):
Trials start in the PENDING state, and transition to RUNNING once started.
On error it transitions to ERROR, otherwise TERMINATED on success.
The driver for the trial will be allocated at most `driver_cpu_limit` CPUs
and at most `driver_gpu_limit` GPUs.
"""
PENDING = "PENDING"
@@ -206,8 +233,8 @@ class Trial(object):
self.status = Trial.RUNNING
agent_cls = get_agent_class(self.alg)
cls = ray.remote(
num_cpus=self.resources.cpu, num_gpus=self.resources.gpu)(
agent_cls)
num_cpus=self.resources.driver_cpu_limit,
num_gpus=self.resources.driver_gpu_limit)(agent_cls)
self.agent = cls.remote(
self.env_creator, self.config, self.local_dir, self.upload_dir,
experiment_tag=self.experiment_tag)