mirror of
https://github.com/wassname/ray.git
synced 2026-06-30 01:40:21 +08:00
[tune] Allow resources to not all be assigned to the driver (#1150)
* dgpu * update * update * update * also support cmdline * limit * Update README.rst * documentation * typo * small coverage for driver_gpu_limit * lint * fix lint
This commit is contained in:
@@ -7,6 +7,7 @@ cartpole-ppo:
|
||||
time_total_s: 180
|
||||
resources:
|
||||
cpu: 2
|
||||
driver_cpu_limit: 1
|
||||
config:
|
||||
num_workers: 2
|
||||
num_sgd_iter:
|
||||
|
||||
@@ -5,4 +5,6 @@ hopper-ppo:
|
||||
resources:
|
||||
cpu: 64
|
||||
gpu: 4
|
||||
driver_cpu_limit: 4
|
||||
driver_gpu_limit: 4
|
||||
config: {"gamma": 0.995, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": .0001, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 160000, "num_workers": 64}
|
||||
|
||||
@@ -3,6 +3,7 @@ humanoid-es:
|
||||
alg: ES
|
||||
resources:
|
||||
cpu: 100
|
||||
driver_cpu_limit: 4
|
||||
stop:
|
||||
episode_reward_mean: 6000
|
||||
config:
|
||||
|
||||
@@ -7,5 +7,6 @@ humanoid-ppo-gae:
|
||||
resources:
|
||||
cpu: 64
|
||||
gpu: 4
|
||||
driver_cpu_limit: 4
|
||||
config: {"lambda": 0.95, "clip_param": 0.2, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": .0001, "sgd_batchsize": 32768, "horizon": 5000, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64, "model": {"free_log_std": true}, "write_logs": false}
|
||||
|
||||
|
||||
@@ -7,4 +7,5 @@ humanoid-ppo:
|
||||
resources:
|
||||
cpu: 64
|
||||
gpu: 4
|
||||
driver_cpu_limit: 4
|
||||
config: {"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": .0001, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64, "model": {"free_log_std": true}, "use_gae": false}
|
||||
|
||||
@@ -3,6 +3,7 @@ pong-a3c:
|
||||
alg: A3C
|
||||
resources:
|
||||
cpu: 16
|
||||
driver_cpu_limit: 1
|
||||
config:
|
||||
num_workers: 16
|
||||
num_batches_per_iteration: 1000
|
||||
|
||||
@@ -5,4 +5,5 @@ walker2d-v1-ppo:
|
||||
resources:
|
||||
cpu: 64
|
||||
gpu: 4
|
||||
driver_cpu_limit: 4
|
||||
config: {"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": .0001, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64}
|
||||
|
||||
@@ -130,7 +130,8 @@ expression.
|
||||
episode_reward_mean: 200
|
||||
time_total_s: 180
|
||||
resources:
|
||||
cpu: 4
|
||||
cpu: 5
|
||||
driver_cpu_limit: 1 # of the 5 CPUs, only 1 is used by the driver
|
||||
config:
|
||||
num_workers: 4
|
||||
num_sgd_iter:
|
||||
|
||||
@@ -14,7 +14,9 @@ from ray.tune.trial import Trial, Resources
|
||||
|
||||
def _resource_json(data):
    """Parse a JSON object string into a `Resources` tuple.

    Args:
        data (str): JSON-encoded object. Recognized keys are `cpu`, `gpu`,
            `driver_cpu_limit` and `driver_gpu_limit`.

    Returns:
        Resources: built from the parsed values. A missing `cpu` or `gpu`
            is treated as 0; a missing driver limit is passed as None.
    """
    values = json.loads(data)
    # Forward the driver limits as well; returning only (cpu, gpu) here would
    # silently discard any limits supplied by the caller.
    return Resources(
        values.get('cpu', 0), values.get('gpu', 0),
        values.get('driver_cpu_limit'), values.get('driver_gpu_limit'))
|
||||
|
||||
|
||||
def make_parser(description):
|
||||
|
||||
@@ -10,8 +10,32 @@ from collections import namedtuple
|
||||
from ray.rllib.agent import get_agent_class
|
||||
|
||||
|
||||
# Ray resources required to schedule a Trial
|
||||
Resources = namedtuple("Resources", ["cpu", "gpu"])
|
||||
class Resources(
        namedtuple("Resources", [
            "cpu", "gpu", "driver_cpu_limit", "driver_gpu_limit"])):
    """Ray resources required to schedule a trial.

    Attributes:
        cpu (int): Number of CPUs required for the trial total.
        gpu (int): Number of GPUs required for the trial total.
        driver_cpu_limit (int): Max CPUs allocated to the driver.
            Defaults to all of the required CPUs.
        driver_gpu_limit (int): Max GPUs allocated to the driver.
            Defaults to all of the required GPUs.

    Raises:
        ValueError: If a driver limit is given and is not less than or equal
            to the corresponding trial total.
    """
    __slots__ = ()

    def __new__(cls, cpu, gpu, driver_cpu_limit=None, driver_gpu_limit=None):
        # Validate with explicit raises rather than `assert`, so the checks
        # still run when Python is started with -O.
        if driver_cpu_limit is None:
            driver_cpu_limit = cpu
        elif not driver_cpu_limit <= cpu:
            raise ValueError(
                "driver_cpu_limit ({}) must not exceed cpu ({})".format(
                    driver_cpu_limit, cpu))

        if driver_gpu_limit is None:
            driver_gpu_limit = gpu
        elif not driver_gpu_limit <= gpu:
            raise ValueError(
                "driver_gpu_limit ({}) must not exceed gpu ({})".format(
                    driver_gpu_limit, gpu))

        return super(Resources, cls).__new__(
            cls, cpu, gpu, driver_cpu_limit, driver_gpu_limit)
|
||||
|
||||
|
||||
class Trial(object):
|
||||
@@ -22,6 +46,9 @@ class Trial(object):
|
||||
|
||||
Trials start in the PENDING state, and transition to RUNNING once started.
|
||||
On error it transitions to ERROR, otherwise TERMINATED on success.
|
||||
|
||||
The driver for the trial will be allocated at most `driver_cpu_limit` and
|
||||
`driver_gpu_limit` CPUs and GPUs.
|
||||
"""
|
||||
|
||||
PENDING = "PENDING"
|
||||
@@ -206,8 +233,8 @@ class Trial(object):
|
||||
self.status = Trial.RUNNING
|
||||
agent_cls = get_agent_class(self.alg)
|
||||
cls = ray.remote(
|
||||
num_cpus=self.resources.cpu, num_gpus=self.resources.gpu)(
|
||||
agent_cls)
|
||||
num_cpus=self.resources.driver_cpu_limit,
|
||||
num_gpus=self.resources.driver_gpu_limit)(agent_cls)
|
||||
self.agent = cls.remote(
|
||||
self.env_creator, self.config, self.local_dir, self.upload_dir,
|
||||
experiment_tag=self.experiment_tag)
|
||||
|
||||
Reference in New Issue
Block a user