[tune] Allow resources to not all be assigned to the driver (#1150)

* dgpu * update * update * update * also support cmdline * limit * Update README.rst * documentation * typo * small coverage for driver_gpu_limit * lint * fix lint
2026-06-30 01:40:21 +08:00 · 2017-10-28 22:16:05 -07:00
parent f59867850e
commit 3b157ab933
10 changed files with 44 additions and 6 deletions
@@ -7,6 +7,7 @@ cartpole-ppo:
        time_total_s: 180
    resources:
        cpu: 2
+        driver_cpu_limit: 1
    config:
        num_workers: 2
        num_sgd_iter:
@@ -5,4 +5,6 @@ hopper-ppo:
    resources:
       cpu: 64
       gpu: 4
+       driver_cpu_limit: 4
+       driver_gpu_limit: 4
    config: {"gamma": 0.995, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": .0001, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 160000, "num_workers": 64}
@@ -3,6 +3,7 @@ humanoid-es:
    alg: ES
    resources:
       cpu: 100
+       driver_cpu_limit: 4
    stop:
        episode_reward_mean: 6000
    config:
@@ -7,5 +7,6 @@ humanoid-ppo-gae:
    resources:
       cpu: 64
       gpu: 4
+       driver_cpu_limit: 4
    config: {"lambda": 0.95, "clip_param": 0.2, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": .0001, "sgd_batchsize": 32768, "horizon": 5000, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64, "model": {"free_log_std": true}, "write_logs": false}

@@ -7,4 +7,5 @@ humanoid-ppo:
    resources:
       cpu: 64
       gpu: 4
+       driver_cpu_limit: 4
    config: {"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": .0001, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64, "model": {"free_log_std": true}, "use_gae": false}
@@ -3,6 +3,7 @@ pong-a3c:
    alg: A3C
    resources:
       cpu: 16
+       driver_cpu_limit: 1
    config:
        num_workers: 16
        num_batches_per_iteration: 1000
@@ -5,4 +5,5 @@ walker2d-v1-ppo:
    resources:
       cpu: 64
       gpu: 4
+       driver_cpu_limit: 4
    config: {"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": .0001, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64}
@@ -130,7 +130,8 @@ expression.
            episode_reward_mean: 200
            time_total_s: 180
        resources:
-            cpu: 4
+            cpu: 5
+            driver_cpu_limit: 1  # of the 5 CPUs, only 1 is used by the driver
        config:
            num_workers: 4
            num_sgd_iter:
@@ -14,7 +14,9 @@ from ray.tune.trial import Trial, Resources

 def _resource_json(data):
    values = json.loads(data)
-    return Resources(values.get('cpu', 0), values.get('gpu', 0))
+    return Resources(
+        values.get('cpu', 0), values.get('gpu', 0),
+        values.get('driver_cpu_limit'), values.get('driver_gpu_limit'))


 def make_parser(description):
@@ -10,8 +10,32 @@ from collections import namedtuple
 from ray.rllib.agent import get_agent_class


-# Ray resources required to schedule a Trial
-Resources = namedtuple("Resources", ["cpu", "gpu"])
+class Resources(
+        namedtuple("Resources", [
+            "cpu", "gpu", "driver_cpu_limit", "driver_gpu_limit"])):
+    """Ray resources required to schedule a trial.
+
+    Attributes:
+        cpu (int): Number of CPUs required for the trial total.
+        gpu (int): Number of GPUs required for the trial total.
+        driver_cpu_limit (int): Max CPUs allocated to the driver.
+            Defaults to all of the required CPUs.
+        driver_gpu_limit (int): Max GPUs allocated to the driver.
+            Defaults to all of the required GPUs.
+    """
+    __slots__ = ()
+
+    def __new__(cls, cpu, gpu, driver_cpu_limit=None, driver_gpu_limit=None):
+        if driver_cpu_limit is not None:
+            assert driver_cpu_limit <= cpu
+        else:
+            driver_cpu_limit = cpu
+        if driver_gpu_limit is not None:
+            assert driver_gpu_limit <= gpu
+        else:
+            driver_gpu_limit = gpu
+        return super(Resources, cls).__new__(
+            cls, cpu, gpu, driver_cpu_limit, driver_gpu_limit)


 class Trial(object):
@@ -22,6 +46,9 @@ class Trial(object):

    Trials start in the PENDING state, and transition to RUNNING once started.
    On error it transitions to ERROR, otherwise TERMINATED on success.
+
+    The driver for the trial will be allocated at most `driver_cpu_limit`
+    CPUs and `driver_gpu_limit` GPUs.
    """

    PENDING = "PENDING"
@@ -206,8 +233,8 @@ class Trial(object):
        self.status = Trial.RUNNING
        agent_cls = get_agent_class(self.alg)
        cls = ray.remote(
-            num_cpus=self.resources.cpu, num_gpus=self.resources.gpu)(
-                agent_cls)
+            num_cpus=self.resources.driver_cpu_limit,
+            num_gpus=self.resources.driver_gpu_limit)(agent_cls)
        self.agent = cls.remote(
            self.env_creator, self.config, self.local_dir, self.upload_dir,
            experiment_tag=self.experiment_tag)