From b41bdcefa0351a4bf77f0c46917663b5014a364b Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Tue, 20 Mar 2018 19:29:52 -0700 Subject: [PATCH] [rllib] Update RLlib to work with new actor scheduling behavior (#1754) * Mon Mar 19 21:23:01 PDT 2018 * Mon Mar 19 21:23:07 PDT 2018 * Mon Mar 19 21:30:49 PDT 2018 * Mon Mar 19 21:32:05 PDT 2018 * Mon Mar 19 21:35:43 PDT 2018 * fix cpu limits * Mon Mar 19 22:25:07 PDT 2018 --- python/ray/rllib/dqn/apex.py | 4 +--- python/ray/rllib/dqn/dqn.py | 8 +------- python/ray/rllib/optimizers/apex_optimizer.py | 17 ++++++++++------- .../cartpole-grid-search-example.yaml | 2 +- python/ray/rllib/tuned_examples/hopper-ppo.yaml | 4 ++-- .../ray/rllib/tuned_examples/humanoid-es.yaml | 4 ++-- .../rllib/tuned_examples/humanoid-ppo-gae.yaml | 4 ++-- .../ray/rllib/tuned_examples/humanoid-ppo.yaml | 4 ++-- .../ray/rllib/tuned_examples/pendulum-ppo.yaml | 4 +++- python/ray/rllib/tuned_examples/pong-apex.yaml | 4 ++-- python/ray/rllib/tuned_examples/pong-ppo.yaml | 4 +++- .../ray/rllib/tuned_examples/walker2d-ppo.yaml | 4 ++-- 12 files changed, 31 insertions(+), 32 deletions(-) diff --git a/python/ray/rllib/dqn/apex.py b/python/ray/rllib/dqn/apex.py index 0ec1778fc..90ec8b8be 100644 --- a/python/ray/rllib/dqn/apex.py +++ b/python/ray/rllib/dqn/apex.py @@ -22,7 +22,6 @@ APEX_DEFAULT_CONFIG = dict(DQN_CONFIG, **dict( timesteps_per_iteration=25000, per_worker_exploration=True, worker_side_prioritization=True, - force_evaluators_remote=False, # consider enabling for large clusters )) @@ -30,8 +29,7 @@ class ApexAgent(DQNAgent): """DQN variant that uses the Ape-X distributed policy optimizer. By default, this is configured for a large single node (32 cores). For - running in a large cluster, increase `num_workers` and consider setting - `force_evaluators_remote` to move workers off of the head node. + running in a large cluster, increase the `num_workers` config var. """ _agent_name = "APEX" diff --git a/python/ray/rllib/dqn/dqn.py b/python/ray/rllib/dqn/dqn.py index 100225474..fdea65415 100644 --- a/python/ray/rllib/dqn/dqn.py +++ b/python/ray/rllib/dqn/dqn.py @@ -11,7 +11,6 @@ import tensorflow as tf import ray from ray.rllib import optimizers from ray.rllib.dqn.dqn_evaluator import DQNEvaluator -from ray.rllib.utils.actors import drop_colocated from ray.rllib.agent import Agent from ray.tune.result import TrainingResult @@ -114,9 +113,7 @@ DEFAULT_CONFIG = dict( # Whether to use a distribution of epsilons across workers for exploration. per_worker_exploration=False, # Whether to compute priorities on workers. - worker_side_prioritization=False, - # Whether to force evaluator actors to be placed on remote machines. - force_evaluators_remote=False) + worker_side_prioritization=False) class DQNAgent(Agent): @@ -137,9 +134,6 @@ class DQNAgent(Agent): i) for i in range(self.config["num_workers"])] - if self.config["force_evaluators_remote"]: - self.remote_evaluators = drop_colocated(self.remote_evaluators) - for k in OPTIMIZER_SHARED_CONFIGS: if k not in self.config["optimizer_config"]: self.config["optimizer_config"][k] = self.config[k] diff --git a/python/ray/rllib/optimizers/apex_optimizer.py b/python/ray/rllib/optimizers/apex_optimizer.py index ded738f62..dc9a9752f 100644 --- a/python/ray/rllib/optimizers/apex_optimizer.py +++ b/python/ray/rllib/optimizers/apex_optimizer.py @@ -18,7 +18,7 @@ import ray from ray.rllib.optimizers.policy_optimizer import PolicyOptimizer from ray.rllib.optimizers.replay_buffer import PrioritizedReplayBuffer from ray.rllib.optimizers.sample_batch import SampleBatch -from ray.rllib.utils.actors import TaskPool, create_colocated +from ray.rllib.utils.actors import TaskPool from ray.rllib.utils.timer import TimerStat from ray.rllib.utils.window_stat import WindowStat @@ -163,12 +163,15 @@ class ApexOptimizer(PolicyOptimizer): self.learner = LearnerThread(self.local_evaluator) self.learner.start() - self.replay_actors = create_colocated( - ReplayActor, - [num_replay_buffer_shards, learning_starts, buffer_size, - train_batch_size, prioritized_replay_alpha, - prioritized_replay_beta, prioritized_replay_eps, clip_rewards], - num_replay_buffer_shards) + # TODO(ekl) use create_colocated() for these actors once + # https://github.com/ray-project/ray/issues/1734 is fixed + self.replay_actors = [ + ReplayActor.remote( + num_replay_buffer_shards, learning_starts, buffer_size, + train_batch_size, prioritized_replay_alpha, + prioritized_replay_beta, prioritized_replay_eps, clip_rewards) + for _ in range(num_replay_buffer_shards) + ] assert len(self.remote_evaluators) > 0 # Stats diff --git a/python/ray/rllib/tuned_examples/cartpole-grid-search-example.yaml b/python/ray/rllib/tuned_examples/cartpole-grid-search-example.yaml index c5033c712..7aa56af31 100644 --- a/python/ray/rllib/tuned_examples/cartpole-grid-search-example.yaml +++ b/python/ray/rllib/tuned_examples/cartpole-grid-search-example.yaml @@ -5,7 +5,7 @@ cartpole-ppo: episode_reward_mean: 200 time_total_s: 180 resources: - cpu: 2 + cpu: 3 driver_cpu_limit: 1 config: num_workers: 2 diff --git a/python/ray/rllib/tuned_examples/hopper-ppo.yaml b/python/ray/rllib/tuned_examples/hopper-ppo.yaml index b256a119d..cf9f35e96 100644 --- a/python/ray/rllib/tuned_examples/hopper-ppo.yaml +++ b/python/ray/rllib/tuned_examples/hopper-ppo.yaml @@ -2,8 +2,8 @@ hopper-ppo: env: Hopper-v1 run: PPO resources: - cpu: 64 + cpu: 65 gpu: 4 - driver_cpu_limit: 4 + driver_cpu_limit: 1 driver_gpu_limit: 4 config: {"gamma": 0.995, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": .0001, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 160000, "num_workers": 64} diff --git a/python/ray/rllib/tuned_examples/humanoid-es.yaml b/python/ray/rllib/tuned_examples/humanoid-es.yaml index a3cda3ca7..793363eee 100644 --- a/python/ray/rllib/tuned_examples/humanoid-es.yaml +++ b/python/ray/rllib/tuned_examples/humanoid-es.yaml @@ -2,8 +2,8 @@ humanoid-es: env: Humanoid-v1 run: ES resources: - cpu: 100 - driver_cpu_limit: 4 + cpu: 101 + driver_cpu_limit: 1 stop: episode_reward_mean: 6000 config: diff --git a/python/ray/rllib/tuned_examples/humanoid-ppo-gae.yaml b/python/ray/rllib/tuned_examples/humanoid-ppo-gae.yaml index b7ce6c1cc..007c6349a 100644 --- a/python/ray/rllib/tuned_examples/humanoid-ppo-gae.yaml +++ b/python/ray/rllib/tuned_examples/humanoid-ppo-gae.yaml @@ -4,8 +4,8 @@ humanoid-ppo-gae: stop: episode_reward_mean: 6000 resources: - cpu: 64 + cpu: 65 gpu: 4 - driver_cpu_limit: 4 + driver_cpu_limit: 1 config: {"lambda": 0.95, "clip_param": 0.2, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": .0001, "sgd_batchsize": 32768, "horizon": 5000, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64, "model": {"free_log_std": true}, "write_logs": false} diff --git a/python/ray/rllib/tuned_examples/humanoid-ppo.yaml b/python/ray/rllib/tuned_examples/humanoid-ppo.yaml index c58f96bca..a5f55b1a4 100644 --- a/python/ray/rllib/tuned_examples/humanoid-ppo.yaml +++ b/python/ray/rllib/tuned_examples/humanoid-ppo.yaml @@ -4,7 +4,7 @@ humanoid-ppo: stop: episode_reward_mean: 6000 resources: - cpu: 64 + cpu: 65 gpu: 4 - driver_cpu_limit: 4 + driver_cpu_limit: 1 config: {"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": .0001, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64, "model": {"free_log_std": true}, "use_gae": false} diff --git a/python/ray/rllib/tuned_examples/pendulum-ppo.yaml b/python/ray/rllib/tuned_examples/pendulum-ppo.yaml index abad14ff1..9e8ad9221 100644 --- a/python/ray/rllib/tuned_examples/pendulum-ppo.yaml +++ b/python/ray/rllib/tuned_examples/pendulum-ppo.yaml @@ -3,9 +3,11 @@ pendulum-ppo: env: Pendulum-v0 run: PPO resources: - cpu: 4 + cpu: 5 + driver_cpu_limit: 1 config: timesteps_per_batch: 2048 + num_workers: 4 lambda: 0.1 gamma: 0.95 sgd_stepsize: 0.0003 diff --git a/python/ray/rllib/tuned_examples/pong-apex.yaml b/python/ray/rllib/tuned_examples/pong-apex.yaml index d63b2dea1..1eaa10480 100644 --- a/python/ray/rllib/tuned_examples/pong-apex.yaml +++ b/python/ray/rllib/tuned_examples/pong-apex.yaml @@ -6,10 +6,10 @@ pong-apex: run: APEX resources: cpu: - eval: spec.config.num_workers + eval: 1 + spec.config.num_workers + driver_cpu_limit: 1 gpu: 1 config: - force_evaluators_remote: True # set to False if you're running on a single node target_network_update_freq: 50000 num_workers: 32 lr: .0001 diff --git a/python/ray/rllib/tuned_examples/pong-ppo.yaml b/python/ray/rllib/tuned_examples/pong-ppo.yaml index 4b23fa350..58956bc05 100644 --- a/python/ray/rllib/tuned_examples/pong-ppo.yaml +++ b/python/ray/rllib/tuned_examples/pong-ppo.yaml @@ -9,11 +9,13 @@ pong-deterministic-ppo: env: PongDeterministic-v4 run: PPO resources: - cpu: 6 + cpu: 5 gpu: 1 + driver_cpu_limit: 1 stop: episode_reward_mean: 21 config: gamma: 0.99 + num_workers: 4 num_sgd_iter: 20 devices: ["/gpu:0"] diff --git a/python/ray/rllib/tuned_examples/walker2d-ppo.yaml b/python/ray/rllib/tuned_examples/walker2d-ppo.yaml index 4f712a79a..95fbeeb51 100644 --- a/python/ray/rllib/tuned_examples/walker2d-ppo.yaml +++ b/python/ray/rllib/tuned_examples/walker2d-ppo.yaml @@ -2,7 +2,7 @@ walker2d-v1-ppo: env: Walker2d-v1 run: PPO resources: - cpu: 64 + cpu: 65 gpu: 4 - driver_cpu_limit: 4 + driver_cpu_limit: 1 config: {"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": .0001, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64}