From b41bdcefa0351a4bf77f0c46917663b5014a364b Mon Sep 17 00:00:00 2001
From: Eric Liang <ekhliang@gmail.com>
Date: Tue, 20 Mar 2018 19:29:52 -0700
Subject: [PATCH] [rllib] Update RLlib to work with new actor scheduling
 behavior (#1754)

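Summary of changes:

* Remove the `force_evaluators_remote` config option from DQN and Ape-X,
  along with the `drop_colocated` call that implemented it.

* Create the Ape-X replay actors directly with `ReplayActor.remote(...)`
  instead of `create_colocated()`, until
  https://github.com/ray-project/ray/issues/1734 is fixed.

* Update the tuned example resource requests: set `driver_cpu_limit` to 1
  and adjust `cpu` to match (1 + `num_workers` in the examples that state
  their worker count); set `num_workers` explicitly in pendulum-ppo and
  pong-ppo.

Squashed work-in-progress commits:

* Mon Mar 19 21:23:01 PDT 2018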

* Mon Mar 19 21:23:07 PDT 2018

* Mon Mar 19 21:30:49 PDT 2018

* Mon Mar 19 21:32:05 PDT 2018

* Mon Mar 19 21:35:43 PDT 2018

* fix cpu limits

* Mon Mar 19 22:25:07 PDT 2018
---
 python/ray/rllib/dqn/apex.py                    |  4 +---
 python/ray/rllib/dqn/dqn.py                     |  8 +-------
 python/ray/rllib/optimizers/apex_optimizer.py   | 17 ++++++++++-------
 .../cartpole-grid-search-example.yaml           |  2 +-
 python/ray/rllib/tuned_examples/hopper-ppo.yaml |  4 ++--
 .../ray/rllib/tuned_examples/humanoid-es.yaml   |  4 ++--
 .../rllib/tuned_examples/humanoid-ppo-gae.yaml  |  4 ++--
 .../ray/rllib/tuned_examples/humanoid-ppo.yaml  |  4 ++--
 .../ray/rllib/tuned_examples/pendulum-ppo.yaml  |  4 +++-
 python/ray/rllib/tuned_examples/pong-apex.yaml  |  4 ++--
 python/ray/rllib/tuned_examples/pong-ppo.yaml   |  4 +++-
 .../ray/rllib/tuned_examples/walker2d-ppo.yaml  |  4 ++--
 12 files changed, 31 insertions(+), 32 deletions(-)

diff --git a/python/ray/rllib/dqn/apex.py b/python/ray/rllib/dqn/apex.py
index 0ec1778fc..90ec8b8be 100644
--- a/python/ray/rllib/dqn/apex.py
+++ b/python/ray/rllib/dqn/apex.py
@@ -22,7 +22,6 @@ APEX_DEFAULT_CONFIG = dict(DQN_CONFIG, **dict(
     timesteps_per_iteration=25000,
     per_worker_exploration=True,
     worker_side_prioritization=True,
-    force_evaluators_remote=False,  # consider enabling for large clusters
 ))
 
 
@@ -30,8 +29,7 @@ class ApexAgent(DQNAgent):
     """DQN variant that uses the Ape-X distributed policy optimizer.
 
     By default, this is configured for a large single node (32 cores). For
-    running in a large cluster, increase `num_workers` and consider setting
-    `force_evaluators_remote` to move workers off of the head node.
+    running in a large cluster, increase the `num_workers` config var.
     """
 
     _agent_name = "APEX"
diff --git a/python/ray/rllib/dqn/dqn.py b/python/ray/rllib/dqn/dqn.py
index 100225474..fdea65415 100644
--- a/python/ray/rllib/dqn/dqn.py
+++ b/python/ray/rllib/dqn/dqn.py
@@ -11,7 +11,6 @@ import tensorflow as tf
 import ray
 from ray.rllib import optimizers
 from ray.rllib.dqn.dqn_evaluator import DQNEvaluator
-from ray.rllib.utils.actors import drop_colocated
 from ray.rllib.agent import Agent
 from ray.tune.result import TrainingResult
 
@@ -114,9 +113,7 @@ DEFAULT_CONFIG = dict(
     # Whether to use a distribution of epsilons across workers for exploration.
     per_worker_exploration=False,
     # Whether to compute priorities on workers.
-    worker_side_prioritization=False,
-    # Whether to force evaluator actors to be placed on remote machines.
-    force_evaluators_remote=False)
+    worker_side_prioritization=False)
 
 
 class DQNAgent(Agent):
@@ -137,9 +134,6 @@ class DQNAgent(Agent):
                 i)
             for i in range(self.config["num_workers"])]
 
-        if self.config["force_evaluators_remote"]:
-            self.remote_evaluators = drop_colocated(self.remote_evaluators)
-
         for k in OPTIMIZER_SHARED_CONFIGS:
             if k not in self.config["optimizer_config"]:
                 self.config["optimizer_config"][k] = self.config[k]
diff --git a/python/ray/rllib/optimizers/apex_optimizer.py b/python/ray/rllib/optimizers/apex_optimizer.py
index ded738f62..dc9a9752f 100644
--- a/python/ray/rllib/optimizers/apex_optimizer.py
+++ b/python/ray/rllib/optimizers/apex_optimizer.py
@@ -18,7 +18,7 @@ import ray
 from ray.rllib.optimizers.policy_optimizer import PolicyOptimizer
 from ray.rllib.optimizers.replay_buffer import PrioritizedReplayBuffer
 from ray.rllib.optimizers.sample_batch import SampleBatch
-from ray.rllib.utils.actors import TaskPool, create_colocated
+from ray.rllib.utils.actors import TaskPool
 from ray.rllib.utils.timer import TimerStat
 from ray.rllib.utils.window_stat import WindowStat
 
@@ -163,12 +163,15 @@ class ApexOptimizer(PolicyOptimizer):
         self.learner = LearnerThread(self.local_evaluator)
         self.learner.start()
 
-        self.replay_actors = create_colocated(
-            ReplayActor,
-            [num_replay_buffer_shards, learning_starts, buffer_size,
-             train_batch_size, prioritized_replay_alpha,
-             prioritized_replay_beta, prioritized_replay_eps, clip_rewards],
-            num_replay_buffer_shards)
+        # TODO(ekl) use create_colocated() for these actors once
+        # https://github.com/ray-project/ray/issues/1734 is fixed
+        self.replay_actors = [
+            ReplayActor.remote(
+                num_replay_buffer_shards, learning_starts, buffer_size,
+                train_batch_size, prioritized_replay_alpha,
+                prioritized_replay_beta, prioritized_replay_eps, clip_rewards)
+            for _ in range(num_replay_buffer_shards)
+        ]
         assert len(self.remote_evaluators) > 0
 
         # Stats
diff --git a/python/ray/rllib/tuned_examples/cartpole-grid-search-example.yaml b/python/ray/rllib/tuned_examples/cartpole-grid-search-example.yaml
index c5033c712..7aa56af31 100644
--- a/python/ray/rllib/tuned_examples/cartpole-grid-search-example.yaml
+++ b/python/ray/rllib/tuned_examples/cartpole-grid-search-example.yaml
@@ -5,7 +5,7 @@ cartpole-ppo:
         episode_reward_mean: 200
         time_total_s: 180
     resources:
-        cpu: 2
+        cpu: 3
         driver_cpu_limit: 1
     config:
         num_workers: 2
diff --git a/python/ray/rllib/tuned_examples/hopper-ppo.yaml b/python/ray/rllib/tuned_examples/hopper-ppo.yaml
index b256a119d..cf9f35e96 100644
--- a/python/ray/rllib/tuned_examples/hopper-ppo.yaml
+++ b/python/ray/rllib/tuned_examples/hopper-ppo.yaml
@@ -2,8 +2,8 @@ hopper-ppo:
     env: Hopper-v1
     run: PPO
     resources:
-        cpu: 64
+        cpu: 65
         gpu: 4
-        driver_cpu_limit: 4
+        driver_cpu_limit: 1
         driver_gpu_limit: 4
     config: {"gamma": 0.995, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": .0001, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 160000, "num_workers": 64}
diff --git a/python/ray/rllib/tuned_examples/humanoid-es.yaml b/python/ray/rllib/tuned_examples/humanoid-es.yaml
index a3cda3ca7..793363eee 100644
--- a/python/ray/rllib/tuned_examples/humanoid-es.yaml
+++ b/python/ray/rllib/tuned_examples/humanoid-es.yaml
@@ -2,8 +2,8 @@ humanoid-es:
     env: Humanoid-v1
     run: ES
     resources:
-        cpu: 100
-        driver_cpu_limit: 4
+        cpu: 101
+        driver_cpu_limit: 1
     stop:
         episode_reward_mean: 6000
     config:
diff --git a/python/ray/rllib/tuned_examples/humanoid-ppo-gae.yaml b/python/ray/rllib/tuned_examples/humanoid-ppo-gae.yaml
index b7ce6c1cc..007c6349a 100644
--- a/python/ray/rllib/tuned_examples/humanoid-ppo-gae.yaml
+++ b/python/ray/rllib/tuned_examples/humanoid-ppo-gae.yaml
@@ -4,8 +4,8 @@ humanoid-ppo-gae:
     stop:
         episode_reward_mean: 6000
     resources:
-        cpu: 64
+        cpu: 65
         gpu: 4
-        driver_cpu_limit: 4
+        driver_cpu_limit: 1
     config: {"lambda": 0.95, "clip_param": 0.2, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": .0001, "sgd_batchsize": 32768, "horizon": 5000, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64, "model": {"free_log_std": true}, "write_logs": false}
 
diff --git a/python/ray/rllib/tuned_examples/humanoid-ppo.yaml b/python/ray/rllib/tuned_examples/humanoid-ppo.yaml
index c58f96bca..a5f55b1a4 100644
--- a/python/ray/rllib/tuned_examples/humanoid-ppo.yaml
+++ b/python/ray/rllib/tuned_examples/humanoid-ppo.yaml
@@ -4,7 +4,7 @@ humanoid-ppo:
     stop:
        episode_reward_mean: 6000
     resources:
-       cpu: 64
+       cpu: 65
        gpu: 4
-       driver_cpu_limit: 4
+       driver_cpu_limit: 1
     config: {"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": .0001, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64, "model": {"free_log_std": true}, "use_gae": false}
diff --git a/python/ray/rllib/tuned_examples/pendulum-ppo.yaml b/python/ray/rllib/tuned_examples/pendulum-ppo.yaml
index abad14ff1..9e8ad9221 100644
--- a/python/ray/rllib/tuned_examples/pendulum-ppo.yaml
+++ b/python/ray/rllib/tuned_examples/pendulum-ppo.yaml
@@ -3,9 +3,11 @@ pendulum-ppo:
     env: Pendulum-v0
     run: PPO
     resources:
-        cpu: 4
+        cpu: 5
+        driver_cpu_limit: 1
     config:
         timesteps_per_batch: 2048
+        num_workers: 4
         lambda: 0.1
         gamma: 0.95
         sgd_stepsize: 0.0003
diff --git a/python/ray/rllib/tuned_examples/pong-apex.yaml b/python/ray/rllib/tuned_examples/pong-apex.yaml
index d63b2dea1..1eaa10480 100644
--- a/python/ray/rllib/tuned_examples/pong-apex.yaml
+++ b/python/ray/rllib/tuned_examples/pong-apex.yaml
@@ -6,10 +6,10 @@ pong-apex:
     run: APEX
     resources:
         cpu:
-            eval: spec.config.num_workers
+            eval: 1 + spec.config.num_workers
+        driver_cpu_limit: 1
         gpu: 1
     config:
-        force_evaluators_remote: True  # set to False if you're running on a single node
         target_network_update_freq: 50000
         num_workers: 32
         lr: .0001
diff --git a/python/ray/rllib/tuned_examples/pong-ppo.yaml b/python/ray/rllib/tuned_examples/pong-ppo.yaml
index 4b23fa350..58956bc05 100644
--- a/python/ray/rllib/tuned_examples/pong-ppo.yaml
+++ b/python/ray/rllib/tuned_examples/pong-ppo.yaml
@@ -9,11 +9,13 @@ pong-deterministic-ppo:
     env: PongDeterministic-v4
     run: PPO
     resources:
-        cpu: 6
+        cpu: 5
         gpu: 1
+        driver_cpu_limit: 1
     stop:
         episode_reward_mean: 21
     config:
         gamma: 0.99
+        num_workers: 4
         num_sgd_iter: 20
         devices: ["/gpu:0"]
diff --git a/python/ray/rllib/tuned_examples/walker2d-ppo.yaml b/python/ray/rllib/tuned_examples/walker2d-ppo.yaml
index 4f712a79a..95fbeeb51 100644
--- a/python/ray/rllib/tuned_examples/walker2d-ppo.yaml
+++ b/python/ray/rllib/tuned_examples/walker2d-ppo.yaml
@@ -2,7 +2,7 @@ walker2d-v1-ppo:
     env: Walker2d-v1
     run: PPO
     resources:
-        cpu: 64
+        cpu: 65
         gpu: 4
-        driver_cpu_limit: 4
+        driver_cpu_limit: 1
     config: {"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": .0001, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64}