From e4bea8d10effa45d1fc5b5cb897a1305950880b2 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekhliang@gmail.com>
Date: Sun, 30 Sep 2018 18:37:55 -0700
Subject: [PATCH] [rllib] Default to truncate_episodes and add some more config
 validators (#2967)

* update

* link it

* warn about truncation

* fix

* Update rllib-training.rst

* remove failing deprecated tests
---
 python/ray/rllib/agents/ppo/ppo.py            | 36 ++++++++++++-------
 .../ray/rllib/tuned_examples/hopper-ppo.yaml  |  1 +
 .../tuned_examples/humanoid-ppo-gae.yaml      |  1 +
 .../rllib/tuned_examples/humanoid-ppo.yaml    |  1 +
 .../rllib/tuned_examples/pendulum-ppo.yaml    |  2 +-
 .../regression_tests/cartpole-ppo.yaml        |  1 +
 .../regression_tests/pendulum-ppo.yaml        |  1 +
 .../rllib/tuned_examples/walker2d-ppo.yaml    |  1 +
 test/jenkins_tests/run_multi_node_tests.sh    |  8 +----
 9 files changed, 32 insertions(+), 20 deletions(-)

diff --git a/python/ray/rllib/agents/ppo/ppo.py b/python/ray/rllib/agents/ppo/ppo.py
index f452f7893..d2a991929 100644
--- a/python/ray/rllib/agents/ppo/ppo.py
+++ b/python/ray/rllib/agents/ppo/ppo.py
@@ -48,7 +48,7 @@ DEFAULT_CONFIG = with_common_config({
     # Whether to allocate CPUs for workers (if > 0).
     "num_cpus_per_worker": 1,
     # Whether to rollout "complete_episodes" or "truncate_episodes"
-    "batch_mode": "complete_episodes",
+    "batch_mode": "truncate_episodes",
     # Which observation filter to apply to the observation
     "observation_filter": "MeanStdFilter",
     # Use the sync samples optimizer instead of the multi-gpu one
@@ -80,17 +80,7 @@ class PPOAgent(Agent):
             extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])
 
     def _init(self):
-        waste_ratio = (
-            self.config["sample_batch_size"] * self.config["num_workers"] /
-            self.config["train_batch_size"])
-        if waste_ratio > 1:
-            msg = ("sample_batch_size * num_workers >> train_batch_size. "
-                   "This means that many steps will be discarded. Consider "
-                   "reducing sample_batch_size, or increase train_batch_size.")
-            if waste_ratio > 1.5:
-                raise ValueError(msg)
-            else:
-                print("Warning: " + msg)
+        self._validate_config()
         self.local_evaluator = self.make_local_evaluator(
             self.env_creator, self._policy_graph)
         self.remote_evaluators = self.make_remote_evaluators(
@@ -114,6 +104,28 @@ class PPOAgent(Agent):
                     "standardize_fields": ["advantages"],
                 })
 
+    def _validate_config(self):
+        waste_ratio = (
+            self.config["sample_batch_size"] * self.config["num_workers"] /
+            self.config["train_batch_size"])
+        if waste_ratio > 1:
+            msg = ("sample_batch_size * num_workers >> train_batch_size. "
+                   "This means that many steps will be discarded. Consider "
+                   "reducing sample_batch_size, or increase train_batch_size.")
+            if waste_ratio > 1.5:
+                raise ValueError(msg)
+            else:
+                print("Warning: " + msg)
+        if self.config["sgd_minibatch_size"] > self.config["train_batch_size"]:
+            raise ValueError(
+                "Minibatch size {} must be <= train batch size {}.".format(
+                    self.config["sgd_minibatch_size"],
+                    self.config["train_batch_size"]))
+        if (self.config["batch_mode"] == "truncate_episodes"
+                and not self.config["use_gae"]):
+            raise ValueError(
+                "Episode truncation is not supported without a value function")
+
     def _train(self):
         prev_steps = self.optimizer.num_steps_sampled
         fetches = self.optimizer.step()
diff --git a/python/ray/rllib/tuned_examples/hopper-ppo.yaml b/python/ray/rllib/tuned_examples/hopper-ppo.yaml
index c1c75b166..5082dc792 100644
--- a/python/ray/rllib/tuned_examples/hopper-ppo.yaml
+++ b/python/ray/rllib/tuned_examples/hopper-ppo.yaml
@@ -10,3 +10,4 @@ hopper-ppo:
         train_batch_size: 160000
         num_workers: 64
         num_gpus: 4
+        batch_mode: complete_episodes
diff --git a/python/ray/rllib/tuned_examples/humanoid-ppo-gae.yaml b/python/ray/rllib/tuned_examples/humanoid-ppo-gae.yaml
index e176dcae2..9473b5df7 100644
--- a/python/ray/rllib/tuned_examples/humanoid-ppo-gae.yaml
+++ b/python/ray/rllib/tuned_examples/humanoid-ppo-gae.yaml
@@ -17,3 +17,4 @@ humanoid-ppo-gae:
             free_log_std: true
         num_workers: 64
         num_gpus: 4
+        batch_mode: complete_episodes
diff --git a/python/ray/rllib/tuned_examples/humanoid-ppo.yaml b/python/ray/rllib/tuned_examples/humanoid-ppo.yaml
index 0608f8b60..07371d16f 100644
--- a/python/ray/rllib/tuned_examples/humanoid-ppo.yaml
+++ b/python/ray/rllib/tuned_examples/humanoid-ppo.yaml
@@ -15,3 +15,4 @@ humanoid-ppo:
         use_gae: false
         num_workers: 64
         num_gpus: 4
+        batch_mode: complete_episodes
diff --git a/python/ray/rllib/tuned_examples/pendulum-ppo.yaml b/python/ray/rllib/tuned_examples/pendulum-ppo.yaml
index 60df6825b..b8c0293a3 100644
--- a/python/ray/rllib/tuned_examples/pendulum-ppo.yaml
+++ b/python/ray/rllib/tuned_examples/pendulum-ppo.yaml
@@ -13,4 +13,4 @@ pendulum-ppo:
         num_sgd_iter: 10
         model:
             fcnet_hiddens: [64, 64]
-            squash_to_range: True
+        batch_mode: complete_episodes
diff --git a/python/ray/rllib/tuned_examples/regression_tests/cartpole-ppo.yaml b/python/ray/rllib/tuned_examples/regression_tests/cartpole-ppo.yaml
index 425958e5c..82ea5846e 100644
--- a/python/ray/rllib/tuned_examples/regression_tests/cartpole-ppo.yaml
+++ b/python/ray/rllib/tuned_examples/regression_tests/cartpole-ppo.yaml
@@ -6,3 +6,4 @@ cartpole-ppo:
         time_total_s: 300
     config:
         num_workers: 1
+        batch_mode: complete_episodes
diff --git a/python/ray/rllib/tuned_examples/regression_tests/pendulum-ppo.yaml b/python/ray/rllib/tuned_examples/regression_tests/pendulum-ppo.yaml
index 8b9d69fce..63536d3be 100644
--- a/python/ray/rllib/tuned_examples/regression_tests/pendulum-ppo.yaml
+++ b/python/ray/rllib/tuned_examples/regression_tests/pendulum-ppo.yaml
@@ -15,3 +15,4 @@ pendulum-ppo:
         num_sgd_iter: 10
         model:
             fcnet_hiddens: [64, 64]
+        batch_mode: complete_episodes
diff --git a/python/ray/rllib/tuned_examples/walker2d-ppo.yaml b/python/ray/rllib/tuned_examples/walker2d-ppo.yaml
index deb5a0038..9d64720a2 100644
--- a/python/ray/rllib/tuned_examples/walker2d-ppo.yaml
+++ b/python/ray/rllib/tuned_examples/walker2d-ppo.yaml
@@ -9,3 +9,4 @@ walker2d-v1-ppo:
         train_batch_size: 320000
         num_workers: 64
         num_gpus: 4
+        batch_mode: complete_episodes
diff --git a/test/jenkins_tests/run_multi_node_tests.sh b/test/jenkins_tests/run_multi_node_tests.sh
index e12eca455..43815f470 100755
--- a/test/jenkins_tests/run_multi_node_tests.sh
+++ b/test/jenkins_tests/run_multi_node_tests.sh
@@ -58,7 +58,7 @@ docker run  -e "RAY_USE_XRAY=1" --rm --shm-size=10G --memory=10G $DOCKER_SHA \
     --env CartPole-v1 \
     --run PPO \
     --stop '{"training_iteration": 2}' \
-    --config '{"kl_coeff": 1.0, "num_sgd_iter": 10, "lr": 1e-4, "sgd_minibatch_size": 64, "train_batch_size": 2000, "num_workers": 1, "use_gae": false}'
+    --config '{"kl_coeff": 1.0, "num_sgd_iter": 10, "lr": 1e-4, "sgd_minibatch_size": 64, "train_batch_size": 2000, "num_workers": 1, "use_gae": false, "batch_mode": "complete_episodes"}'
 
 docker run  -e "RAY_USE_XRAY=1" --rm --shm-size=10G --memory=10G $DOCKER_SHA \
     python /ray/python/ray/rllib/train.py \
@@ -288,12 +288,6 @@ docker run  -e "RAY_USE_XRAY=1" --rm --shm-size=10G --memory=10G $DOCKER_SHA \
     python /ray/python/ray/tune/examples/genetic_example.py \
     --smoke-test
 
-docker run  -e "RAY_USE_XRAY=1" --rm --shm-size=10G --memory=10G $DOCKER_SHA \
-    python /ray/python/ray/rllib/examples/legacy_multiagent/multiagent_mountaincar.py
-
-docker run  -e "RAY_USE_XRAY=1" --rm --shm-size=10G --memory=10G $DOCKER_SHA \
-    python /ray/python/ray/rllib/examples/legacy_multiagent/multiagent_pendulum.py
-
 docker run  -e "RAY_USE_XRAY=1" --rm --shm-size=10G --memory=10G $DOCKER_SHA \
     python /ray/python/ray/rllib/examples/multiagent_cartpole.py --num-iters=2