From b5799b528654daa6afe582295b0c006e793d68fe Mon Sep 17 00:00:00 2001
From: Eric Liang <ekhliang@gmail.com>
Date: Fri, 1 Mar 2019 13:19:33 -0800
Subject: [PATCH] [rllib] Set PPO observation filter to NoFilter by default
 (#4191)

---
 python/ray/rllib/agents/agent.py              |  1 +
 python/ray/rllib/agents/ppo/ppo.py            | 21 ++++++++++---------
 .../rllib/tuned_examples/halfcheetah-ppo.yaml |  1 +
 .../ray/rllib/tuned_examples/hopper-ppo.yaml  |  1 +
 .../tuned_examples/humanoid-ppo-gae.yaml      |  1 +
 .../rllib/tuned_examples/humanoid-ppo.yaml    |  1 +
 .../tuned_examples/hyperband-cartpole.yaml    |  1 +
 .../rllib/tuned_examples/pendulum-ppo.yaml    |  1 +
 .../regression_tests/cartpole-ppo.yaml        |  1 +
 .../regression_tests/pendulum-ppo.yaml        |  1 +
 .../rllib/tuned_examples/walker2d-ppo.yaml    |  1 +
 11 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/python/ray/rllib/agents/agent.py b/python/ray/rllib/agents/agent.py
index 25ecd4ae8..829c020f5 100644
--- a/python/ray/rllib/agents/agent.py
+++ b/python/ray/rllib/agents/agent.py
@@ -332,6 +332,7 @@ class Agent(Trainable):
         merged_config = deep_update(merged_config, config,
                                     self._allow_unknown_configs,
                                     self._allow_unknown_subkeys)
+        self.raw_user_config = config
         self.config = merged_config
         Agent._validate_config(self.config)
         if self.config.get("log_level"):
diff --git a/python/ray/rllib/agents/ppo/ppo.py b/python/ray/rllib/agents/ppo/ppo.py
index 3798f63ff..790aa4b65 100644
--- a/python/ray/rllib/agents/ppo/ppo.py
+++ b/python/ray/rllib/agents/ppo/ppo.py
@@ -51,7 +51,7 @@ DEFAULT_CONFIG = with_common_config({
     # Whether to rollout "complete_episodes" or "truncate_episodes"
     "batch_mode": "truncate_episodes",
     # Which observation filter to apply to the observation
-    "observation_filter": "MeanStdFilter",
+    "observation_filter": "NoFilter",
     # Uses the sync samples optimizer instead of the multi-gpu one. This does
     # not support minibatches.
     "simple_optimizer": False,
@@ -99,6 +99,14 @@ class PPOAgent(Agent):
 
     @override(Agent)
     def _train(self):
+        if "observation_filter" not in self.raw_user_config:
+            # TODO(ekl) remove this message after a few releases
+            logger.info(
+                "Important! Since 0.7.0, observation normalization is no "
+                "longer enabled by default. To enable running-mean "
+                "normalization, set 'observation_filter': 'MeanStdFilter'. "
+                "You can ignore this message if your environment doesn't "
+                "require observation normalization.")
         prev_steps = self.optimizer.num_steps_sampled
         fetches = self.optimizer.step()
         if "kl" in fetches:
@@ -139,7 +147,6 @@ class PPOAgent(Agent):
                 "{} iterations for your value ".format(rew_scale) +
                 "function to converge. If this is not intended, consider "
                 "increasing `vf_clip_param`.")
-
         return res
 
     def _validate_config(self):
@@ -159,13 +166,7 @@ class PPOAgent(Agent):
                 "In multi-agent mode, policies will be optimized sequentially "
                 "by the multi-GPU optimizer. Consider setting "
                 "simple_optimizer=True if this doesn't work for you.")
-        if self.config["observation_filter"] != "NoFilter":
-            logger.warning(
-                "By default, observations will be normalized with {}. ".format(
-                    self.config["observation_filter"]) +
-                "If you are using image or discrete type observations, "
-                "consider disabling this with observation_filter=NoFilter.")
         if not self.config["vf_share_layers"]:
             logger.warning(
-                "By default, the value function will NOT share layers with "
-                "the policy model (vf_share_layers=False).")
+                "FYI: By default, the value function will not share layers "
+                "with the policy model ('vf_share_layers': False).")
diff --git a/python/ray/rllib/tuned_examples/halfcheetah-ppo.yaml b/python/ray/rllib/tuned_examples/halfcheetah-ppo.yaml
index d154e7c29..60cbd03dc 100644
--- a/python/ray/rllib/tuned_examples/halfcheetah-ppo.yaml
+++ b/python/ray/rllib/tuned_examples/halfcheetah-ppo.yaml
@@ -20,3 +20,4 @@ halfcheetah-ppo:
         num_envs_per_worker: 
             grid_search: [16, 32]
         batch_mode: truncate_episodes
+        observation_filter: MeanStdFilter
diff --git a/python/ray/rllib/tuned_examples/hopper-ppo.yaml b/python/ray/rllib/tuned_examples/hopper-ppo.yaml
index 5082dc792..c73d4480e 100644
--- a/python/ray/rllib/tuned_examples/hopper-ppo.yaml
+++ b/python/ray/rllib/tuned_examples/hopper-ppo.yaml
@@ -11,3 +11,4 @@ hopper-ppo:
         num_workers: 64
         num_gpus: 4
         batch_mode: complete_episodes
+        observation_filter: MeanStdFilter
diff --git a/python/ray/rllib/tuned_examples/humanoid-ppo-gae.yaml b/python/ray/rllib/tuned_examples/humanoid-ppo-gae.yaml
index 9473b5df7..c9ddbd017 100644
--- a/python/ray/rllib/tuned_examples/humanoid-ppo-gae.yaml
+++ b/python/ray/rllib/tuned_examples/humanoid-ppo-gae.yaml
@@ -18,3 +18,4 @@ humanoid-ppo-gae:
         num_workers: 64
         num_gpus: 4
         batch_mode: complete_episodes
+        observation_filter: MeanStdFilter
diff --git a/python/ray/rllib/tuned_examples/humanoid-ppo.yaml b/python/ray/rllib/tuned_examples/humanoid-ppo.yaml
index 07371d16f..b531531e9 100644
--- a/python/ray/rllib/tuned_examples/humanoid-ppo.yaml
+++ b/python/ray/rllib/tuned_examples/humanoid-ppo.yaml
@@ -16,3 +16,4 @@ humanoid-ppo:
         num_workers: 64
         num_gpus: 4
         batch_mode: complete_episodes
+        observation_filter: MeanStdFilter
diff --git a/python/ray/rllib/tuned_examples/hyperband-cartpole.yaml b/python/ray/rllib/tuned_examples/hyperband-cartpole.yaml
index 64d5571db..8c0a510bc 100644
--- a/python/ray/rllib/tuned_examples/hyperband-cartpole.yaml
+++ b/python/ray/rllib/tuned_examples/hyperband-cartpole.yaml
@@ -11,3 +11,4 @@ cartpole-ppo:
             grid_search: [1, 4]
         sgd_minibatch_size:
             grid_search: [128, 256, 512]
+        observation_filter: MeanStdFilter
diff --git a/python/ray/rllib/tuned_examples/pendulum-ppo.yaml b/python/ray/rllib/tuned_examples/pendulum-ppo.yaml
index 3e9d45179..2dafa9300 100644
--- a/python/ray/rllib/tuned_examples/pendulum-ppo.yaml
+++ b/python/ray/rllib/tuned_examples/pendulum-ppo.yaml
@@ -15,3 +15,4 @@ pendulum-ppo:
         model:
             fcnet_hiddens: [64, 64]
         batch_mode: complete_episodes
+        observation_filter: MeanStdFilter
diff --git a/python/ray/rllib/tuned_examples/regression_tests/cartpole-ppo.yaml b/python/ray/rllib/tuned_examples/regression_tests/cartpole-ppo.yaml
index 3f326cf83..d34b35280 100644
--- a/python/ray/rllib/tuned_examples/regression_tests/cartpole-ppo.yaml
+++ b/python/ray/rllib/tuned_examples/regression_tests/cartpole-ppo.yaml
@@ -7,3 +7,4 @@ cartpole-ppo:
     config:
         num_workers: 1
         batch_mode: complete_episodes
+        observation_filter: MeanStdFilter
diff --git a/python/ray/rllib/tuned_examples/regression_tests/pendulum-ppo.yaml b/python/ray/rllib/tuned_examples/regression_tests/pendulum-ppo.yaml
index 015429110..b6bfbaf79 100644
--- a/python/ray/rllib/tuned_examples/regression_tests/pendulum-ppo.yaml
+++ b/python/ray/rllib/tuned_examples/regression_tests/pendulum-ppo.yaml
@@ -17,3 +17,4 @@ pendulum-ppo:
         model:
             fcnet_hiddens: [64, 64]
         batch_mode: complete_episodes
+        observation_filter: MeanStdFilter
diff --git a/python/ray/rllib/tuned_examples/walker2d-ppo.yaml b/python/ray/rllib/tuned_examples/walker2d-ppo.yaml
index 9d64720a2..a88589ebf 100644
--- a/python/ray/rllib/tuned_examples/walker2d-ppo.yaml
+++ b/python/ray/rllib/tuned_examples/walker2d-ppo.yaml
@@ -10,3 +10,4 @@ walker2d-v1-ppo:
         num_workers: 64
         num_gpus: 4
         batch_mode: complete_episodes
+        observation_filter: MeanStdFilter