[rllib] Set PPO observation filter to NoFilter by default (#4191)

2026-06-28 10:17:19 +08:00 · 2019-03-01 13:19:33 -08:00
parent 11a28834fa
commit b5799b5286
11 changed files with 21 additions and 10 deletions
@@ -332,6 +332,7 @@ class Agent(Trainable):
        merged_config = deep_update(merged_config, config,
                                    self._allow_unknown_configs,
                                    self._allow_unknown_subkeys)
+        self.raw_user_config = config
        self.config = merged_config
        Agent._validate_config(self.config)
        if self.config.get("log_level"):
@@ -51,7 +51,7 @@ DEFAULT_CONFIG = with_common_config({
    # Whether to rollout "complete_episodes" or "truncate_episodes"
    "batch_mode": "truncate_episodes",
    # Which observation filter to apply to the observation
-    "observation_filter": "MeanStdFilter",
+    "observation_filter": "NoFilter",
    # Uses the sync samples optimizer instead of the multi-gpu one. This does
    # not support minibatches.
    "simple_optimizer": False,
@@ -99,6 +99,14 @@ class PPOAgent(Agent):

    @override(Agent)
    def _train(self):
+        if "observation_filter" not in self.raw_user_config:
+            # TODO(ekl) remove this message after a few releases
+            logger.info(
+                "Important! Since 0.7.0, observation normalization is no "
+                "longer enabled by default. To enable running-mean "
+                "normalization, set 'observation_filter': 'MeanStdFilter'. "
+                "You can ignore this message if your environment doesn't "
+                "require observation normalization.")
        prev_steps = self.optimizer.num_steps_sampled
        fetches = self.optimizer.step()
        if "kl" in fetches:
@@ -139,7 +147,6 @@ class PPOAgent(Agent):
                "{} iterations for your value ".format(rew_scale) +
                "function to converge. If this is not intended, consider "
                "increasing `vf_clip_param`.")
-
        return res

    def _validate_config(self):
@@ -159,13 +166,7 @@ class PPOAgent(Agent):
                "In multi-agent mode, policies will be optimized sequentially "
                "by the multi-GPU optimizer. Consider setting "
                "simple_optimizer=True if this doesn't work for you.")
-        if self.config["observation_filter"] != "NoFilter":
-            logger.warning(
-                "By default, observations will be normalized with {}. ".format(
-                    self.config["observation_filter"]) +
-                "If you are using image or discrete type observations, "
-                "consider disabling this with observation_filter=NoFilter.")
        if not self.config["vf_share_layers"]:
            logger.warning(
-                "By default, the value function will NOT share layers with "
-                "the policy model (vf_share_layers=False).")
+                "FYI: By default, the value function will not share layers "
+                "with the policy model ('vf_share_layers': False).")
@@ -20,3 +20,4 @@ halfcheetah-ppo:
        num_envs_per_worker: 
            grid_search: [16, 32]
        batch_mode: truncate_episodes
+        observation_filter: MeanStdFilter
@@ -11,3 +11,4 @@ hopper-ppo:
        num_workers: 64
        num_gpus: 4
        batch_mode: complete_episodes
+        observation_filter: MeanStdFilter
@@ -18,3 +18,4 @@ humanoid-ppo-gae:
        num_workers: 64
        num_gpus: 4
        batch_mode: complete_episodes
+        observation_filter: MeanStdFilter
@@ -16,3 +16,4 @@ humanoid-ppo:
        num_workers: 64
        num_gpus: 4
        batch_mode: complete_episodes
+        observation_filter: MeanStdFilter
@@ -11,3 +11,4 @@ cartpole-ppo:
            grid_search: [1, 4]
        sgd_minibatch_size:
            grid_search: [128, 256, 512]
+        observation_filter: MeanStdFilter
@@ -15,3 +15,4 @@ pendulum-ppo:
        model:
            fcnet_hiddens: [64, 64]
        batch_mode: complete_episodes
+        observation_filter: MeanStdFilter
@@ -7,3 +7,4 @@ cartpole-ppo:
    config:
        num_workers: 1
        batch_mode: complete_episodes
+        observation_filter: MeanStdFilter
@@ -17,3 +17,4 @@ pendulum-ppo:
        model:
            fcnet_hiddens: [64, 64]
        batch_mode: complete_episodes
+        observation_filter: MeanStdFilter
@@ -10,3 +10,4 @@ walker2d-v1-ppo:
        num_workers: 64
        num_gpus: 4
        batch_mode: complete_episodes
+        observation_filter: MeanStdFilter