[rllib] Upgrade to OpenAI Gym 0.10.3 (#1601)

2026-06-28 02:01:24 +08:00 · 2018-03-06 08:31:02 +00:00
parent 162d063f0d
commit 936bebef99
11 changed files with 63 additions and 49 deletions
@@ -22,7 +22,7 @@ class NoopResetEnv(gym.Wrapper):
        self.override_num_noops = None
        assert env.unwrapped.get_action_meanings()[0] == 'NOOP'

-    def _reset(self):
+    def reset(self):
        """ Do no-op action for a number of steps in [1, noop_max]."""
        self.env.reset()
        if self.override_num_noops is not None:
@@ -46,7 +46,7 @@ class FireResetEnv(gym.Wrapper):
        assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
        assert len(env.unwrapped.get_action_meanings()) >= 3

-    def _reset(self):
+    def reset(self):
        self.env.reset()
        obs, _, done, _ = self.env.step(1)
        if done:
@@ -68,7 +68,7 @@ class EpisodicLifeEnv(gym.Wrapper):
        self.was_real_done = True
        self.was_real_reset = False

-    def _step(self, action):
+    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        self.was_real_done = done
        # check current lives, make loss of life terminal,
@@ -82,7 +82,7 @@ class EpisodicLifeEnv(gym.Wrapper):
        self.lives = lives
        return obs, reward, done, info

-    def _reset(self):
+    def reset(self):
        """Reset only when lives are exhausted.
        This way all states are still reachable even though lives are episodic,
        and the learner need not know about any of this behind-the-scenes.
@@ -106,7 +106,7 @@ class MaxAndSkipEnv(gym.Wrapper):
        self._obs_buffer = deque(maxlen=2)
        self._skip = skip

-    def _step(self, action):
+    def step(self, action):
        total_reward = 0.0
        done = None
        for _ in range(self._skip):
@@ -120,7 +120,7 @@ class MaxAndSkipEnv(gym.Wrapper):

        return max_frame, total_reward, done, info

-    def _reset(self):
+    def reset(self):
        """Clear past frame buffer and init. to first obs. from inner env."""
        self._obs_buffer.clear()
        obs = self.env.reset()
@@ -132,9 +132,10 @@ class MaxAndSkipEnv(gym.Wrapper):
 class ProcessFrame80(gym.ObservationWrapper):
    def __init__(self, env=None):
        super(ProcessFrame80, self).__init__(env)
-        self.observation_space = spaces.Box(low=0, high=255, shape=(80, 80, 1))
+        self.observation_space = spaces.Box(
+            low=0, high=255, shape=(80, 80, 1), dtype=np.uint8)

-    def _observation(self, obs):
+    def observation(self, obs):
        return ProcessFrame80.process(obs)

    @staticmethod
@@ -155,7 +156,7 @@ class ProcessFrame80(gym.ObservationWrapper):


 class ClippedRewardsWrapper(gym.RewardWrapper):
-    def _reward(self, reward):
+    def reward(self, reward):
        """Change all the positive rewards to 1, negative to -1 and keep
        zero."""
        return np.sign(reward)
@@ -195,15 +196,16 @@ class FrameStack(gym.Wrapper):
        self.frames = deque([], maxlen=k)
        shp = env.observation_space.shape
        self.observation_space = spaces.Box(
-            low=0, high=255, shape=(shp[0], shp[1], shp[2] * k))
+            low=0, high=255, shape=(shp[0], shp[1], shp[2] * k),
+            dtype=np.uint8)

-    def _reset(self):
+    def reset(self):
        ob = self.env.reset()
        for _ in range(self.k):
            self.frames.append(ob)
        return self._get_ob()

-    def _step(self, action):
+    def step(self, action):
        ob, reward, done, info = self.env.step(action)
        self.frames.append(ob)
        return self._get_ob(), reward, done, info
@@ -20,8 +20,7 @@ def rollout(policy, env, timestep_limit=None, add_noise=False):
    If add_noise is True, the rollout will take noisy actions with
    noise drawn from that stream. Otherwise, no action noise will be added.
    """
-    env_timestep_limit = env.spec.tags.get("wrapper_config.TimeLimit"
-                                           ".max_episode_steps")
+    env_timestep_limit = env.spec.max_episode_steps
    timestep_limit = (env_timestep_limit if timestep_limit is None
                      else min(timestep_limit, env_timestep_limit))
    rews = []
@@ -1,4 +1,4 @@
-import math
+from math import cos
 from gym.spaces import Box, Tuple, Discrete
 import numpy as np
 from gym.envs.classic_control.mountain_car import MountainCarEnv
@@ -23,17 +23,17 @@ class MultiAgentMountainCarEnv(MountainCarEnv):

        self.action_space = [Discrete(3) for _ in range(2)]
        self.observation_space = Tuple([
-            Box(self.low, self.high) for _ in range(2)])
+            Box(self.low, self.high, dtype=np.float32) for _ in range(2)])

-        self._seed()
+        self.seed()
        self.reset()

-    def _step(self, action):
+    def step(self, action):
        summed_act = 0.5 * np.sum(action)

        position, velocity = self.state
        velocity += (summed_act - 1) * 0.001
-        velocity += math.cos(3 * position) * (-0.0025)
+        velocity += cos(3 * position) * (-0.0025)
        velocity = np.clip(velocity, -self.max_speed, self.max_speed)
        position += velocity
        position = np.clip(position, self.min_position, self.max_position)
@@ -47,6 +47,6 @@ class MultiAgentMountainCarEnv(MountainCarEnv):
        self.state = (position, velocity)
        return [np.array(self.state) for _ in range(2)], reward, done, {}

-    def _reset(self):
+    def reset(self):
        self.state = np.array([self.np_random.uniform(low=-0.6, high=-0.4), 0])
        return [np.array(self.state) for _ in range(2)]
@@ -22,18 +22,20 @@ class MultiAgentPendulumEnv(PendulumEnv):

        high = np.array([1., 1., self.max_speed])
        self.action_space = [Box(low=-self.max_torque / 2,
-                                 high=self.max_torque / 2, shape=(1,))
+                                 high=self.max_torque / 2,
+                                 shape=(1,),
+                                 dtype=np.float32)
                             for _ in range(2)]
        self.observation_space = Tuple([
-            Box(low=-high, high=high) for _ in range(2)])
+            Box(low=-high, high=high, dtype=np.float32) for _ in range(2)])

-        self._seed()
+        self.seed()

-    def _seed(self, seed=None):
+    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

-    def _step(self, u):
+    def step(self, u):
        th, thdot = self.state  # th := theta

        summed_u = np.sum(u)
@@ -55,7 +57,7 @@ class MultiAgentPendulumEnv(PendulumEnv):
        self.state = np.array([newth, newthdot])
        return self._get_obs(), -costs, False, {}

-    def _reset(self):
+    def reset(self):
        high = np.array([np.pi, 1])
        self.state = self.np_random.uniform(low=-high, high=high)
        self.last_u = None
@@ -266,7 +266,8 @@ class _RLlibPreprocessorWrapper(gym.ObservationWrapper):
        self.preprocessor = preprocessor

        from gym.spaces.box import Box
-        self.observation_space = Box(-1.0, 1.0, preprocessor.shape)
+        self.observation_space = Box(
+            -1.0, 1.0, preprocessor.shape, dtype=np.float32)

-    def _observation(self, observation):
+    def observation(self, observation):
        return self.preprocessor.transform(observation)
@@ -47,7 +47,7 @@ class ModelCatalogTest(unittest.TestCase):
        class TupleEnv(object):
            def __init__(self):
                self.observation_space = Tuple(
-                    [Discrete(5), Box(0, 1, shape=(3,))])
+                    [Discrete(5), Box(0, 1, shape=(3,), dtype=np.float32)])
        p1 = ModelCatalog.get_preprocessor(
            get_registry(), TupleEnv())
        self.assertEqual(p1.shape, (8,))
@@ -4,6 +4,7 @@ import traceback
 import gym
 from gym.spaces import Box, Discrete, Tuple
 from gym.envs.registration import EnvSpec
+import numpy as np

 import ray
 from ray.rllib.agent import get_agent_class
@@ -12,19 +13,27 @@ from ray.tune.registry import register_env

 ACTION_SPACES_TO_TEST = {
    "discrete": Discrete(5),
-    "vector": Box(0.0, 1.0, (5,)),
-    "simple_tuple": Tuple([Box(0.0, 1.0, (5,)), Box(0.0, 1.0, (5,))]),
-    "implicit_tuple": [Box(0.0, 1.0, (5,)), Box(0.0, 1.0, (5,))],
+    "vector": Box(0.0, 1.0, (5,), dtype=np.float32),
+    "simple_tuple": Tuple([
+        Box(0.0, 1.0, (5,), dtype=np.float32),
+        Box(0.0, 1.0, (5,), dtype=np.float32)]),
+    "implicit_tuple": [
+        Box(0.0, 1.0, (5,), dtype=np.float32),
+        Box(0.0, 1.0, (5,), dtype=np.float32)],
 }

 OBSERVATION_SPACES_TO_TEST = {
    "discrete": Discrete(5),
-    "vector": Box(0.0, 1.0, (5,)),
-    "image": Box(0.0, 1.0, (80, 80, 1)),
-    "atari": Box(0.0, 1.0, (210, 160, 3)),
-    "atari_ram": Box(0.0, 1.0, (128,)),
-    "simple_tuple": Tuple([Box(0.0, 1.0, (5,)), Box(0.0, 1.0, (5,))]),
-    "mixed_tuple": Tuple([Discrete(10), Box(0.0, 1.0, (5,))]),
+    "vector": Box(0.0, 1.0, (5,), dtype=np.float32),
+    "image": Box(0.0, 1.0, (80, 80, 1), dtype=np.float32),
+    "atari": Box(0.0, 1.0, (210, 160, 3), dtype=np.float32),
+    "atari_ram": Box(0.0, 1.0, (128,), dtype=np.float32),
+    "simple_tuple": Tuple([
+        Box(0.0, 1.0, (5,), dtype=np.float32),
+        Box(0.0, 1.0, (5,), dtype=np.float32)]),
+    "mixed_tuple": Tuple([
+        Discrete(10),
+        Box(0.0, 1.0, (5,), dtype=np.float32)]),
 }

 # (alg, action_space, obs_space)
@@ -59,7 +68,7 @@ def make_stub_env(action_space, obs_space):
        def __init__(self):
            self.action_space = action_space
            self.observation_space = obs_space
-            self._spec = EnvSpec("StubEnv-v0")
+            self.spec = EnvSpec("StubEnv-v0")

        def reset(self):
            sample = self.observation_space.sample()
@@ -196,8 +196,7 @@ def _env_runner(env, policy, num_local_steps, horizon, obs_filter):
    """
    last_observation = obs_filter(env.reset())
    try:
-        horizon = horizon if horizon else env.spec.tags.get(
-            "wrapper_config.TimeLimit.max_episode_steps")
+        horizon = horizon if horizon else env.spec.max_episode_steps
    except Exception:
        print("Warning, no horizon specified, assuming infinite")
    if not horizon: