[rllib] Upgrade to OpenAI Gym 0.10.3 (#1601)

This commit is contained in:
butchcom
2018-03-06 08:31:02 +00:00
committed by Richard Liaw
parent 162d063f0d
commit 936bebef99
11 changed files with 63 additions and 49 deletions
+14 -12
View File
@@ -22,7 +22,7 @@ class NoopResetEnv(gym.Wrapper):
self.override_num_noops = None
assert env.unwrapped.get_action_meanings()[0] == 'NOOP'
def _reset(self):
def reset(self):
""" Do no-op action for a number of steps in [1, noop_max]."""
self.env.reset()
if self.override_num_noops is not None:
@@ -46,7 +46,7 @@ class FireResetEnv(gym.Wrapper):
assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
assert len(env.unwrapped.get_action_meanings()) >= 3
def _reset(self):
def reset(self):
self.env.reset()
obs, _, done, _ = self.env.step(1)
if done:
@@ -68,7 +68,7 @@ class EpisodicLifeEnv(gym.Wrapper):
self.was_real_done = True
self.was_real_reset = False
def _step(self, action):
def step(self, action):
obs, reward, done, info = self.env.step(action)
self.was_real_done = done
# check current lives, make loss of life terminal,
@@ -82,7 +82,7 @@ class EpisodicLifeEnv(gym.Wrapper):
self.lives = lives
return obs, reward, done, info
def _reset(self):
def reset(self):
"""Reset only when lives are exhausted.
This way all states are still reachable even though lives are episodic,
and the learner need not know about any of this behind-the-scenes.
@@ -106,7 +106,7 @@ class MaxAndSkipEnv(gym.Wrapper):
self._obs_buffer = deque(maxlen=2)
self._skip = skip
def _step(self, action):
def step(self, action):
total_reward = 0.0
done = None
for _ in range(self._skip):
@@ -120,7 +120,7 @@ class MaxAndSkipEnv(gym.Wrapper):
return max_frame, total_reward, done, info
def _reset(self):
def reset(self):
"""Clear past frame buffer and init. to first obs. from inner env."""
self._obs_buffer.clear()
obs = self.env.reset()
@@ -132,9 +132,10 @@ class MaxAndSkipEnv(gym.Wrapper):
class ProcessFrame80(gym.ObservationWrapper):
def __init__(self, env=None):
super(ProcessFrame80, self).__init__(env)
self.observation_space = spaces.Box(low=0, high=255, shape=(80, 80, 1))
self.observation_space = spaces.Box(
low=0, high=255, shape=(80, 80, 1), dtype=np.uint8)
def _observation(self, obs):
def observation(self, obs):
return ProcessFrame80.process(obs)
@staticmethod
@@ -155,7 +156,7 @@ class ProcessFrame80(gym.ObservationWrapper):
class ClippedRewardsWrapper(gym.RewardWrapper):
def _reward(self, reward):
def reward(self, reward):
"""Change all the positive rewards to 1, negative to -1 and keep
zero."""
return np.sign(reward)
@@ -195,15 +196,16 @@ class FrameStack(gym.Wrapper):
self.frames = deque([], maxlen=k)
shp = env.observation_space.shape
self.observation_space = spaces.Box(
low=0, high=255, shape=(shp[0], shp[1], shp[2] * k))
low=0, high=255, shape=(shp[0], shp[1], shp[2] * k),
dtype=np.uint8)
def _reset(self):
def reset(self):
ob = self.env.reset()
for _ in range(self.k):
self.frames.append(ob)
return self._get_ob()
def _step(self, action):
def step(self, action):
ob, reward, done, info = self.env.step(action)
self.frames.append(ob)
return self._get_ob(), reward, done, info
+1 -2
View File
@@ -20,8 +20,7 @@ def rollout(policy, env, timestep_limit=None, add_noise=False):
If add_noise is True, the rollout will take noisy actions with
noise drawn from that stream. Otherwise, no action noise will be added.
"""
env_timestep_limit = env.spec.tags.get("wrapper_config.TimeLimit"
".max_episode_steps")
env_timestep_limit = env.spec.max_episode_steps
timestep_limit = (env_timestep_limit if timestep_limit is None
else min(timestep_limit, env_timestep_limit))
rews = []
@@ -1,4 +1,4 @@
import math
from math import cos
from gym.spaces import Box, Tuple, Discrete
import numpy as np
from gym.envs.classic_control.mountain_car import MountainCarEnv
@@ -23,17 +23,17 @@ class MultiAgentMountainCarEnv(MountainCarEnv):
self.action_space = [Discrete(3) for _ in range(2)]
self.observation_space = Tuple([
Box(self.low, self.high) for _ in range(2)])
Box(self.low, self.high, dtype=np.float32) for _ in range(2)])
self._seed()
self.seed()
self.reset()
def _step(self, action):
def step(self, action):
summed_act = 0.5 * np.sum(action)
position, velocity = self.state
velocity += (summed_act - 1) * 0.001
velocity += math.cos(3 * position) * (-0.0025)
velocity += cos(3 * position) * (-0.0025)
velocity = np.clip(velocity, -self.max_speed, self.max_speed)
position += velocity
position = np.clip(position, self.min_position, self.max_position)
@@ -47,6 +47,6 @@ class MultiAgentMountainCarEnv(MountainCarEnv):
self.state = (position, velocity)
return [np.array(self.state) for _ in range(2)], reward, done, {}
def _reset(self):
def reset(self):
self.state = np.array([self.np_random.uniform(low=-0.6, high=-0.4), 0])
return [np.array(self.state) for _ in range(2)]
@@ -22,18 +22,20 @@ class MultiAgentPendulumEnv(PendulumEnv):
high = np.array([1., 1., self.max_speed])
self.action_space = [Box(low=-self.max_torque / 2,
high=self.max_torque / 2, shape=(1,))
high=self.max_torque / 2,
shape=(1,),
dtype=np.float32)
for _ in range(2)]
self.observation_space = Tuple([
Box(low=-high, high=high) for _ in range(2)])
Box(low=-high, high=high, dtype=np.float32) for _ in range(2)])
self._seed()
self.seed()
def _seed(self, seed=None):
def seed(self, seed=None):
self.np_random, seed = seeding.np_random(seed)
return [seed]
def _step(self, u):
def step(self, u):
th, thdot = self.state # th := theta
summed_u = np.sum(u)
@@ -55,7 +57,7 @@ class MultiAgentPendulumEnv(PendulumEnv):
self.state = np.array([newth, newthdot])
return self._get_obs(), -costs, False, {}
def _reset(self):
def reset(self):
high = np.array([np.pi, 1])
self.state = self.np_random.uniform(low=-high, high=high)
self.last_u = None
+3 -2
View File
@@ -266,7 +266,8 @@ class _RLlibPreprocessorWrapper(gym.ObservationWrapper):
self.preprocessor = preprocessor
from gym.spaces.box import Box
self.observation_space = Box(-1.0, 1.0, preprocessor.shape)
self.observation_space = Box(
-1.0, 1.0, preprocessor.shape, dtype=np.float32)
def _observation(self, observation):
def observation(self, observation):
return self.preprocessor.transform(observation)
+1 -1
View File
@@ -47,7 +47,7 @@ class ModelCatalogTest(unittest.TestCase):
class TupleEnv(object):
def __init__(self):
self.observation_space = Tuple(
[Discrete(5), Box(0, 1, shape=(3,))])
[Discrete(5), Box(0, 1, shape=(3,), dtype=np.float32)])
p1 = ModelCatalog.get_preprocessor(
get_registry(), TupleEnv())
self.assertEqual(p1.shape, (8,))
+19 -10
View File
@@ -4,6 +4,7 @@ import traceback
import gym
from gym.spaces import Box, Discrete, Tuple
from gym.envs.registration import EnvSpec
import numpy as np
import ray
from ray.rllib.agent import get_agent_class
@@ -12,19 +13,27 @@ from ray.tune.registry import register_env
ACTION_SPACES_TO_TEST = {
"discrete": Discrete(5),
"vector": Box(0.0, 1.0, (5,)),
"simple_tuple": Tuple([Box(0.0, 1.0, (5,)), Box(0.0, 1.0, (5,))]),
"implicit_tuple": [Box(0.0, 1.0, (5,)), Box(0.0, 1.0, (5,))],
"vector": Box(0.0, 1.0, (5,), dtype=np.float32),
"simple_tuple": Tuple([
Box(0.0, 1.0, (5,), dtype=np.float32),
Box(0.0, 1.0, (5,), dtype=np.float32)]),
"implicit_tuple": [
Box(0.0, 1.0, (5,), dtype=np.float32),
Box(0.0, 1.0, (5,), dtype=np.float32)],
}
OBSERVATION_SPACES_TO_TEST = {
"discrete": Discrete(5),
"vector": Box(0.0, 1.0, (5,)),
"image": Box(0.0, 1.0, (80, 80, 1)),
"atari": Box(0.0, 1.0, (210, 160, 3)),
"atari_ram": Box(0.0, 1.0, (128,)),
"simple_tuple": Tuple([Box(0.0, 1.0, (5,)), Box(0.0, 1.0, (5,))]),
"mixed_tuple": Tuple([Discrete(10), Box(0.0, 1.0, (5,))]),
"vector": Box(0.0, 1.0, (5,), dtype=np.float32),
"image": Box(0.0, 1.0, (80, 80, 1), dtype=np.float32),
"atari": Box(0.0, 1.0, (210, 160, 3), dtype=np.float32),
"atari_ram": Box(0.0, 1.0, (128,), dtype=np.float32),
"simple_tuple": Tuple([
Box(0.0, 1.0, (5,), dtype=np.float32),
Box(0.0, 1.0, (5,), dtype=np.float32)]),
"mixed_tuple": Tuple([
Discrete(10),
Box(0.0, 1.0, (5,), dtype=np.float32)]),
}
# (alg, action_space, obs_space)
@@ -59,7 +68,7 @@ def make_stub_env(action_space, obs_space):
def __init__(self):
self.action_space = action_space
self.observation_space = obs_space
self._spec = EnvSpec("StubEnv-v0")
self.spec = EnvSpec("StubEnv-v0")
def reset(self):
sample = self.observation_space.sample()
+1 -2
View File
@@ -196,8 +196,7 @@ def _env_runner(env, policy, num_local_steps, horizon, obs_filter):
"""
last_observation = obs_filter(env.reset())
try:
horizon = horizon if horizon else env.spec.tags.get(
"wrapper_config.TimeLimit.max_episode_steps")
horizon = horizon if horizon else env.spec.max_episode_steps
except Exception:
print("Warning, no horizon specified, assuming infinite")
if not horizon: