mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 02:01:24 +08:00
[rllib] Upgrade to OpenAI Gym 0.10.3 (#1601)
This commit is contained in:
@@ -22,7 +22,7 @@ class NoopResetEnv(gym.Wrapper):
|
||||
self.override_num_noops = None
|
||||
assert env.unwrapped.get_action_meanings()[0] == 'NOOP'
|
||||
|
||||
def _reset(self):
|
||||
def reset(self):
|
||||
""" Do no-op action for a number of steps in [1, noop_max]."""
|
||||
self.env.reset()
|
||||
if self.override_num_noops is not None:
|
||||
@@ -46,7 +46,7 @@ class FireResetEnv(gym.Wrapper):
|
||||
assert env.unwrapped.get_action_meanings()[1] == 'FIRE'
|
||||
assert len(env.unwrapped.get_action_meanings()) >= 3
|
||||
|
||||
def _reset(self):
|
||||
def reset(self):
|
||||
self.env.reset()
|
||||
obs, _, done, _ = self.env.step(1)
|
||||
if done:
|
||||
@@ -68,7 +68,7 @@ class EpisodicLifeEnv(gym.Wrapper):
|
||||
self.was_real_done = True
|
||||
self.was_real_reset = False
|
||||
|
||||
def _step(self, action):
|
||||
def step(self, action):
|
||||
obs, reward, done, info = self.env.step(action)
|
||||
self.was_real_done = done
|
||||
# check current lives, make loss of life terminal,
|
||||
@@ -82,7 +82,7 @@ class EpisodicLifeEnv(gym.Wrapper):
|
||||
self.lives = lives
|
||||
return obs, reward, done, info
|
||||
|
||||
def _reset(self):
|
||||
def reset(self):
|
||||
"""Reset only when lives are exhausted.
|
||||
This way all states are still reachable even though lives are episodic,
|
||||
and the learner need not know about any of this behind-the-scenes.
|
||||
@@ -106,7 +106,7 @@ class MaxAndSkipEnv(gym.Wrapper):
|
||||
self._obs_buffer = deque(maxlen=2)
|
||||
self._skip = skip
|
||||
|
||||
def _step(self, action):
|
||||
def step(self, action):
|
||||
total_reward = 0.0
|
||||
done = None
|
||||
for _ in range(self._skip):
|
||||
@@ -120,7 +120,7 @@ class MaxAndSkipEnv(gym.Wrapper):
|
||||
|
||||
return max_frame, total_reward, done, info
|
||||
|
||||
def _reset(self):
|
||||
def reset(self):
|
||||
"""Clear past frame buffer and init. to first obs. from inner env."""
|
||||
self._obs_buffer.clear()
|
||||
obs = self.env.reset()
|
||||
@@ -132,9 +132,10 @@ class MaxAndSkipEnv(gym.Wrapper):
|
||||
class ProcessFrame80(gym.ObservationWrapper):
|
||||
def __init__(self, env=None):
|
||||
super(ProcessFrame80, self).__init__(env)
|
||||
self.observation_space = spaces.Box(low=0, high=255, shape=(80, 80, 1))
|
||||
self.observation_space = spaces.Box(
|
||||
low=0, high=255, shape=(80, 80, 1), dtype=np.uint8)
|
||||
|
||||
def _observation(self, obs):
|
||||
def observation(self, obs):
|
||||
return ProcessFrame80.process(obs)
|
||||
|
||||
@staticmethod
|
||||
@@ -155,7 +156,7 @@ class ProcessFrame80(gym.ObservationWrapper):
|
||||
|
||||
|
||||
class ClippedRewardsWrapper(gym.RewardWrapper):
|
||||
def _reward(self, reward):
|
||||
def reward(self, reward):
|
||||
"""Change all the positive rewards to 1, negative to -1 and keep
|
||||
zero."""
|
||||
return np.sign(reward)
|
||||
@@ -195,15 +196,16 @@ class FrameStack(gym.Wrapper):
|
||||
self.frames = deque([], maxlen=k)
|
||||
shp = env.observation_space.shape
|
||||
self.observation_space = spaces.Box(
|
||||
low=0, high=255, shape=(shp[0], shp[1], shp[2] * k))
|
||||
low=0, high=255, shape=(shp[0], shp[1], shp[2] * k),
|
||||
dtype=np.uint8)
|
||||
|
||||
def _reset(self):
|
||||
def reset(self):
|
||||
ob = self.env.reset()
|
||||
for _ in range(self.k):
|
||||
self.frames.append(ob)
|
||||
return self._get_ob()
|
||||
|
||||
def _step(self, action):
|
||||
def step(self, action):
|
||||
ob, reward, done, info = self.env.step(action)
|
||||
self.frames.append(ob)
|
||||
return self._get_ob(), reward, done, info
|
||||
|
||||
@@ -20,8 +20,7 @@ def rollout(policy, env, timestep_limit=None, add_noise=False):
|
||||
If add_noise is True, the rollout will take noisy actions with
|
||||
noise drawn from that stream. Otherwise, no action noise will be added.
|
||||
"""
|
||||
env_timestep_limit = env.spec.tags.get("wrapper_config.TimeLimit"
|
||||
".max_episode_steps")
|
||||
env_timestep_limit = env.spec.max_episode_steps
|
||||
timestep_limit = (env_timestep_limit if timestep_limit is None
|
||||
else min(timestep_limit, env_timestep_limit))
|
||||
rews = []
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import math
|
||||
from math import cos
|
||||
from gym.spaces import Box, Tuple, Discrete
|
||||
import numpy as np
|
||||
from gym.envs.classic_control.mountain_car import MountainCarEnv
|
||||
@@ -23,17 +23,17 @@ class MultiAgentMountainCarEnv(MountainCarEnv):
|
||||
|
||||
self.action_space = [Discrete(3) for _ in range(2)]
|
||||
self.observation_space = Tuple([
|
||||
Box(self.low, self.high) for _ in range(2)])
|
||||
Box(self.low, self.high, dtype=np.float32) for _ in range(2)])
|
||||
|
||||
self._seed()
|
||||
self.seed()
|
||||
self.reset()
|
||||
|
||||
def _step(self, action):
|
||||
def step(self, action):
|
||||
summed_act = 0.5 * np.sum(action)
|
||||
|
||||
position, velocity = self.state
|
||||
velocity += (summed_act - 1) * 0.001
|
||||
velocity += math.cos(3 * position) * (-0.0025)
|
||||
velocity += cos(3 * position) * (-0.0025)
|
||||
velocity = np.clip(velocity, -self.max_speed, self.max_speed)
|
||||
position += velocity
|
||||
position = np.clip(position, self.min_position, self.max_position)
|
||||
@@ -47,6 +47,6 @@ class MultiAgentMountainCarEnv(MountainCarEnv):
|
||||
self.state = (position, velocity)
|
||||
return [np.array(self.state) for _ in range(2)], reward, done, {}
|
||||
|
||||
def _reset(self):
|
||||
def reset(self):
|
||||
self.state = np.array([self.np_random.uniform(low=-0.6, high=-0.4), 0])
|
||||
return [np.array(self.state) for _ in range(2)]
|
||||
|
||||
@@ -22,18 +22,20 @@ class MultiAgentPendulumEnv(PendulumEnv):
|
||||
|
||||
high = np.array([1., 1., self.max_speed])
|
||||
self.action_space = [Box(low=-self.max_torque / 2,
|
||||
high=self.max_torque / 2, shape=(1,))
|
||||
high=self.max_torque / 2,
|
||||
shape=(1,),
|
||||
dtype=np.float32)
|
||||
for _ in range(2)]
|
||||
self.observation_space = Tuple([
|
||||
Box(low=-high, high=high) for _ in range(2)])
|
||||
Box(low=-high, high=high, dtype=np.float32) for _ in range(2)])
|
||||
|
||||
self._seed()
|
||||
self.seed()
|
||||
|
||||
def _seed(self, seed=None):
|
||||
def seed(self, seed=None):
|
||||
self.np_random, seed = seeding.np_random(seed)
|
||||
return [seed]
|
||||
|
||||
def _step(self, u):
|
||||
def step(self, u):
|
||||
th, thdot = self.state # th := theta
|
||||
|
||||
summed_u = np.sum(u)
|
||||
@@ -55,7 +57,7 @@ class MultiAgentPendulumEnv(PendulumEnv):
|
||||
self.state = np.array([newth, newthdot])
|
||||
return self._get_obs(), -costs, False, {}
|
||||
|
||||
def _reset(self):
|
||||
def reset(self):
|
||||
high = np.array([np.pi, 1])
|
||||
self.state = self.np_random.uniform(low=-high, high=high)
|
||||
self.last_u = None
|
||||
|
||||
@@ -266,7 +266,8 @@ class _RLlibPreprocessorWrapper(gym.ObservationWrapper):
|
||||
self.preprocessor = preprocessor
|
||||
|
||||
from gym.spaces.box import Box
|
||||
self.observation_space = Box(-1.0, 1.0, preprocessor.shape)
|
||||
self.observation_space = Box(
|
||||
-1.0, 1.0, preprocessor.shape, dtype=np.float32)
|
||||
|
||||
def _observation(self, observation):
|
||||
def observation(self, observation):
|
||||
return self.preprocessor.transform(observation)
|
||||
|
||||
@@ -47,7 +47,7 @@ class ModelCatalogTest(unittest.TestCase):
|
||||
class TupleEnv(object):
|
||||
def __init__(self):
|
||||
self.observation_space = Tuple(
|
||||
[Discrete(5), Box(0, 1, shape=(3,))])
|
||||
[Discrete(5), Box(0, 1, shape=(3,), dtype=np.float32)])
|
||||
p1 = ModelCatalog.get_preprocessor(
|
||||
get_registry(), TupleEnv())
|
||||
self.assertEqual(p1.shape, (8,))
|
||||
|
||||
@@ -4,6 +4,7 @@ import traceback
|
||||
import gym
|
||||
from gym.spaces import Box, Discrete, Tuple
|
||||
from gym.envs.registration import EnvSpec
|
||||
import numpy as np
|
||||
|
||||
import ray
|
||||
from ray.rllib.agent import get_agent_class
|
||||
@@ -12,19 +13,27 @@ from ray.tune.registry import register_env
|
||||
|
||||
ACTION_SPACES_TO_TEST = {
|
||||
"discrete": Discrete(5),
|
||||
"vector": Box(0.0, 1.0, (5,)),
|
||||
"simple_tuple": Tuple([Box(0.0, 1.0, (5,)), Box(0.0, 1.0, (5,))]),
|
||||
"implicit_tuple": [Box(0.0, 1.0, (5,)), Box(0.0, 1.0, (5,))],
|
||||
"vector": Box(0.0, 1.0, (5,), dtype=np.float32),
|
||||
"simple_tuple": Tuple([
|
||||
Box(0.0, 1.0, (5,), dtype=np.float32),
|
||||
Box(0.0, 1.0, (5,), dtype=np.float32)]),
|
||||
"implicit_tuple": [
|
||||
Box(0.0, 1.0, (5,), dtype=np.float32),
|
||||
Box(0.0, 1.0, (5,), dtype=np.float32)],
|
||||
}
|
||||
|
||||
OBSERVATION_SPACES_TO_TEST = {
|
||||
"discrete": Discrete(5),
|
||||
"vector": Box(0.0, 1.0, (5,)),
|
||||
"image": Box(0.0, 1.0, (80, 80, 1)),
|
||||
"atari": Box(0.0, 1.0, (210, 160, 3)),
|
||||
"atari_ram": Box(0.0, 1.0, (128,)),
|
||||
"simple_tuple": Tuple([Box(0.0, 1.0, (5,)), Box(0.0, 1.0, (5,))]),
|
||||
"mixed_tuple": Tuple([Discrete(10), Box(0.0, 1.0, (5,))]),
|
||||
"vector": Box(0.0, 1.0, (5,), dtype=np.float32),
|
||||
"image": Box(0.0, 1.0, (80, 80, 1), dtype=np.float32),
|
||||
"atari": Box(0.0, 1.0, (210, 160, 3), dtype=np.float32),
|
||||
"atari_ram": Box(0.0, 1.0, (128,), dtype=np.float32),
|
||||
"simple_tuple": Tuple([
|
||||
Box(0.0, 1.0, (5,), dtype=np.float32),
|
||||
Box(0.0, 1.0, (5,), dtype=np.float32)]),
|
||||
"mixed_tuple": Tuple([
|
||||
Discrete(10),
|
||||
Box(0.0, 1.0, (5,), dtype=np.float32)]),
|
||||
}
|
||||
|
||||
# (alg, action_space, obs_space)
|
||||
@@ -59,7 +68,7 @@ def make_stub_env(action_space, obs_space):
|
||||
def __init__(self):
|
||||
self.action_space = action_space
|
||||
self.observation_space = obs_space
|
||||
self._spec = EnvSpec("StubEnv-v0")
|
||||
self.spec = EnvSpec("StubEnv-v0")
|
||||
|
||||
def reset(self):
|
||||
sample = self.observation_space.sample()
|
||||
|
||||
@@ -196,8 +196,7 @@ def _env_runner(env, policy, num_local_steps, horizon, obs_filter):
|
||||
"""
|
||||
last_observation = obs_filter(env.reset())
|
||||
try:
|
||||
horizon = horizon if horizon else env.spec.tags.get(
|
||||
"wrapper_config.TimeLimit.max_episode_steps")
|
||||
horizon = horizon if horizon else env.spec.max_episode_steps
|
||||
except Exception:
|
||||
print("Warning, no horizon specified, assuming infinite")
|
||||
if not horizon:
|
||||
|
||||
Reference in New Issue
Block a user