diff --git a/docker/examples/Dockerfile b/docker/examples/Dockerfile index ae69f1d89..c9a964d8a 100644 --- a/docker/examples/Dockerfile +++ b/docker/examples/Dockerfile @@ -3,5 +3,5 @@ FROM ray-project/deploy RUN conda install -y -c conda-forge tensorflow RUN apt-get install -y zlib1g-dev -RUN pip install gym[atari]==0.9.5 opencv-python==3.2.0.8 smart_open +RUN pip install gym[atari] opencv-python==3.2.0.8 smart_open # RUN conda install -y -q pytorch torchvision -c soumith diff --git a/examples/carla/env.py b/examples/carla/env.py index 94cacee75..c88a71b28 100644 --- a/examples/carla/env.py +++ b/examples/carla/env.py @@ -143,21 +143,21 @@ class CarlaEnv(gym.Env): if config["discrete_actions"]: self.action_space = Discrete(len(DISCRETE_ACTIONS)) else: - self.action_space = Box(-1.0, 1.0, shape=(2,)) + self.action_space = Box(-1.0, 1.0, shape=(2,), dtype=np.float32) if config["use_depth_camera"]: image_space = Box( -1.0, 1.0, shape=( config["y_res"], config["x_res"], - 1 * config["framestack"])) + 1 * config["framestack"]), dtype=np.float32) else: image_space = Box( - 0.0, 255.0, shape=( + 0, 255, shape=( config["y_res"], config["x_res"], - 3 * config["framestack"])) - self.observation_space = Tuple( + 3 * config["framestack"]), dtype=np.uint8) + self.observation_space = Tuple( # forward_speed, dist to goal [image_space, Discrete(len(COMMANDS_ENUM)), # next_command - Box(-128.0, 128.0, shape=(2,))]) # forward_speed, dist to goal + Box(-128.0, 128.0, shape=(2,), dtype=np.float32)]) # TODO(ekl) this isn't really a proper gym spec self._spec = lambda: None diff --git a/examples/custom_env/custom_env.py b/examples/custom_env/custom_env.py index e20e88671..87aa757db 100644 --- a/examples/custom_env/custom_env.py +++ b/examples/custom_env/custom_env.py @@ -4,6 +4,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import numpy as np import gym from gym.spaces import Discrete, Box from gym.envs.registration import EnvSpec @@ -22,7 +23,8 @@ class SimpleCorridor(gym.Env): self.end_pos = config["corridor_length"] self.cur_pos = 0 self.action_space = Discrete(2) - self.observation_space = Box(0.0, self.end_pos, shape=(1,)) + self.observation_space = Box( + 0.0, self.end_pos, shape=(1,), dtype=np.float32) self._spec = EnvSpec("SimpleCorridor-{}-v0".format(self.end_pos)) def _reset(self): diff --git a/python/ray/rllib/dqn/common/wrappers.py b/python/ray/rllib/dqn/common/wrappers.py index e5bed0241..9ac859952 100644 --- a/python/ray/rllib/dqn/common/wrappers.py +++ b/python/ray/rllib/dqn/common/wrappers.py @@ -22,7 +22,7 @@ class NoopResetEnv(gym.Wrapper): self.override_num_noops = None assert env.unwrapped.get_action_meanings()[0] == 'NOOP' - def _reset(self): + def reset(self): """ Do no-op action for a number of steps in [1, noop_max].""" self.env.reset() if self.override_num_noops is not None: @@ -46,7 +46,7 @@ class FireResetEnv(gym.Wrapper): assert env.unwrapped.get_action_meanings()[1] == 'FIRE' assert len(env.unwrapped.get_action_meanings()) >= 3 - def _reset(self): + def reset(self): self.env.reset() obs, _, done, _ = self.env.step(1) if done: @@ -68,7 +68,7 @@ class EpisodicLifeEnv(gym.Wrapper): self.was_real_done = True self.was_real_reset = False - def _step(self, action): + def step(self, action): obs, reward, done, info = self.env.step(action) self.was_real_done = done # check current lives, make loss of life terminal, @@ -82,7 +82,7 @@ class EpisodicLifeEnv(gym.Wrapper): self.lives = lives return obs, reward, done, info - def _reset(self): + def reset(self): """Reset only when lives are exhausted. This way all states are still reachable even though lives are episodic, and the learner need not know about any of this behind-the-scenes. @@ -106,7 +106,7 @@ class MaxAndSkipEnv(gym.Wrapper): self._obs_buffer = deque(maxlen=2) self._skip = skip - def _step(self, action): + def step(self, action): total_reward = 0.0 done = None for _ in range(self._skip): @@ -120,7 +120,7 @@ class MaxAndSkipEnv(gym.Wrapper): return max_frame, total_reward, done, info - def _reset(self): + def reset(self): """Clear past frame buffer and init. to first obs. from inner env.""" self._obs_buffer.clear() obs = self.env.reset() @@ -132,9 +132,10 @@ class MaxAndSkipEnv(gym.Wrapper): class ProcessFrame80(gym.ObservationWrapper): def __init__(self, env=None): super(ProcessFrame80, self).__init__(env) - self.observation_space = spaces.Box(low=0, high=255, shape=(80, 80, 1)) + self.observation_space = spaces.Box( + low=0, high=255, shape=(80, 80, 1), dtype=np.uint8) - def _observation(self, obs): + def observation(self, obs): return ProcessFrame80.process(obs) @staticmethod @@ -155,7 +156,7 @@ class ProcessFrame80(gym.ObservationWrapper): class ClippedRewardsWrapper(gym.RewardWrapper): - def _reward(self, reward): + def reward(self, reward): """Change all the positive rewards to 1, negative to -1 and keep zero.""" return np.sign(reward) @@ -195,15 +196,16 @@ class FrameStack(gym.Wrapper): self.frames = deque([], maxlen=k) shp = env.observation_space.shape self.observation_space = spaces.Box( - low=0, high=255, shape=(shp[0], shp[1], shp[2] * k)) + low=0, high=255, shape=(shp[0], shp[1], shp[2] * k), + dtype=np.uint8) - def _reset(self): + def reset(self): ob = self.env.reset() for _ in range(self.k): self.frames.append(ob) return self._get_ob() - def _step(self, action): + def step(self, action): ob, reward, done, info = self.env.step(action) self.frames.append(ob) return self._get_ob(), reward, done, info diff --git a/python/ray/rllib/es/policies.py b/python/ray/rllib/es/policies.py index 57c74befc..36a404c48 100644 --- a/python/ray/rllib/es/policies.py +++ b/python/ray/rllib/es/policies.py @@ -20,8 +20,7 @@ def rollout(policy, env, timestep_limit=None, add_noise=False): If add_noise is True, the rollout will take noisy actions with noise drawn from that stream. Otherwise, no action noise will be added. """ - env_timestep_limit = env.spec.tags.get("wrapper_config.TimeLimit" - ".max_episode_steps") + env_timestep_limit = env.spec.max_episode_steps timestep_limit = (env_timestep_limit if timestep_limit is None else min(timestep_limit, env_timestep_limit)) rews = [] diff --git a/python/ray/rllib/examples/multiagent_mountaincar_env.py b/python/ray/rllib/examples/multiagent_mountaincar_env.py index d50302eea..d454937ac 100644 --- a/python/ray/rllib/examples/multiagent_mountaincar_env.py +++ b/python/ray/rllib/examples/multiagent_mountaincar_env.py @@ -1,4 +1,4 @@ -import math +from math import cos from gym.spaces import Box, Tuple, Discrete import numpy as np from gym.envs.classic_control.mountain_car import MountainCarEnv @@ -23,17 +23,17 @@ class MultiAgentMountainCarEnv(MountainCarEnv): self.action_space = [Discrete(3) for _ in range(2)] self.observation_space = Tuple([ - Box(self.low, self.high) for _ in range(2)]) + Box(self.low, self.high, dtype=np.float32) for _ in range(2)]) - self._seed() + self.seed() self.reset() - def _step(self, action): + def step(self, action): summed_act = 0.5 * np.sum(action) position, velocity = self.state velocity += (summed_act - 1) * 0.001 - velocity += math.cos(3 * position) * (-0.0025) + velocity += cos(3 * position) * (-0.0025) velocity = np.clip(velocity, -self.max_speed, self.max_speed) position += velocity position = np.clip(position, self.min_position, self.max_position) @@ -47,6 +47,6 @@ class MultiAgentMountainCarEnv(MountainCarEnv): self.state = (position, velocity) return [np.array(self.state) for _ in range(2)], reward, done, {} - def _reset(self): + def reset(self): self.state = np.array([self.np_random.uniform(low=-0.6, high=-0.4), 0]) return [np.array(self.state) for _ in range(2)] diff --git a/python/ray/rllib/examples/multiagent_pendulum_env.py b/python/ray/rllib/examples/multiagent_pendulum_env.py index b2095e625..44c86f4e6 100644 --- a/python/ray/rllib/examples/multiagent_pendulum_env.py +++ b/python/ray/rllib/examples/multiagent_pendulum_env.py @@ -22,18 +22,20 @@ class MultiAgentPendulumEnv(PendulumEnv): high = np.array([1., 1., self.max_speed]) self.action_space = [Box(low=-self.max_torque / 2, - high=self.max_torque / 2, shape=(1,)) + high=self.max_torque / 2, + shape=(1,), + dtype=np.float32) for _ in range(2)] self.observation_space = Tuple([ - Box(low=-high, high=high) for _ in range(2)]) + Box(low=-high, high=high, dtype=np.float32) for _ in range(2)]) - self._seed() + self.seed() - def _seed(self, seed=None): + def seed(self, seed=None): self.np_random, seed = seeding.np_random(seed) return [seed] - def _step(self, u): + def step(self, u): th, thdot = self.state # th := theta summed_u = np.sum(u) @@ -55,7 +57,7 @@ class MultiAgentPendulumEnv(PendulumEnv): self.state = np.array([newth, newthdot]) return self._get_obs(), -costs, False, {} - def _reset(self): + def reset(self): high = np.array([np.pi, 1]) self.state = self.np_random.uniform(low=-high, high=high) self.last_u = None diff --git a/python/ray/rllib/models/catalog.py b/python/ray/rllib/models/catalog.py index 4844bbb98..8a423d309 100644 --- a/python/ray/rllib/models/catalog.py +++ b/python/ray/rllib/models/catalog.py @@ -266,7 +266,8 @@ class _RLlibPreprocessorWrapper(gym.ObservationWrapper): self.preprocessor = preprocessor from gym.spaces.box import Box - self.observation_space = Box(-1.0, 1.0, preprocessor.shape) + self.observation_space = Box( + -1.0, 1.0, preprocessor.shape, dtype=np.float32) - def _observation(self, observation): + def observation(self, observation): return self.preprocessor.transform(observation) diff --git a/python/ray/rllib/test/test_catalog.py b/python/ray/rllib/test/test_catalog.py index 5f3ac01f8..c5e503b71 100644 --- a/python/ray/rllib/test/test_catalog.py +++ b/python/ray/rllib/test/test_catalog.py @@ -47,7 +47,7 @@ class ModelCatalogTest(unittest.TestCase): class TupleEnv(object): def __init__(self): self.observation_space = Tuple( - [Discrete(5), Box(0, 1, shape=(3,))]) + [Discrete(5), Box(0, 1, shape=(3,), dtype=np.float32)]) p1 = ModelCatalog.get_preprocessor( get_registry(), TupleEnv()) self.assertEqual(p1.shape, (8,)) diff --git a/python/ray/rllib/test/test_supported_spaces.py b/python/ray/rllib/test/test_supported_spaces.py index 109b585f8..bb8c5ecdc 100644 --- a/python/ray/rllib/test/test_supported_spaces.py +++ b/python/ray/rllib/test/test_supported_spaces.py @@ -4,6 +4,7 @@ import traceback import gym from gym.spaces import Box, Discrete, Tuple from gym.envs.registration import EnvSpec +import numpy as np import ray from ray.rllib.agent import get_agent_class @@ -12,19 +13,27 @@ from ray.tune.registry import register_env ACTION_SPACES_TO_TEST = { "discrete": Discrete(5), - "vector": Box(0.0, 1.0, (5,)), - "simple_tuple": Tuple([Box(0.0, 1.0, (5,)), Box(0.0, 1.0, (5,))]), - "implicit_tuple": [Box(0.0, 1.0, (5,)), Box(0.0, 1.0, (5,))], + "vector": Box(0.0, 1.0, (5,), dtype=np.float32), + "simple_tuple": Tuple([ + Box(0.0, 1.0, (5,), dtype=np.float32), + Box(0.0, 1.0, (5,), dtype=np.float32)]), + "implicit_tuple": [ + Box(0.0, 1.0, (5,), dtype=np.float32), + Box(0.0, 1.0, (5,), dtype=np.float32)], } OBSERVATION_SPACES_TO_TEST = { "discrete": Discrete(5), - "vector": Box(0.0, 1.0, (5,)), - "image": Box(0.0, 1.0, (80, 80, 1)), - "atari": Box(0.0, 1.0, (210, 160, 3)), - "atari_ram": Box(0.0, 1.0, (128,)), - "simple_tuple": Tuple([Box(0.0, 1.0, (5,)), Box(0.0, 1.0, (5,))]), - "mixed_tuple": Tuple([Discrete(10), Box(0.0, 1.0, (5,))]), + "vector": Box(0.0, 1.0, (5,), dtype=np.float32), + "image": Box(0.0, 1.0, (80, 80, 1), dtype=np.float32), + "atari": Box(0.0, 1.0, (210, 160, 3), dtype=np.float32), + "atari_ram": Box(0.0, 1.0, (128,), dtype=np.float32), + "simple_tuple": Tuple([ + Box(0.0, 1.0, (5,), dtype=np.float32), + Box(0.0, 1.0, (5,), dtype=np.float32)]), + "mixed_tuple": Tuple([ + Discrete(10), + Box(0.0, 1.0, (5,), dtype=np.float32)]), } # (alg, action_space, obs_space) @@ -59,7 +68,7 @@ def make_stub_env(action_space, obs_space): def __init__(self): self.action_space = action_space self.observation_space = obs_space - self._spec = EnvSpec("StubEnv-v0") + self.spec = EnvSpec("StubEnv-v0") def reset(self): sample = self.observation_space.sample() diff --git a/python/ray/rllib/utils/sampler.py b/python/ray/rllib/utils/sampler.py index f62978a95..86be66106 100644 --- a/python/ray/rllib/utils/sampler.py +++ b/python/ray/rllib/utils/sampler.py @@ -196,8 +196,7 @@ def _env_runner(env, policy, num_local_steps, horizon, obs_filter): """ last_observation = obs_filter(env.reset()) try: - horizon = horizon if horizon else env.spec.tags.get( - "wrapper_config.TimeLimit.max_episode_steps") + horizon = horizon if horizon else env.spec.max_episode_steps except Exception: print("Warning, no horizon specified, assuming infinite") if not horizon: