diff --git a/python/requirements_rllib.txt b/python/requirements_rllib.txt index 0b7543587..94ae9cdbb 100644 --- a/python/requirements_rllib.txt +++ b/python/requirements_rllib.txt @@ -10,3 +10,6 @@ smart_open pybullet # For tests on PettingZoo's multi-agent envs. pettingzoo>=1.4.0 +# For tests on RecSim and Kaggle envs. +recsim +kaggle_environments diff --git a/rllib/BUILD b/rllib/BUILD index 6b6717389..daa623dff 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -1067,6 +1067,20 @@ py_test( srcs = ["env/wrappers/tests/test_recsim_wrapper.py"] ) +py_test( + name = "env/wrappers/tests/test_exception_wrapper", + tags = ["env"], + size = "small", + srcs = ["env/wrappers/tests/test_exception_wrapper.py"] +) + +py_test( + name = "env/wrappers/tests/test_group_agents_wrapper", + tags = ["env"], + size = "small", + srcs = ["env/wrappers/tests/test_group_agents_wrapper.py"] +) + # -------------------------------------------------------------------- # Models and Distributions # rllib/models/ diff --git a/rllib/agents/mbmpo/mbmpo.py b/rllib/agents/mbmpo/mbmpo.py index 9cbf94c67..a3731ea9d 100644 --- a/rllib/agents/mbmpo/mbmpo.py +++ b/rllib/agents/mbmpo/mbmpo.py @@ -21,7 +21,7 @@ from ray.rllib.agents.mbmpo.utils import calculate_gae_advantages, \ MBMPOExploration from ray.rllib.agents.trainer_template import build_trainer from ray.rllib.env.env_context import EnvContext -from ray.rllib.env.model_vector_env import model_vector_env +from ray.rllib.env.wrappers.model_vector_env import model_vector_env from ray.rllib.evaluation.metrics import collect_episodes, collect_metrics, \ get_learner_stats from ray.rllib.evaluation.worker_set import WorkerSet diff --git a/rllib/agents/qmix/qmix_policy.py b/rllib/agents/qmix/qmix_policy.py index 539efa2c5..20966aaec 100644 --- a/rllib/agents/qmix/qmix_policy.py +++ b/rllib/agents/qmix/qmix_policy.py @@ -7,6 +7,7 @@ import ray from ray.rllib.agents.qmix.mixers import VDNMixer, QMixer from ray.rllib.agents.qmix.model import RNNModel, _get_size from ray.rllib.env.multi_agent_env import ENV_STATE +from ray.rllib.env.wrappers.group_agents_wrapper import GROUP_REWARDS from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY from ray.rllib.models.torch.torch_action_dist import TorchCategorical from ray.rllib.policy.policy import Policy @@ -14,7 +15,6 @@ from ray.rllib.policy.rnn_sequencing import chop_into_sequences from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.models.catalog import ModelCatalog from ray.rllib.models.modelv2 import _unpack_obs -from ray.rllib.env.constants import GROUP_REWARDS from ray.rllib.utils.framework import try_import_torch from ray.rllib.utils.annotations import override diff --git a/rllib/env/__init__.py b/rllib/env/__init__.py index 48936c40f..61e889e80 100644 --- a/rllib/env/__init__.py +++ b/rllib/env/__init__.py @@ -1,27 +1,34 @@ from ray.rllib.env.base_env import BaseEnv -from ray.rllib.env.dm_env_wrapper import DMEnv -from ray.rllib.env.dm_control_wrapper import DMCEnv -from ray.rllib.env.unity3d_env import Unity3DEnv -from ray.rllib.env.pettingzoo_env import PettingZooEnv -from ray.rllib.env.multi_agent_env import MultiAgentEnv +from ray.rllib.env.env_context import EnvContext from ray.rllib.env.external_env import ExternalEnv from ray.rllib.env.external_multi_agent_env import ExternalMultiAgentEnv -from ray.rllib.env.vector_env import VectorEnv -from ray.rllib.env.env_context import EnvContext +from ray.rllib.env.multi_agent_env import MultiAgentEnv from ray.rllib.env.policy_client import PolicyClient from ray.rllib.env.policy_server_input import PolicyServerInput +from ray.rllib.env.remote_vector_env import RemoteVectorEnv +from ray.rllib.env.vector_env import VectorEnv + +from ray.rllib.env.wrappers.dm_env_wrapper import DMEnv +from ray.rllib.env.wrappers.dm_control_wrapper import DMCEnv +from ray.rllib.env.wrappers.group_agents_wrapper import GroupAgentsWrapper +from ray.rllib.env.wrappers.kaggle_wrapper import KaggleFootballMultiAgentEnv +from ray.rllib.env.wrappers.pettingzoo_env import PettingZooEnv +from ray.rllib.env.wrappers.unity3d_env import Unity3DEnv __all__ = [ "BaseEnv", - "MultiAgentEnv", - "ExternalEnv", - "ExternalMultiAgentEnv", - "VectorEnv", - "EnvContext", "DMEnv", "DMCEnv", - "Unity3DEnv", + "EnvContext", + "ExternalEnv", + "ExternalMultiAgentEnv", + "GroupAgentsWrapper", + "KaggleFootballMultiAgentEnv", + "MultiAgentEnv", "PettingZooEnv", "PolicyClient", "PolicyServerInput", + "RemoteVectorEnv", + "Unity3DEnv", + "VectorEnv", ] diff --git a/rllib/env/atari_wrappers.py b/rllib/env/atari_wrappers.py index 1c2b14c77..31877d82d 100644 --- a/rllib/env/atari_wrappers.py +++ b/rllib/env/atari_wrappers.py @@ -1,319 +1,25 @@ -import numpy as np -from collections import deque -import gym -from gym import spaces -import cv2 -cv2.ocl.setUseOpenCL(False) +from ray.rllib.env.wrappers.atari_wrappers import is_atari, \ + get_wrapper_by_cls, MonitorEnv, NoopResetEnv, ClipRewardEnv, \ + FireResetEnv, EpisodicLifeEnv, MaxAndSkipEnv, WarpFrame, FrameStack, \ + FrameStackTrajectoryView, ScaledFloatFrame, wrap_deepmind +from ray.rllib.utils.deprecation import deprecation_warning +deprecation_warning( + old="ray.rllib.env.atari_wrappers....", + new="ray.rllib.env.wrappers.atari_wrappers....", + error=False, +) -def is_atari(env): - if (hasattr(env.observation_space, "shape") - and env.observation_space.shape is not None - and len(env.observation_space.shape) <= 2): - return False - return hasattr(env, "unwrapped") and hasattr(env.unwrapped, "ale") - - -def get_wrapper_by_cls(env, cls): - """Returns the gym env wrapper of the given class, or None.""" - currentenv = env - while True: - if isinstance(currentenv, cls): - return currentenv - elif isinstance(currentenv, gym.Wrapper): - currentenv = currentenv.env - else: - return None - - -class MonitorEnv(gym.Wrapper): - def __init__(self, env=None): - """Record episodes stats prior to EpisodicLifeEnv, etc.""" - gym.Wrapper.__init__(self, env) - self._current_reward = None - self._num_steps = None - self._total_steps = None - self._episode_rewards = [] - self._episode_lengths = [] - self._num_episodes = 0 - self._num_returned = 0 - - def reset(self, **kwargs): - obs = self.env.reset(**kwargs) - - if self._total_steps is None: - self._total_steps = sum(self._episode_lengths) - - if self._current_reward is not None: - self._episode_rewards.append(self._current_reward) - self._episode_lengths.append(self._num_steps) - self._num_episodes += 1 - - self._current_reward = 0 - self._num_steps = 0 - - return obs - - def step(self, action): - obs, rew, done, info = self.env.step(action) - self._current_reward += rew - self._num_steps += 1 - self._total_steps += 1 - return (obs, rew, done, info) - - def get_episode_rewards(self): - return self._episode_rewards - - def get_episode_lengths(self): - return self._episode_lengths - - def get_total_steps(self): - return self._total_steps - - def next_episode_results(self): - for i in range(self._num_returned, len(self._episode_rewards)): - yield (self._episode_rewards[i], self._episode_lengths[i]) - self._num_returned = len(self._episode_rewards) - - -class NoopResetEnv(gym.Wrapper): - def __init__(self, env, noop_max=30): - """Sample initial states by taking random number of no-ops on reset. - No-op is assumed to be action 0. - """ - gym.Wrapper.__init__(self, env) - self.noop_max = noop_max - self.override_num_noops = None - self.noop_action = 0 - assert env.unwrapped.get_action_meanings()[0] == "NOOP" - - def reset(self, **kwargs): - """ Do no-op action for a number of steps in [1, noop_max].""" - self.env.reset(**kwargs) - if self.override_num_noops is not None: - noops = self.override_num_noops - else: - noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) - assert noops > 0 - obs = None - for _ in range(noops): - obs, _, done, _ = self.env.step(self.noop_action) - if done: - obs = self.env.reset(**kwargs) - return obs - - def step(self, ac): - return self.env.step(ac) - - -class ClipRewardEnv(gym.RewardWrapper): - def __init__(self, env): - gym.RewardWrapper.__init__(self, env) - - def reward(self, reward): - """Bin reward to {+1, 0, -1} by its sign.""" - return np.sign(reward) - - -class FireResetEnv(gym.Wrapper): - def __init__(self, env): - """Take action on reset. - - For environments that are fixed until firing.""" - gym.Wrapper.__init__(self, env) - assert env.unwrapped.get_action_meanings()[1] == "FIRE" - assert len(env.unwrapped.get_action_meanings()) >= 3 - - def reset(self, **kwargs): - self.env.reset(**kwargs) - obs, _, done, _ = self.env.step(1) - if done: - self.env.reset(**kwargs) - obs, _, done, _ = self.env.step(2) - if done: - self.env.reset(**kwargs) - return obs - - def step(self, ac): - return self.env.step(ac) - - -class EpisodicLifeEnv(gym.Wrapper): - def __init__(self, env): - """Make end-of-life == end-of-episode, but only reset on true game over. - Done by DeepMind for the DQN and co. since it helps value estimation. - """ - gym.Wrapper.__init__(self, env) - self.lives = 0 - self.was_real_done = True - - def step(self, action): - obs, reward, done, info = self.env.step(action) - self.was_real_done = done - # check current lives, make loss of life terminal, - # then update lives to handle bonus lives - lives = self.env.unwrapped.ale.lives() - if lives < self.lives and lives > 0: - # for Qbert sometimes we stay in lives == 0 condtion for a few fr - # so its important to keep lives > 0, so that we only reset once - # the environment advertises done. - done = True - self.lives = lives - return obs, reward, done, info - - def reset(self, **kwargs): - """Reset only when lives are exhausted. - This way all states are still reachable even though lives are episodic, - and the learner need not know about any of this behind-the-scenes. - """ - if self.was_real_done: - obs = self.env.reset(**kwargs) - else: - # no-op step to advance from terminal/lost life state - obs, _, _, _ = self.env.step(0) - self.lives = self.env.unwrapped.ale.lives() - return obs - - -class MaxAndSkipEnv(gym.Wrapper): - def __init__(self, env, skip=4): - """Return only every `skip`-th frame""" - gym.Wrapper.__init__(self, env) - # most recent raw observations (for max pooling across time steps) - self._obs_buffer = np.zeros( - (2, ) + env.observation_space.shape, dtype=np.uint8) - self._skip = skip - - def step(self, action): - """Repeat action, sum reward, and max over last observations.""" - total_reward = 0.0 - done = None - for i in range(self._skip): - obs, reward, done, info = self.env.step(action) - if i == self._skip - 2: - self._obs_buffer[0] = obs - if i == self._skip - 1: - self._obs_buffer[1] = obs - total_reward += reward - if done: - break - # Note that the observation on the done=True frame - # doesn't matter - max_frame = self._obs_buffer.max(axis=0) - - return max_frame, total_reward, done, info - - def reset(self, **kwargs): - return self.env.reset(**kwargs) - - -class WarpFrame(gym.ObservationWrapper): - def __init__(self, env, dim): - """Warp frames to the specified size (dim x dim).""" - gym.ObservationWrapper.__init__(self, env) - self.width = dim - self.height = dim - self.observation_space = spaces.Box( - low=0, - high=255, - shape=(self.height, self.width, 1), - dtype=np.uint8) - - def observation(self, frame): - frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) - frame = cv2.resize( - frame, (self.width, self.height), interpolation=cv2.INTER_AREA) - return frame[:, :, None] - - -# TODO: (sven) Deprecated class. Remove once traj. view is the norm. -class FrameStack(gym.Wrapper): - def __init__(self, env, k): - """Stack k last frames.""" - gym.Wrapper.__init__(self, env) - self.k = k - self.frames = deque([], maxlen=k) - shp = env.observation_space.shape - self.observation_space = spaces.Box( - low=0, - high=255, - shape=(shp[0], shp[1], shp[2] * k), - dtype=env.observation_space.dtype) - - def reset(self): - ob = self.env.reset() - for _ in range(self.k): - self.frames.append(ob) - return self._get_ob() - - def step(self, action): - ob, reward, done, info = self.env.step(action) - self.frames.append(ob) - return self._get_ob(), reward, done, info - - def _get_ob(self): - assert len(self.frames) == self.k - return np.concatenate(self.frames, axis=2) - - -class FrameStackTrajectoryView(gym.ObservationWrapper): - def __init__(self, env): - """No stacking. Trajectory View API takes care of this.""" - gym.Wrapper.__init__(self, env) - shp = env.observation_space.shape - assert shp[2] == 1 - self.observation_space = spaces.Box( - low=0, - high=255, - shape=(shp[0], shp[1]), - dtype=env.observation_space.dtype) - - def observation(self, observation): - return np.squeeze(observation, axis=-1) - - -class ScaledFloatFrame(gym.ObservationWrapper): - def __init__(self, env): - gym.ObservationWrapper.__init__(self, env) - self.observation_space = gym.spaces.Box( - low=0, high=1, shape=env.observation_space.shape, dtype=np.float32) - - def observation(self, observation): - # careful! This undoes the memory optimization, use - # with smaller replay buffers only. - return np.array(observation).astype(np.float32) / 255.0 - - -def wrap_deepmind( - env, - dim=84, - # TODO: (sven) Remove once traj. view is norm. - framestack=True, - framestack_via_traj_view_api=False): - """Configure environment for DeepMind-style Atari. - - Note that we assume reward clipping is done outside the wrapper. - - Args: - dim (int): Dimension to resize observations to (dim x dim). - framestack (bool): Whether to framestack observations. - """ - env = MonitorEnv(env) - env = NoopResetEnv(env, noop_max=30) - if env.spec is not None and "NoFrameskip" in env.spec.id: - env = MaxAndSkipEnv(env, skip=4) - env = EpisodicLifeEnv(env) - if "FIRE" in env.unwrapped.get_action_meanings(): - env = FireResetEnv(env) - env = WarpFrame(env, dim) - # env = ScaledFloatFrame(env) # TODO: use for dqn? - # env = ClipRewardEnv(env) # reward clipping is handled by policy eval - # New way of frame stacking via the trajectory view API (model config key: - # `num_framestacks=[int]`. - if framestack_via_traj_view_api: - env = FrameStackTrajectoryView(env) - # Old way (w/o traj. view API) via model config key: `framestack=True`. - # TODO: (sven) Remove once traj. view is norm. - elif framestack is True: - env = FrameStack(env, 4) - return env +is_atari = is_atari +get_wrapper_by_cls = get_wrapper_by_cls +MonitorEnv = MonitorEnv +NoopResetEnv = NoopResetEnv +ClipRewardEnv = ClipRewardEnv +FireResetEnv = FireResetEnv +EpisodicLifeEnv = EpisodicLifeEnv +MaxAndSkipEnv = MaxAndSkipEnv +WarpFrame = WarpFrame +FrameStack = FrameStack +FrameStackTrajectoryView = FrameStackTrajectoryView +ScaledFloatFrame = ScaledFloatFrame +wrap_deepmind = wrap_deepmind diff --git a/rllib/env/constants.py b/rllib/env/constants.py index 45c9b7119..953e92509 100644 --- a/rllib/env/constants.py +++ b/rllib/env/constants.py @@ -1,15 +1,12 @@ -# info key for the individual rewards of an agent, for example: -# info: { -# group_1: { -# _group_rewards: [5, -1, 1], # 3 agents in this group -# } -# } -GROUP_REWARDS = "_group_rewards" +from ray.rllib.env.wrappers.group_agents_wrapper import GROUP_REWARDS as GR, \ + GROUP_INFO as GI +from ray.rllib.utils.deprecation import deprecation_warning -# info key for the individual infos of an agent, for example: -# info: { -# group_1: { -# _group_infos: [{"foo": ...}, {}], # 2 agents in this group -# } -# } -GROUP_INFO = "_group_info" +deprecation_warning( + old="ray.rllib.env.constants.GROUP_[REWARDS|INFO]", + new="ray.rllib.env.wrappers.group_agents_wrapper.GROUP_[REWARDS|INFO]", + error=False, +) + +GROUP_REWARDS = GR +GROUP_INFO = GI diff --git a/rllib/env/dm_control_wrapper.py b/rllib/env/dm_control_wrapper.py index 6734e2a3a..d9fa233e2 100644 --- a/rllib/env/dm_control_wrapper.py +++ b/rllib/env/dm_control_wrapper.py @@ -1,203 +1,10 @@ -""" -DeepMind Control Suite Wrapper directly sourced from: -https://github.com/denisyarats/dmc2gym +from ray.rllib.env.wrappers.dm_control_wrapper import DMCEnv as DCE +from ray.rllib.utils.deprecation import deprecation_warning -MIT License +deprecation_warning( + old="ray.rllib.env.dm_control_wrapper.DMCEnv", + new="ray.rllib.env.wrappers.dm_control_wrapper.DMCEnv", + error=False, +) -Copyright (c) 2020 Denis Yarats - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -""" -from gym import core, spaces -try: - from dm_env import specs -except ImportError: - specs = None -try: - from dm_control import suite -except ImportError: - suite = None -import numpy as np - - -def _spec_to_box(spec): - def extract_min_max(s): - assert s.dtype == np.float64 or s.dtype == np.float32 - dim = np.int(np.prod(s.shape)) - if type(s) == specs.Array: - bound = np.inf * np.ones(dim, dtype=np.float32) - return -bound, bound - elif type(s) == specs.BoundedArray: - zeros = np.zeros(dim, dtype=np.float32) - return s.minimum + zeros, s.maximum + zeros - - mins, maxs = [], [] - for s in spec: - mn, mx = extract_min_max(s) - mins.append(mn) - maxs.append(mx) - low = np.concatenate(mins, axis=0) - high = np.concatenate(maxs, axis=0) - assert low.shape == high.shape - return spaces.Box(low, high, dtype=np.float32) - - -def _flatten_obs(obs): - obs_pieces = [] - for v in obs.values(): - flat = np.array([v]) if np.isscalar(v) else v.ravel() - obs_pieces.append(flat) - return np.concatenate(obs_pieces, axis=0) - - -class DMCEnv(core.Env): - def __init__(self, - domain_name, - task_name, - task_kwargs=None, - visualize_reward=False, - from_pixels=False, - height=64, - width=64, - camera_id=0, - frame_skip=2, - environment_kwargs=None, - channels_first=True, - preprocess=True): - self._from_pixels = from_pixels - self._height = height - self._width = width - self._camera_id = camera_id - self._frame_skip = frame_skip - self._channels_first = channels_first - self.preprocess = preprocess - - if specs is None: - raise RuntimeError(( - "The `specs` module from `dm_env` was not imported. Make sure " - "`dm_env` is installed and visible in the current python " - "environment.")) - if suite is None: - raise RuntimeError( - ("The `suite` module from `dm_control` was not imported. Make " - "sure `dm_control` is installed and visible in the current " - "python enviornment.")) - - # create task - self._env = suite.load( - domain_name=domain_name, - task_name=task_name, - task_kwargs=task_kwargs, - visualize_reward=visualize_reward, - environment_kwargs=environment_kwargs) - - # true and normalized action spaces - self._true_action_space = _spec_to_box([self._env.action_spec()]) - self._norm_action_space = spaces.Box( - low=-1.0, - high=1.0, - shape=self._true_action_space.shape, - dtype=np.float32) - - # create observation space - if from_pixels: - shape = [3, height, - width] if channels_first else [height, width, 3] - self._observation_space = spaces.Box( - low=0, high=255, shape=shape, dtype=np.uint8) - if preprocess: - self._observation_space = spaces.Box( - low=-0.5, high=0.5, shape=shape, dtype=np.float32) - else: - self._observation_space = _spec_to_box( - self._env.observation_spec().values()) - - self._state_space = _spec_to_box(self._env.observation_spec().values()) - - self.current_state = None - - def __getattr__(self, name): - return getattr(self._env, name) - - def _get_obs(self, time_step): - if self._from_pixels: - obs = self.render( - height=self._height, - width=self._width, - camera_id=self._camera_id) - if self._channels_first: - obs = obs.transpose(2, 0, 1).copy() - if self.preprocess: - obs = obs / 255.0 - 0.5 - else: - obs = _flatten_obs(time_step.observation) - return obs - - def _convert_action(self, action): - action = action.astype(np.float64) - true_delta = self._true_action_space.high - self._true_action_space.low - norm_delta = self._norm_action_space.high - self._norm_action_space.low - action = (action - self._norm_action_space.low) / norm_delta - action = action * true_delta + self._true_action_space.low - action = action.astype(np.float32) - return action - - @property - def observation_space(self): - return self._observation_space - - @property - def state_space(self): - return self._state_space - - @property - def action_space(self): - return self._norm_action_space - - def step(self, action): - assert self._norm_action_space.contains(action) - action = self._convert_action(action) - assert self._true_action_space.contains(action) - reward = 0 - extra = {"internal_state": self._env.physics.get_state().copy()} - - for _ in range(self._frame_skip): - time_step = self._env.step(action) - reward += time_step.reward or 0 - done = time_step.last() - if done: - break - obs = self._get_obs(time_step) - self.current_state = _flatten_obs(time_step.observation) - extra["discount"] = time_step.discount - return obs, reward, done, extra - - def reset(self): - time_step = self._env.reset() - self.current_state = _flatten_obs(time_step.observation) - obs = self._get_obs(time_step) - return obs - - def render(self, mode="rgb_array", height=None, width=None, camera_id=0): - assert mode == "rgb_array", "only support for rgb_array mode" - height = height or self._height - width = width or self._width - camera_id = camera_id or self._camera_id - return self._env.physics.render( - height=height, width=width, camera_id=camera_id) +DMCEnv = DCE diff --git a/rllib/env/dm_env_wrapper.py b/rllib/env/dm_env_wrapper.py index e402c4cc6..354de1d53 100644 --- a/rllib/env/dm_env_wrapper.py +++ b/rllib/env/dm_env_wrapper.py @@ -1,94 +1,10 @@ -import gym -from gym import spaces +from ray.rllib.env.wrappers.dm_env_wrapper import DMEnv as DE +from ray.rllib.utils.deprecation import deprecation_warning -import numpy as np +deprecation_warning( + old="ray.rllib.env.dm_env_wrapper.DMEnv", + new="ray.rllib.env.wrappers.dm_env_wrapper.DMEnv", + error=False, +) -try: - from dm_env import specs -except ImportError: - specs = None - - -def _convert_spec_to_space(spec): - if isinstance(spec, dict): - return spaces.Dict( - {k: _convert_spec_to_space(v) - for k, v in spec.items()}) - if isinstance(spec, specs.DiscreteArray): - return spaces.Discrete(spec.num_values) - elif isinstance(spec, specs.BoundedArray): - return spaces.Box( - low=np.asscalar(spec.minimum), - high=np.asscalar(spec.maximum), - shape=spec.shape, - dtype=spec.dtype) - elif isinstance(spec, specs.Array): - return spaces.Box( - low=-float("inf"), - high=float("inf"), - shape=spec.shape, - dtype=spec.dtype) - - raise NotImplementedError( - ("Could not convert `Array` spec of type {} to Gym space. " - "Attempted to convert: {}").format(type(spec), spec)) - - -class DMEnv(gym.Env): - """A `gym.Env` wrapper for the `dm_env` API. - """ - - metadata = {"render.modes": ["rgb_array"]} - - def __init__(self, dm_env): - super(DMEnv, self).__init__() - self._env = dm_env - self._prev_obs = None - - if specs is None: - raise RuntimeError(( - "The `specs` module from `dm_env` was not imported. Make sure " - "`dm_env` is installed and visible in the current python " - "environment.")) - - def step(self, action): - ts = self._env.step(action) - - reward = ts.reward - if reward is None: - reward = 0. - - return ts.observation, reward, ts.last(), {"discount": ts.discount} - - def reset(self): - ts = self._env.reset() - return ts.observation - - def render(self, mode="rgb_array"): - if self._prev_obs is None: - raise ValueError( - "Environment not started. Make sure to reset before rendering." - ) - - if mode == "rgb_array": - return self._prev_obs - else: - raise NotImplementedError( - "Render mode '{}' is not supported.".format(mode)) - - @property - def action_space(self): - spec = self._env.action_spec() - return _convert_spec_to_space(spec) - - @property - def observation_space(self): - spec = self._env.observation_spec() - return _convert_spec_to_space(spec) - - @property - def reward_range(self): - spec = self._env.reward_spec() - if isinstance(spec, specs.BoundedArray): - return spec.minimum, spec.maximum - return -float("inf"), float("inf") +DMEnv = DE diff --git a/rllib/env/env_context.py b/rllib/env/env_context.py index c5f9dd62d..637b68317 100644 --- a/rllib/env/env_context.py +++ b/rllib/env/env_context.py @@ -1,3 +1,5 @@ +from typing import Optional + from ray.rllib.utils.annotations import PublicAPI from ray.rllib.utils.typing import EnvConfigDict @@ -24,9 +26,11 @@ class EnvContext(dict): env_config: EnvConfigDict, worker_index: int, vector_index: int = 0, - remote: bool = False): + remote: bool = False, + num_workers: Optional[int] = None): dict.__init__(self, env_config) self.worker_index = worker_index + self.num_workers = num_workers self.vector_index = vector_index self.remote = remote @@ -34,10 +38,12 @@ class EnvContext(dict): env_config: EnvConfigDict = None, worker_index: int = None, vector_index: int = None, - remote: bool = None): + remote: bool = None, + num_workers: Optional[int] = None): return EnvContext( env_config if env_config is not None else self, worker_index if worker_index is not None else self.worker_index, vector_index if vector_index is not None else self.vector_index, remote if remote is not None else self.remote, + num_workers if num_workers is not None else self.num_workers, ) diff --git a/rllib/env/group_agents_wrapper.py b/rllib/env/group_agents_wrapper.py index eab11b8ee..c53d7b938 100644 --- a/rllib/env/group_agents_wrapper.py +++ b/rllib/env/group_agents_wrapper.py @@ -1,103 +1,11 @@ -from collections import OrderedDict +from ray.rllib.env.wrappers.group_agents_wrapper import GroupAgentsWrapper as \ + GAW +from ray.rllib.utils.deprecation import deprecation_warning -from ray.rllib.env.constants import GROUP_REWARDS, GROUP_INFO -from ray.rllib.env.multi_agent_env import MultiAgentEnv +deprecation_warning( + old="ray.rllib.env.group_agents_wrapper._GroupAgentsWrapper", + new="ray.rllib.env.wrappers.group_agents_wrapper.GroupAgentsWrapper", + error=False, +) - -# TODO(ekl) we should add some unit tests for this -class _GroupAgentsWrapper(MultiAgentEnv): - """Wraps a MultiAgentEnv environment with agents grouped as specified. - - See multi_agent_env.py for the specification of groups. - - This API is experimental. - """ - - def __init__(self, env, groups, obs_space=None, act_space=None): - """Wrap an existing multi-agent env to group agents together. - - See MultiAgentEnv.with_agent_groups() for usage info. - - Args: - env (MultiAgentEnv): env to wrap - groups (dict): Grouping spec as documented in MultiAgentEnv - obs_space (Space): Optional observation space for the grouped - env. Must be a tuple space. - act_space (Space): Optional action space for the grouped env. - Must be a tuple space. - """ - - self.env = env - self.groups = groups - self.agent_id_to_group = {} - for group_id, agent_ids in groups.items(): - for agent_id in agent_ids: - if agent_id in self.agent_id_to_group: - raise ValueError( - "Agent id {} is in multiple groups".format( - agent_id, groups)) - self.agent_id_to_group[agent_id] = group_id - if obs_space is not None: - self.observation_space = obs_space - if act_space is not None: - self.action_space = act_space - - def reset(self): - obs = self.env.reset() - return self._group_items(obs) - - def step(self, action_dict): - # Ungroup and send actions - action_dict = self._ungroup_items(action_dict) - obs, rewards, dones, infos = self.env.step(action_dict) - - # Apply grouping transforms to the env outputs - obs = self._group_items(obs) - rewards = self._group_items( - rewards, agg_fn=lambda gvals: list(gvals.values())) - dones = self._group_items( - dones, agg_fn=lambda gvals: all(gvals.values())) - infos = self._group_items( - infos, agg_fn=lambda gvals: {GROUP_INFO: list(gvals.values())}) - - # Aggregate rewards, but preserve the original values in infos - for agent_id, rew in rewards.items(): - if isinstance(rew, list): - rewards[agent_id] = sum(rew) - if agent_id not in infos: - infos[agent_id] = {} - infos[agent_id][GROUP_REWARDS] = rew - - return obs, rewards, dones, infos - - def _ungroup_items(self, items): - out = {} - for agent_id, value in items.items(): - if agent_id in self.groups: - assert len(value) == len(self.groups[agent_id]), \ - (agent_id, value, self.groups) - for a, v in zip(self.groups[agent_id], value): - out[a] = v - else: - out[agent_id] = value - return out - - def _group_items(self, items, agg_fn=lambda gvals: list(gvals.values())): - grouped_items = {} - for agent_id, item in items.items(): - if agent_id in self.agent_id_to_group: - group_id = self.agent_id_to_group[agent_id] - if group_id in grouped_items: - continue # already added - group_out = OrderedDict() - for a in self.groups[group_id]: - if a in items: - group_out[a] = items[a] - else: - raise ValueError( - "Missing member of group {}: {}: {}".format( - group_id, a, items)) - grouped_items[group_id] = agg_fn(group_out) - else: - grouped_items[agent_id] = item - return grouped_items +_GroupAgentsWrapper = GAW diff --git a/rllib/env/model_vector_env.py b/rllib/env/model_vector_env.py index b5b9700d6..4a7378753 100644 --- a/rllib/env/model_vector_env.py +++ b/rllib/env/model_vector_env.py @@ -1,134 +1,10 @@ -import logging -import numpy as np -from gym.spaces import Discrete -from ray.rllib.utils.annotations import override -from ray.rllib.env.vector_env import VectorEnv -from ray.rllib.evaluation.rollout_worker import get_global_worker -from ray.rllib.env.base_env import BaseEnv -from ray.rllib.utils.typing import EnvType +from ray.rllib.env.wrappers.model_vector_env import model_vector_env as mve +from ray.rllib.utils.deprecation import deprecation_warning -logger = logging.getLogger(__name__) +deprecation_warning( + old="ray.rllib.env.model_vector_env.model_vector_env", + new="ray.rllib.env.wrappers.model_vector_env.model_vector_env", + error=False, +) - -def model_vector_env(env: EnvType) -> BaseEnv: - """Returns a VectorizedEnv wrapper around the given environment. - - To obtain worker configs, one can call get_global_worker(). - - Args: - env (EnvType): The input environment (of any supported environment - type) to be convert to a _VectorizedModelGymEnv (wrapped as - an RLlib BaseEnv). - - Returns: - BaseEnv: The BaseEnv converted input `env`. - """ - worker = get_global_worker() - worker_index = worker.worker_index - if worker_index: - env = _VectorizedModelGymEnv( - make_env=worker.make_env_fn, - existing_envs=[env], - num_envs=worker.num_envs, - observation_space=env.observation_space, - action_space=env.action_space, - ) - return BaseEnv.to_base_env( - env, - make_env=worker.make_env_fn, - num_envs=worker.num_envs, - remote_envs=False, - remote_env_batch_wait_ms=0) - - -class _VectorizedModelGymEnv(VectorEnv): - """Vectorized Environment Wrapper for MB-MPO. - - Primary change is in the `vector_step` method, which calls the dynamics - models for next_obs "calculation" (instead of the actual env). Also, the - actual envs need to have two extra methods implemented: `reward(obs)` and - (optionally) `done(obs)`. If `done` is not implemented, we will assume - that episodes in the env do not terminate, ever. - """ - - def __init__(self, - make_env=None, - existing_envs=None, - num_envs=1, - *, - observation_space=None, - action_space=None, - env_config=None): - self.make_env = make_env - self.envs = existing_envs - self.num_envs = num_envs - while len(self.envs) < num_envs: - self.envs.append(self.make_env(len(self.envs))) - - super().__init__( - observation_space=observation_space - or self.envs[0].observation_space, - action_space=action_space or self.envs[0].action_space, - num_envs=num_envs) - worker = get_global_worker() - self.model, self.device = worker.foreach_policy( - lambda x, y: (x.dynamics_model, x.device))[0] - - @override(VectorEnv) - def vector_reset(self): - """Override parent to store actual env obs for upcoming predictions. - """ - self.cur_obs = [e.reset() for e in self.envs] - return self.cur_obs - - @override(VectorEnv) - def reset_at(self, index): - """Override parent to store actual env obs for upcoming predictions. - """ - obs = self.envs[index].reset() - self.cur_obs[index] = obs - return obs - - @override(VectorEnv) - def vector_step(self, actions): - if self.cur_obs is None: - raise ValueError("Need to reset env first") - - # If discrete, need to one-hot actions - if isinstance(self.action_space, Discrete): - act = np.array(actions) - new_act = np.zeros((act.size, act.max() + 1)) - new_act[np.arange(act.size), act] = 1 - actions = new_act.astype("float32") - - # Batch the TD-model prediction. - obs_batch = np.stack(self.cur_obs, axis=0) - action_batch = np.stack(actions, axis=0) - # Predict the next observation, given previous a) real obs - # (after a reset), b) predicted obs (any other time). - next_obs_batch = self.model.predict_model_batches( - obs_batch, action_batch, device=self.device) - next_obs_batch = np.clip(next_obs_batch, -1000, 1000) - - # Call env's reward function. - # Note: Each actual env must implement one to output exact rewards. - rew_batch = self.envs[0].reward(obs_batch, action_batch, - next_obs_batch) - - # If env has a `done` method, use it. - if hasattr(self.envs[0], "done"): - dones_batch = self.envs[0].done(next_obs_batch) - # Otherwise, assume the episode does not end. - else: - dones_batch = np.asarray([False for _ in range(self.num_envs)]) - - info_batch = [{} for _ in range(self.num_envs)] - - self.cur_obs = next_obs_batch - - return list(next_obs_batch), list(rew_batch), list( - dones_batch), info_batch - - @override(VectorEnv) - def get_unwrapped(self): - return self.envs +model_vector_env = mve diff --git a/rllib/env/multi_agent_env.py b/rllib/env/multi_agent_env.py index 1000a6a0d..610aa2e5e 100644 --- a/rllib/env/multi_agent_env.py +++ b/rllib/env/multi_agent_env.py @@ -117,7 +117,69 @@ class MultiAgentEnv: ... }) """ - from ray.rllib.env.group_agents_wrapper import _GroupAgentsWrapper - return _GroupAgentsWrapper(self, groups, obs_space, act_space) + from ray.rllib.env.wrappers.group_agents_wrapper import \ + GroupAgentsWrapper + return GroupAgentsWrapper(self, groups, obs_space, act_space) # __grouping_doc_end__ # yapf: enable + + +def make_multi_agent(env_name_or_creator): + """Convenience wrapper for any sigle-agent env to be converted into MA. + + Agent IDs are int numbers starting from 0 (first agent). + + Args: + env_name_or_creator (Union[str, Callable[]]: String specifier or + env_maker function. + + Returns: + Type[MultiAgentEnv]: New MultiAgentEnv class to be used as env. + The constructor takes a config dict with `num_agents` key + (default=1). The reset of the config dict will be passed on to the + underlying single-agent env's constructor. + + Examples: + >>> # By gym string: + >>> ma_cartpole_cls = make_multi_agent("CartPole-v0") + >>> # Create a 2 agent multi-agent cartpole. + >>> ma_cartpole = ma_cartpole_cls({"num_agents": 2}) + >>> obs = ma_cartpole.reset() + >>> print(obs) + ... {0: [...], 1: [...]} + + >>> # By env-maker callable: + >>> ma_stateless_cartpole_cls = make_multi_agent( + ... lambda config: StatelessCartPole(config)) + >>> # Create a 2 agent multi-agent stateless cartpole. + >>> ma_stateless_cartpole = ma_stateless_cartpole_cls( + ... {"num_agents": 2}) + """ + + class MultiEnv(MultiAgentEnv): + def __init__(self, config): + num = config.pop("num_agents", 1) + if isinstance(env_name_or_creator, str): + self.agents = [ + gym.make(env_name_or_creator) for _ in range(num) + ] + else: + self.agents = [env_name_or_creator(config) for _ in range(num)] + self.dones = set() + self.observation_space = self.agents[0].observation_space + self.action_space = self.agents[0].action_space + + def reset(self): + self.dones = set() + return {i: a.reset() for i, a in enumerate(self.agents)} + + def step(self, action_dict): + obs, rew, done, info = {}, {}, {}, {} + for i, action in action_dict.items(): + obs[i], rew[i], done[i], info[i] = self.agents[i].step(action) + if done[i]: + self.dones.add(i) + done["__all__"] = len(self.dones) == len(self.agents) + return obs, rew, done, info + + return MultiEnv diff --git a/rllib/env/pettingzoo_env.py b/rllib/env/pettingzoo_env.py index 9c45b6224..64ef1668a 100644 --- a/rllib/env/pettingzoo_env.py +++ b/rllib/env/pettingzoo_env.py @@ -1,185 +1,10 @@ -from ray.rllib.env.multi_agent_env import MultiAgentEnv +from ray.rllib.env.wrappers.pettingzoo_env import PettingZooEnv as PE +from ray.rllib.utils.deprecation import deprecation_warning +deprecation_warning( + old="ray.rllib.env.pettingzoo_env.PettingZooEnv", + new="ray.rllib.env.wrappers.pettingzoo_env.PettingZooEnv", + error=False, +) -class PettingZooEnv(MultiAgentEnv): - """An interface to the PettingZoo MARL environment library. - - See: https://github.com/PettingZoo-Team/PettingZoo - - Inherits from MultiAgentEnv and exposes a given AEC - (actor-environment-cycle) game from the PettingZoo project via the - MultiAgentEnv public API. - - Note that the wrapper has some important limitations: - - 1. All agents have the same action_spaces and observation_spaces. - Note: If, within your aec game, agents do not have homogeneous action / - observation spaces, apply SuperSuit wrappers - to apply padding functionality: https://github.com/PettingZoo-Team/ - SuperSuit#built-in-multi-agent-only-functions - 2. Environments are positive sum games (-> Agents are expected to cooperate - to maximize reward). This isn't a hard restriction, it just that - standard algorithms aren't expected to work well in highly competitive - games. - - Examples: - >>> from pettingzoo.butterfly import prison_v2 - >>> env = PettingZooEnv(prison_v2.env()) - >>> obs = env.reset() - >>> print(obs) - # only returns the observation for the agent which should be stepping - { - 'prisoner_0': array([[[0, 0, 0], - [0, 0, 0], - [0, 0, 0], - ..., - [0, 0, 0], - [0, 0, 0], - [0, 0, 0]]], dtype=uint8) - } - >>> obs, rewards, dones, infos = env.step({ - ... "prisoner_0": 1 - ... }) - # only returns the observation, reward, info, etc, for - # the agent who's turn is next. - >>> print(obs) - { - 'prisoner_1': array([[[0, 0, 0], - [0, 0, 0], - [0, 0, 0], - ..., - [0, 0, 0], - [0, 0, 0], - [0, 0, 0]]], dtype=uint8) - } - >>> print(rewards) - { - 'prisoner_1': 0 - } - >>> print(dones) - { - 'prisoner_1': False, '__all__': False - } - >>> print(infos) - { - 'prisoner_1': {'map_tuple': (1, 0)} - } - """ - - def __init__(self, env): - self.env = env - # agent idx list - self.agents = self.env.possible_agents - - # Get dictionaries of obs_spaces and act_spaces - self.observation_spaces = self.env.observation_spaces - self.action_spaces = self.env.action_spaces - - # Get first observation space, assuming all agents have equal space - self.observation_space = self.observation_spaces[self.agents[0]] - - # Get first action space, assuming all agents have equal space - self.action_space = self.action_spaces[self.agents[0]] - - assert all(obs_space == self.observation_space - for obs_space - in self.env.observation_spaces.values()), \ - "Observation spaces for all agents must be identical. Perhaps " \ - "SuperSuit's pad_observations wrapper can help (useage: " \ - "`supersuit.aec_wrappers.pad_observations(env)`" - - assert all(act_space == self.action_space - for act_space in self.env.action_spaces.values()), \ - "Action spaces for all agents must be identical. Perhaps " \ - "SuperSuit's pad_action_space wrapper can help (useage: " \ - "`supersuit.aec_wrappers.pad_action_space(env)`" - - self.reset() - - def reset(self): - self.env.reset() - return { - self.env.agent_selection: self.env.observe( - self.env.agent_selection) - } - - def step(self, action): - self.env.step(action[self.env.agent_selection]) - obs_d = {} - rew_d = {} - done_d = {} - info_d = {} - while self.env.agents: - obs, rew, done, info = self.env.last() - a = self.env.agent_selection - obs_d[a] = obs - rew_d[a] = rew - done_d[a] = done - info_d[a] = info - if self.env.dones[self.env.agent_selection]: - self.env.step(None) - else: - break - - all_done = not self.env.agents - done_d["__all__"] = all_done - - return obs_d, rew_d, done_d, info_d - - def close(self): - self.env.close() - - def seed(self, seed=None): - self.env.seed(seed) - - def render(self, mode="human"): - return self.env.render(mode) - - -class ParallelPettingZooEnv(MultiAgentEnv): - def __init__(self, env): - self.par_env = env - # agent idx list - self.agents = self.par_env.possible_agents - - # Get dictionaries of obs_spaces and act_spaces - self.observation_spaces = self.par_env.observation_spaces - self.action_spaces = self.par_env.action_spaces - - # Get first observation space, assuming all agents have equal space - self.observation_space = self.observation_spaces[self.agents[0]] - - # Get first action space, assuming all agents have equal space - self.action_space = self.action_spaces[self.agents[0]] - - assert all(obs_space == self.observation_space - for obs_space - in self.par_env.observation_spaces.values()), \ - "Observation spaces for all agents must be identical. Perhaps " \ - "SuperSuit's pad_observations wrapper can help (useage: " \ - "`supersuit.aec_wrappers.pad_observations(env)`" - - assert all(act_space == self.action_space - for act_space in self.par_env.action_spaces.values()), \ - "Action spaces for all agents must be identical. Perhaps " \ - "SuperSuit's pad_action_space wrapper can help (useage: " \ - "`supersuit.aec_wrappers.pad_action_space(env)`" - - self.reset() - - def reset(self): - return self.par_env.reset() - - def step(self, action_dict): - obss, rews, dones, infos = self.par_env.step(action_dict) - dones["__all__"] = all(dones.values()) - return obss, rews, dones, infos - - def close(self): - self.par_env.close() - - def seed(self, seed=None): - self.par_env.seed(seed) - - def render(self, mode="human"): - return self.par_env.render(mode) +PettingZooEnv = PE diff --git a/rllib/env/unity3d_env.py b/rllib/env/unity3d_env.py index 753c23443..3326e6c78 100644 --- a/rllib/env/unity3d_env.py +++ b/rllib/env/unity3d_env.py @@ -1,275 +1,10 @@ -from gym.spaces import Box, MultiDiscrete, Tuple as TupleSpace -import logging -import numpy as np -import random -import time -from typing import Callable, Optional, Tuple +from ray.rllib.env.wrappers.unity3d_env import Unity3DEnv as UE +from ray.rllib.utils.deprecation import deprecation_warning -from ray.rllib.env.multi_agent_env import MultiAgentEnv -from ray.rllib.utils.annotations import override -from ray.rllib.utils.typing import MultiAgentDict, PolicyID, AgentID +deprecation_warning( + old="ray.rllib.env.unity3d_env.Unity3DEnv", + new="ray.rllib.env.wrappers.unity3d_env.Unity3DEnv", + error=False, +) -logger = logging.getLogger(__name__) - - -class Unity3DEnv(MultiAgentEnv): - """A MultiAgentEnv representing a single Unity3D game instance. - - For an example on how to use this Env with a running Unity3D editor - or with a compiled game, see: - `rllib/examples/unity3d_env_local.py` - For an example on how to use it inside a Unity game client, which - connects to an RLlib Policy server, see: - `rllib/examples/serving/unity3d_[client|server].py` - - Supports all Unity3D (MLAgents) examples, multi- or single-agent and - gets converted automatically into an ExternalMultiAgentEnv, when used - inside an RLlib PolicyClient for cloud/distributed training of Unity games. - """ - - _BASE_PORT = 5004 - - def __init__(self, - file_name: str = None, - port: Optional[int] = None, - seed: int = 0, - no_graphics: bool = False, - timeout_wait: int = 300, - episode_horizon: int = 1000): - """Initializes a Unity3DEnv object. - - Args: - file_name (Optional[str]): Name of the Unity game binary. - If None, will assume a locally running Unity3D editor - to be used, instead. - port (Optional[int]): Port number to connect to Unity environment. - seed (int): A random seed value to use for the Unity3D game. - no_graphics (bool): Whether to run the Unity3D simulator in - no-graphics mode. Default: False. - timeout_wait (int): Time (in seconds) to wait for connection from - the Unity3D instance. - episode_horizon (int): A hard horizon to abide to. After at most - this many steps (per-agent episode `step()` calls), the - Unity3D game is reset and will start again (finishing the - multi-agent episode that the game represents). - Note: The game itself may contain its own episode length - limits, which are always obeyed (on top of this value here). - """ - - super().__init__() - - if file_name is None: - print( - "No game binary provided, will use a running Unity editor " - "instead.\nMake sure you are pressing the Play (|>) button in " - "your editor to start.") - - import mlagents_envs - from mlagents_envs.environment import UnityEnvironment - - # Try connecting to the Unity3D game instance. If a port is blocked - while True: - # Sleep for random time to allow for concurrent startup of many - # environments (num_workers >> 1). Otherwise, would lead to port - # conflicts sometimes. - time.sleep(random.randint(1, 10)) - port_ = port or self._BASE_PORT - self._BASE_PORT += 1 - try: - self.unity_env = UnityEnvironment( - file_name=file_name, - worker_id=0, - base_port=port_, - seed=seed, - no_graphics=no_graphics, - timeout_wait=timeout_wait, - ) - print("Created UnityEnvironment for port {}".format(port_)) - except mlagents_envs.exception.UnityWorkerInUseException: - pass - else: - break - - # Reset entire env every this number of step calls. - self.episode_horizon = episode_horizon - # Keep track of how many times we have called `step` so far. - self.episode_timesteps = 0 - - @override(MultiAgentEnv) - def step( - self, action_dict: MultiAgentDict - ) -> Tuple[MultiAgentDict, MultiAgentDict, MultiAgentDict, MultiAgentDict]: - """Performs one multi-agent step through the game. - - Args: - action_dict (dict): Multi-agent action dict with: - keys=agent identifier consisting of - [MLagents behavior name, e.g. "Goalie?team=1"] + "_" + - [Agent index, a unique MLAgent-assigned index per single agent] - - Returns: - tuple: - - obs: Multi-agent observation dict. - Only those observations for which to get new actions are - returned. - - rewards: Rewards dict matching `obs`. - - dones: Done dict with only an __all__ multi-agent entry in - it. __all__=True, if episode is done for all agents. - - infos: An (empty) info dict. - """ - - # Set only the required actions (from the DecisionSteps) in Unity3D. - all_agents = [] - for behavior_name in self.unity_env.behavior_specs: - for agent_id in self.unity_env.get_steps(behavior_name)[ - 0].agent_id_to_index.keys(): - key = behavior_name + "_{}".format(agent_id) - all_agents.append(key) - self.unity_env.set_action_for_agent(behavior_name, agent_id, - action_dict[key]) - # Do the step. - self.unity_env.step() - - obs, rewards, dones, infos = self._get_step_results() - - # Global horizon reached? -> Return __all__ done=True, so user - # can reset. Set all agents' individual `done` to True as well. - self.episode_timesteps += 1 - if self.episode_timesteps > self.episode_horizon: - return obs, rewards, dict({ - "__all__": True - }, **{agent_id: True - for agent_id in all_agents}), infos - - return obs, rewards, dones, infos - - @override(MultiAgentEnv) - def reset(self) -> MultiAgentDict: - """Resets the entire Unity3D scene (a single multi-agent episode).""" - self.episode_timesteps = 0 - self.unity_env.reset() - obs, _, _, _ = self._get_step_results() - return obs - - def _get_step_results(self): - """Collects those agents' obs/rewards that have to act in next `step`. - - Returns: - Tuple: - obs: Multi-agent observation dict. - Only those observations for which to get new actions are - returned. - rewards: Rewards dict matching `obs`. - dones: Done dict with only an __all__ multi-agent entry in it. - __all__=True, if episode is done for all agents. - infos: An (empty) info dict. - """ - obs = {} - rewards = {} - infos = {} - for behavior_name in self.unity_env.behavior_specs: - decision_steps, terminal_steps = self.unity_env.get_steps( - behavior_name) - # Important: Only update those sub-envs that are currently - # available within _env_state. - # Loop through all envs ("agents") and fill in, whatever - # information we have. - for agent_id, idx in decision_steps.agent_id_to_index.items(): - key = behavior_name + "_{}".format(agent_id) - os = tuple(o[idx] for o in decision_steps.obs) - os = os[0] if len(os) == 1 else os - obs[key] = os - rewards[key] = decision_steps.reward[idx] # rewards vector - for agent_id, idx in terminal_steps.agent_id_to_index.items(): - key = behavior_name + "_{}".format(agent_id) - # Only overwrite rewards (last reward in episode), b/c obs - # here is the last obs (which doesn't matter anyways). - # Unless key does not exist in obs. - if key not in obs: - os = tuple(o[idx] for o in terminal_steps.obs) - obs[key] = os = os[0] if len(os) == 1 else os - rewards[key] = terminal_steps.reward[idx] # rewards vector - - # Only use dones if all agents are done, then we should do a reset. - return obs, rewards, {"__all__": False}, infos - - @staticmethod - def get_policy_configs_for_game( - game_name: str) -> Tuple[dict, Callable[[AgentID], PolicyID]]: - - # The RLlib server must know about the Spaces that the Client will be - # using inside Unity3D, up-front. - obs_spaces = { - # 3DBall. - "3DBall": Box(float("-inf"), float("inf"), (8, )), - # 3DBallHard. - "3DBallHard": Box(float("-inf"), float("inf"), (45, )), - # Pyramids. - "Pyramids": TupleSpace([ - Box(float("-inf"), float("inf"), (56, )), - Box(float("-inf"), float("inf"), (56, )), - Box(float("-inf"), float("inf"), (56, )), - Box(float("-inf"), float("inf"), (4, )), - ]), - # SoccerStrikersVsGoalie. - "Goalie": Box(float("-inf"), float("inf"), (738, )), - "Striker": TupleSpace([ - Box(float("-inf"), float("inf"), (231, )), - Box(float("-inf"), float("inf"), (63, )), - ]), - # Tennis. - "Tennis": Box(float("-inf"), float("inf"), (27, )), - # VisualHallway. - "VisualHallway": Box(float("-inf"), float("inf"), (84, 84, 3)), - # Walker. - "Walker": Box(float("-inf"), float("inf"), (212, )), - # FoodCollector. - "FoodCollector": TupleSpace([ - Box(float("-inf"), float("inf"), (49, )), - Box(float("-inf"), float("inf"), (4, )), - ]), - } - action_spaces = { - # 3DBall. - "3DBall": Box( - float("-inf"), float("inf"), (2, ), dtype=np.float32), - # 3DBallHard. - "3DBallHard": Box( - float("-inf"), float("inf"), (2, ), dtype=np.float32), - # Pyramids. - "Pyramids": MultiDiscrete([5]), - # SoccerStrikersVsGoalie. - "Goalie": MultiDiscrete([3, 3, 3]), - "Striker": MultiDiscrete([3, 3, 3]), - # Tennis. - "Tennis": Box(float("-inf"), float("inf"), (3, )), - # VisualHallway. - "VisualHallway": MultiDiscrete([5]), - # Walker. - "Walker": Box(float("-inf"), float("inf"), (39, )), - # FoodCollector. - "FoodCollector": MultiDiscrete([3, 3, 3, 2]), - } - - # Policies (Unity: "behaviors") and agent-to-policy mapping fns. - if game_name == "SoccerStrikersVsGoalie": - policies = { - "Goalie": (None, obs_spaces["Goalie"], action_spaces["Goalie"], - {}), - "Striker": (None, obs_spaces["Striker"], - action_spaces["Striker"], {}), - } - - def policy_mapping_fn(agent_id): - return "Striker" if "Striker" in agent_id else "Goalie" - - else: - policies = { - game_name: (None, obs_spaces[game_name], - action_spaces[game_name], {}), - } - - def policy_mapping_fn(agent_id): - return game_name - - return policies, policy_mapping_fn +Unity3DEnv = UE diff --git a/rllib/env/wrappers/atari_wrappers.py b/rllib/env/wrappers/atari_wrappers.py new file mode 100644 index 000000000..0353484a0 --- /dev/null +++ b/rllib/env/wrappers/atari_wrappers.py @@ -0,0 +1,320 @@ +from collections import deque +import cv2 +import gym +from gym import spaces +import numpy as np + +cv2.ocl.setUseOpenCL(False) + + +def is_atari(env): + if (hasattr(env.observation_space, "shape") + and env.observation_space.shape is not None + and len(env.observation_space.shape) <= 2): + return False + return hasattr(env, "unwrapped") and hasattr(env.unwrapped, "ale") + + +def get_wrapper_by_cls(env, cls): + """Returns the gym env wrapper of the given class, or None.""" + currentenv = env + while True: + if isinstance(currentenv, cls): + return currentenv + elif isinstance(currentenv, gym.Wrapper): + currentenv = currentenv.env + else: + return None + + +class MonitorEnv(gym.Wrapper): + def __init__(self, env=None): + """Record episodes stats prior to EpisodicLifeEnv, etc.""" + gym.Wrapper.__init__(self, env) + self._current_reward = None + self._num_steps = None + self._total_steps = None + self._episode_rewards = [] + self._episode_lengths = [] + self._num_episodes = 0 + self._num_returned = 0 + + def reset(self, **kwargs): + obs = self.env.reset(**kwargs) + + if self._total_steps is None: + self._total_steps = sum(self._episode_lengths) + + if self._current_reward is not None: + self._episode_rewards.append(self._current_reward) + self._episode_lengths.append(self._num_steps) + self._num_episodes += 1 + + self._current_reward = 0 + self._num_steps = 0 + + return obs + + def step(self, action): + obs, rew, done, info = self.env.step(action) + self._current_reward += rew + self._num_steps += 1 + self._total_steps += 1 + return (obs, rew, done, info) + + def get_episode_rewards(self): + return self._episode_rewards + + def get_episode_lengths(self): + return self._episode_lengths + + def get_total_steps(self): + return self._total_steps + + def next_episode_results(self): + for i in range(self._num_returned, len(self._episode_rewards)): + yield (self._episode_rewards[i], self._episode_lengths[i]) + self._num_returned = len(self._episode_rewards) + + +class NoopResetEnv(gym.Wrapper): + def __init__(self, env, noop_max=30): + """Sample initial states by taking random number of no-ops on reset. + No-op is assumed to be action 0. + """ + gym.Wrapper.__init__(self, env) + self.noop_max = noop_max + self.override_num_noops = None + self.noop_action = 0 + assert env.unwrapped.get_action_meanings()[0] == "NOOP" + + def reset(self, **kwargs): + """ Do no-op action for a number of steps in [1, noop_max].""" + self.env.reset(**kwargs) + if self.override_num_noops is not None: + noops = self.override_num_noops + else: + noops = self.unwrapped.np_random.randint(1, self.noop_max + 1) + assert noops > 0 + obs = None + for _ in range(noops): + obs, _, done, _ = self.env.step(self.noop_action) + if done: + obs = self.env.reset(**kwargs) + return obs + + def step(self, ac): + return self.env.step(ac) + + +class ClipRewardEnv(gym.RewardWrapper): + def __init__(self, env): + gym.RewardWrapper.__init__(self, env) + + def reward(self, reward): + """Bin reward to {+1, 0, -1} by its sign.""" + return np.sign(reward) + + +class FireResetEnv(gym.Wrapper): + def __init__(self, env): + """Take action on reset. + + For environments that are fixed until firing.""" + gym.Wrapper.__init__(self, env) + assert env.unwrapped.get_action_meanings()[1] == "FIRE" + assert len(env.unwrapped.get_action_meanings()) >= 3 + + def reset(self, **kwargs): + self.env.reset(**kwargs) + obs, _, done, _ = self.env.step(1) + if done: + self.env.reset(**kwargs) + obs, _, done, _ = self.env.step(2) + if done: + self.env.reset(**kwargs) + return obs + + def step(self, ac): + return self.env.step(ac) + + +class EpisodicLifeEnv(gym.Wrapper): + def __init__(self, env): + """Make end-of-life == end-of-episode, but only reset on true game over. + Done by DeepMind for the DQN and co. since it helps value estimation. + """ + gym.Wrapper.__init__(self, env) + self.lives = 0 + self.was_real_done = True + + def step(self, action): + obs, reward, done, info = self.env.step(action) + self.was_real_done = done + # check current lives, make loss of life terminal, + # then update lives to handle bonus lives + lives = self.env.unwrapped.ale.lives() + if lives < self.lives and lives > 0: + # for Qbert sometimes we stay in lives == 0 condtion for a few fr + # so its important to keep lives > 0, so that we only reset once + # the environment advertises done. + done = True + self.lives = lives + return obs, reward, done, info + + def reset(self, **kwargs): + """Reset only when lives are exhausted. + This way all states are still reachable even though lives are episodic, + and the learner need not know about any of this behind-the-scenes. + """ + if self.was_real_done: + obs = self.env.reset(**kwargs) + else: + # no-op step to advance from terminal/lost life state + obs, _, _, _ = self.env.step(0) + self.lives = self.env.unwrapped.ale.lives() + return obs + + +class MaxAndSkipEnv(gym.Wrapper): + def __init__(self, env, skip=4): + """Return only every `skip`-th frame""" + gym.Wrapper.__init__(self, env) + # most recent raw observations (for max pooling across time steps) + self._obs_buffer = np.zeros( + (2, ) + env.observation_space.shape, dtype=np.uint8) + self._skip = skip + + def step(self, action): + """Repeat action, sum reward, and max over last observations.""" + total_reward = 0.0 + done = None + for i in range(self._skip): + obs, reward, done, info = self.env.step(action) + if i == self._skip - 2: + self._obs_buffer[0] = obs + if i == self._skip - 1: + self._obs_buffer[1] = obs + total_reward += reward + if done: + break + # Note that the observation on the done=True frame + # doesn't matter + max_frame = self._obs_buffer.max(axis=0) + + return max_frame, total_reward, done, info + + def reset(self, **kwargs): + return self.env.reset(**kwargs) + + +class WarpFrame(gym.ObservationWrapper): + def __init__(self, env, dim): + """Warp frames to the specified size (dim x dim).""" + gym.ObservationWrapper.__init__(self, env) + self.width = dim + self.height = dim + self.observation_space = spaces.Box( + low=0, + high=255, + shape=(self.height, self.width, 1), + dtype=np.uint8) + + def observation(self, frame): + frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY) + frame = cv2.resize( + frame, (self.width, self.height), interpolation=cv2.INTER_AREA) + return frame[:, :, None] + + +# TODO: (sven) Deprecated class. Remove once traj. view is the norm. +class FrameStack(gym.Wrapper): + def __init__(self, env, k): + """Stack k last frames.""" + gym.Wrapper.__init__(self, env) + self.k = k + self.frames = deque([], maxlen=k) + shp = env.observation_space.shape + self.observation_space = spaces.Box( + low=0, + high=255, + shape=(shp[0], shp[1], shp[2] * k), + dtype=env.observation_space.dtype) + + def reset(self): + ob = self.env.reset() + for _ in range(self.k): + self.frames.append(ob) + return self._get_ob() + + def step(self, action): + ob, reward, done, info = self.env.step(action) + self.frames.append(ob) + return self._get_ob(), reward, done, info + + def _get_ob(self): + assert len(self.frames) == self.k + return np.concatenate(self.frames, axis=2) + + +class FrameStackTrajectoryView(gym.ObservationWrapper): + def __init__(self, env): + """No stacking. Trajectory View API takes care of this.""" + gym.Wrapper.__init__(self, env) + shp = env.observation_space.shape + assert shp[2] == 1 + self.observation_space = spaces.Box( + low=0, + high=255, + shape=(shp[0], shp[1]), + dtype=env.observation_space.dtype) + + def observation(self, observation): + return np.squeeze(observation, axis=-1) + + +class ScaledFloatFrame(gym.ObservationWrapper): + def __init__(self, env): + gym.ObservationWrapper.__init__(self, env) + self.observation_space = gym.spaces.Box( + low=0, high=1, shape=env.observation_space.shape, dtype=np.float32) + + def observation(self, observation): + # careful! This undoes the memory optimization, use + # with smaller replay buffers only. + return np.array(observation).astype(np.float32) / 255.0 + + +def wrap_deepmind( + env, + dim=84, + # TODO: (sven) Remove once traj. view is norm. + framestack=True, + framestack_via_traj_view_api=False): + """Configure environment for DeepMind-style Atari. + + Note that we assume reward clipping is done outside the wrapper. + + Args: + dim (int): Dimension to resize observations to (dim x dim). + framestack (bool): Whether to framestack observations. + """ + env = MonitorEnv(env) + env = NoopResetEnv(env, noop_max=30) + if env.spec is not None and "NoFrameskip" in env.spec.id: + env = MaxAndSkipEnv(env, skip=4) + env = EpisodicLifeEnv(env) + if "FIRE" in env.unwrapped.get_action_meanings(): + env = FireResetEnv(env) + env = WarpFrame(env, dim) + # env = ScaledFloatFrame(env) # TODO: use for dqn? + # env = ClipRewardEnv(env) # reward clipping is handled by policy eval + # New way of frame stacking via the trajectory view API (model config key: + # `num_framestacks=[int]`. + if framestack_via_traj_view_api: + env = FrameStackTrajectoryView(env) + # Old way (w/o traj. view API) via model config key: `framestack=True`. + # TODO: (sven) Remove once traj. view is norm. + elif framestack is True: + env = FrameStack(env, 4) + return env diff --git a/rllib/env/wrappers/dm_control_wrapper.py b/rllib/env/wrappers/dm_control_wrapper.py new file mode 100644 index 000000000..6734e2a3a --- /dev/null +++ b/rllib/env/wrappers/dm_control_wrapper.py @@ -0,0 +1,203 @@ +""" +DeepMind Control Suite Wrapper directly sourced from: +https://github.com/denisyarats/dmc2gym + +MIT License + +Copyright (c) 2020 Denis Yarats + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" +from gym import core, spaces +try: + from dm_env import specs +except ImportError: + specs = None +try: + from dm_control import suite +except ImportError: + suite = None +import numpy as np + + +def _spec_to_box(spec): + def extract_min_max(s): + assert s.dtype == np.float64 or s.dtype == np.float32 + dim = np.int(np.prod(s.shape)) + if type(s) == specs.Array: + bound = np.inf * np.ones(dim, dtype=np.float32) + return -bound, bound + elif type(s) == specs.BoundedArray: + zeros = np.zeros(dim, dtype=np.float32) + return s.minimum + zeros, s.maximum + zeros + + mins, maxs = [], [] + for s in spec: + mn, mx = extract_min_max(s) + mins.append(mn) + maxs.append(mx) + low = np.concatenate(mins, axis=0) + high = np.concatenate(maxs, axis=0) + assert low.shape == high.shape + return spaces.Box(low, high, dtype=np.float32) + + +def _flatten_obs(obs): + obs_pieces = [] + for v in obs.values(): + flat = np.array([v]) if np.isscalar(v) else v.ravel() + obs_pieces.append(flat) + return np.concatenate(obs_pieces, axis=0) + + +class DMCEnv(core.Env): + def __init__(self, + domain_name, + task_name, + task_kwargs=None, + visualize_reward=False, + from_pixels=False, + height=64, + width=64, + camera_id=0, + frame_skip=2, + environment_kwargs=None, + channels_first=True, + preprocess=True): + self._from_pixels = from_pixels + self._height = height + self._width = width + self._camera_id = camera_id + self._frame_skip = frame_skip + self._channels_first = channels_first + self.preprocess = preprocess + + if specs is None: + raise RuntimeError(( + "The `specs` module from `dm_env` was not imported. Make sure " + "`dm_env` is installed and visible in the current python " + "environment.")) + if suite is None: + raise RuntimeError( + ("The `suite` module from `dm_control` was not imported. Make " + "sure `dm_control` is installed and visible in the current " + "python enviornment.")) + + # create task + self._env = suite.load( + domain_name=domain_name, + task_name=task_name, + task_kwargs=task_kwargs, + visualize_reward=visualize_reward, + environment_kwargs=environment_kwargs) + + # true and normalized action spaces + self._true_action_space = _spec_to_box([self._env.action_spec()]) + self._norm_action_space = spaces.Box( + low=-1.0, + high=1.0, + shape=self._true_action_space.shape, + dtype=np.float32) + + # create observation space + if from_pixels: + shape = [3, height, + width] if channels_first else [height, width, 3] + self._observation_space = spaces.Box( + low=0, high=255, shape=shape, dtype=np.uint8) + if preprocess: + self._observation_space = spaces.Box( + low=-0.5, high=0.5, shape=shape, dtype=np.float32) + else: + self._observation_space = _spec_to_box( + self._env.observation_spec().values()) + + self._state_space = _spec_to_box(self._env.observation_spec().values()) + + self.current_state = None + + def __getattr__(self, name): + return getattr(self._env, name) + + def _get_obs(self, time_step): + if self._from_pixels: + obs = self.render( + height=self._height, + width=self._width, + camera_id=self._camera_id) + if self._channels_first: + obs = obs.transpose(2, 0, 1).copy() + if self.preprocess: + obs = obs / 255.0 - 0.5 + else: + obs = _flatten_obs(time_step.observation) + return obs + + def _convert_action(self, action): + action = action.astype(np.float64) + true_delta = self._true_action_space.high - self._true_action_space.low + norm_delta = self._norm_action_space.high - self._norm_action_space.low + action = (action - self._norm_action_space.low) / norm_delta + action = action * true_delta + self._true_action_space.low + action = action.astype(np.float32) + return action + + @property + def observation_space(self): + return self._observation_space + + @property + def state_space(self): + return self._state_space + + @property + def action_space(self): + return self._norm_action_space + + def step(self, action): + assert self._norm_action_space.contains(action) + action = self._convert_action(action) + assert self._true_action_space.contains(action) + reward = 0 + extra = {"internal_state": self._env.physics.get_state().copy()} + + for _ in range(self._frame_skip): + time_step = self._env.step(action) + reward += time_step.reward or 0 + done = time_step.last() + if done: + break + obs = self._get_obs(time_step) + self.current_state = _flatten_obs(time_step.observation) + extra["discount"] = time_step.discount + return obs, reward, done, extra + + def reset(self): + time_step = self._env.reset() + self.current_state = _flatten_obs(time_step.observation) + obs = self._get_obs(time_step) + return obs + + def render(self, mode="rgb_array", height=None, width=None, camera_id=0): + assert mode == "rgb_array", "only support for rgb_array mode" + height = height or self._height + width = width or self._width + camera_id = camera_id or self._camera_id + return self._env.physics.render( + height=height, width=width, camera_id=camera_id) diff --git a/rllib/env/wrappers/dm_env_wrapper.py b/rllib/env/wrappers/dm_env_wrapper.py new file mode 100644 index 000000000..e402c4cc6 --- /dev/null +++ b/rllib/env/wrappers/dm_env_wrapper.py @@ -0,0 +1,94 @@ +import gym +from gym import spaces + +import numpy as np + +try: + from dm_env import specs +except ImportError: + specs = None + + +def _convert_spec_to_space(spec): + if isinstance(spec, dict): + return spaces.Dict( + {k: _convert_spec_to_space(v) + for k, v in spec.items()}) + if isinstance(spec, specs.DiscreteArray): + return spaces.Discrete(spec.num_values) + elif isinstance(spec, specs.BoundedArray): + return spaces.Box( + low=np.asscalar(spec.minimum), + high=np.asscalar(spec.maximum), + shape=spec.shape, + dtype=spec.dtype) + elif isinstance(spec, specs.Array): + return spaces.Box( + low=-float("inf"), + high=float("inf"), + shape=spec.shape, + dtype=spec.dtype) + + raise NotImplementedError( + ("Could not convert `Array` spec of type {} to Gym space. " + "Attempted to convert: {}").format(type(spec), spec)) + + +class DMEnv(gym.Env): + """A `gym.Env` wrapper for the `dm_env` API. + """ + + metadata = {"render.modes": ["rgb_array"]} + + def __init__(self, dm_env): + super(DMEnv, self).__init__() + self._env = dm_env + self._prev_obs = None + + if specs is None: + raise RuntimeError(( + "The `specs` module from `dm_env` was not imported. Make sure " + "`dm_env` is installed and visible in the current python " + "environment.")) + + def step(self, action): + ts = self._env.step(action) + + reward = ts.reward + if reward is None: + reward = 0. + + return ts.observation, reward, ts.last(), {"discount": ts.discount} + + def reset(self): + ts = self._env.reset() + return ts.observation + + def render(self, mode="rgb_array"): + if self._prev_obs is None: + raise ValueError( + "Environment not started. Make sure to reset before rendering." + ) + + if mode == "rgb_array": + return self._prev_obs + else: + raise NotImplementedError( + "Render mode '{}' is not supported.".format(mode)) + + @property + def action_space(self): + spec = self._env.action_spec() + return _convert_spec_to_space(spec) + + @property + def observation_space(self): + spec = self._env.observation_spec() + return _convert_spec_to_space(spec) + + @property + def reward_range(self): + spec = self._env.reward_spec() + if isinstance(spec, specs.BoundedArray): + return spec.minimum, spec.maximum + return -float("inf"), float("inf") diff --git a/rllib/env/wrappers/group_agents_wrapper.py b/rllib/env/wrappers/group_agents_wrapper.py new file mode 100644 index 000000000..322bdaf2a --- /dev/null +++ b/rllib/env/wrappers/group_agents_wrapper.py @@ -0,0 +1,117 @@ +from collections import OrderedDict + +from ray.rllib.env.multi_agent_env import MultiAgentEnv + +# info key for the individual rewards of an agent, for example: +# info: { +# group_1: { +# _group_rewards: [5, -1, 1], # 3 agents in this group +# } +# } +GROUP_REWARDS = "_group_rewards" + +# info key for the individual infos of an agent, for example: +# info: { +# group_1: { +# _group_infos: [{"foo": ...}, {}], # 2 agents in this group +# } +# } +GROUP_INFO = "_group_info" + + +class GroupAgentsWrapper(MultiAgentEnv): + """Wraps a MultiAgentEnv environment with agents grouped as specified. + + See multi_agent_env.py for the specification of groups. + + This API is experimental. + """ + + def __init__(self, env, groups, obs_space=None, act_space=None): + """Wrap an existing multi-agent env to group agents together. + + See MultiAgentEnv.with_agent_groups() for usage info. + + Args: + env (MultiAgentEnv): env to wrap + groups (dict): Grouping spec as documented in MultiAgentEnv. + obs_space (Space): Optional observation space for the grouped + env. Must be a tuple space. + act_space (Space): Optional action space for the grouped env. + Must be a tuple space. + """ + + self.env = env + self.groups = groups + self.agent_id_to_group = {} + for group_id, agent_ids in groups.items(): + for agent_id in agent_ids: + if agent_id in self.agent_id_to_group: + raise ValueError( + "Agent id {} is in multiple groups".format( + agent_id, groups)) + self.agent_id_to_group[agent_id] = group_id + if obs_space is not None: + self.observation_space = obs_space + if act_space is not None: + self.action_space = act_space + + def reset(self): + obs = self.env.reset() + return self._group_items(obs) + + def step(self, action_dict): + # Ungroup and send actions + action_dict = self._ungroup_items(action_dict) + obs, rewards, dones, infos = self.env.step(action_dict) + + # Apply grouping transforms to the env outputs + obs = self._group_items(obs) + rewards = self._group_items( + rewards, agg_fn=lambda gvals: list(gvals.values())) + dones = self._group_items( + dones, agg_fn=lambda gvals: all(gvals.values())) + infos = self._group_items( + infos, agg_fn=lambda gvals: {GROUP_INFO: list(gvals.values())}) + + # Aggregate rewards, but preserve the original values in infos + for agent_id, rew in rewards.items(): + if isinstance(rew, list): + rewards[agent_id] = sum(rew) + if agent_id not in infos: + infos[agent_id] = {} + infos[agent_id][GROUP_REWARDS] = rew + + return obs, rewards, dones, infos + + def _ungroup_items(self, items): + out = {} + for agent_id, value in items.items(): + if agent_id in self.groups: + assert len(value) == len(self.groups[agent_id]), \ + (agent_id, value, self.groups) + for a, v in zip(self.groups[agent_id], value): + out[a] = v + else: + out[agent_id] = value + return out + + def _group_items(self, items, agg_fn=lambda gvals: list(gvals.values())): + grouped_items = {} + for agent_id, item in items.items(): + if agent_id in self.agent_id_to_group: + group_id = self.agent_id_to_group[agent_id] + if group_id in grouped_items: + continue # already added + group_out = OrderedDict() + for a in self.groups[group_id]: + if a in items: + group_out[a] = items[a] + else: + raise ValueError( + "Missing member of group {}: {}: {}".format( + group_id, a, items)) + grouped_items[group_id] = agg_fn(group_out) + else: + grouped_items[agent_id] = item + return grouped_items diff --git a/rllib/env/wrappers/kaggle_wrapper.py b/rllib/env/wrappers/kaggle_wrapper.py index 4586aa16a..cd3a2dee6 100644 --- a/rllib/env/wrappers/kaggle_wrapper.py +++ b/rllib/env/wrappers/kaggle_wrapper.py @@ -5,8 +5,10 @@ Source: https://github.com/Kaggle/kaggle-environments from copy import deepcopy from typing import Any, Dict, Optional, Tuple - -import kaggle_environments +try: + import kaggle_environments +except ImportError: + pass import numpy as np from gym.spaces import Box from gym.spaces import Dict as DictSpace diff --git a/rllib/env/wrappers/model_vector_env.py b/rllib/env/wrappers/model_vector_env.py new file mode 100644 index 000000000..b5b9700d6 --- /dev/null +++ b/rllib/env/wrappers/model_vector_env.py @@ -0,0 +1,134 @@ +import logging +import numpy as np +from gym.spaces import Discrete +from ray.rllib.utils.annotations import override +from ray.rllib.env.vector_env import VectorEnv +from ray.rllib.evaluation.rollout_worker import get_global_worker +from ray.rllib.env.base_env import BaseEnv +from ray.rllib.utils.typing import EnvType + +logger = logging.getLogger(__name__) + + +def model_vector_env(env: EnvType) -> BaseEnv: + """Returns a VectorizedEnv wrapper around the given environment. + + To obtain worker configs, one can call get_global_worker(). + + Args: + env (EnvType): The input environment (of any supported environment + type) to be convert to a _VectorizedModelGymEnv (wrapped as + an RLlib BaseEnv). + + Returns: + BaseEnv: The BaseEnv converted input `env`. + """ + worker = get_global_worker() + worker_index = worker.worker_index + if worker_index: + env = _VectorizedModelGymEnv( + make_env=worker.make_env_fn, + existing_envs=[env], + num_envs=worker.num_envs, + observation_space=env.observation_space, + action_space=env.action_space, + ) + return BaseEnv.to_base_env( + env, + make_env=worker.make_env_fn, + num_envs=worker.num_envs, + remote_envs=False, + remote_env_batch_wait_ms=0) + + +class _VectorizedModelGymEnv(VectorEnv): + """Vectorized Environment Wrapper for MB-MPO. + + Primary change is in the `vector_step` method, which calls the dynamics + models for next_obs "calculation" (instead of the actual env). Also, the + actual envs need to have two extra methods implemented: `reward(obs)` and + (optionally) `done(obs)`. If `done` is not implemented, we will assume + that episodes in the env do not terminate, ever. + """ + + def __init__(self, + make_env=None, + existing_envs=None, + num_envs=1, + *, + observation_space=None, + action_space=None, + env_config=None): + self.make_env = make_env + self.envs = existing_envs + self.num_envs = num_envs + while len(self.envs) < num_envs: + self.envs.append(self.make_env(len(self.envs))) + + super().__init__( + observation_space=observation_space + or self.envs[0].observation_space, + action_space=action_space or self.envs[0].action_space, + num_envs=num_envs) + worker = get_global_worker() + self.model, self.device = worker.foreach_policy( + lambda x, y: (x.dynamics_model, x.device))[0] + + @override(VectorEnv) + def vector_reset(self): + """Override parent to store actual env obs for upcoming predictions. + """ + self.cur_obs = [e.reset() for e in self.envs] + return self.cur_obs + + @override(VectorEnv) + def reset_at(self, index): + """Override parent to store actual env obs for upcoming predictions. + """ + obs = self.envs[index].reset() + self.cur_obs[index] = obs + return obs + + @override(VectorEnv) + def vector_step(self, actions): + if self.cur_obs is None: + raise ValueError("Need to reset env first") + + # If discrete, need to one-hot actions + if isinstance(self.action_space, Discrete): + act = np.array(actions) + new_act = np.zeros((act.size, act.max() + 1)) + new_act[np.arange(act.size), act] = 1 + actions = new_act.astype("float32") + + # Batch the TD-model prediction. + obs_batch = np.stack(self.cur_obs, axis=0) + action_batch = np.stack(actions, axis=0) + # Predict the next observation, given previous a) real obs + # (after a reset), b) predicted obs (any other time). + next_obs_batch = self.model.predict_model_batches( + obs_batch, action_batch, device=self.device) + next_obs_batch = np.clip(next_obs_batch, -1000, 1000) + + # Call env's reward function. + # Note: Each actual env must implement one to output exact rewards. + rew_batch = self.envs[0].reward(obs_batch, action_batch, + next_obs_batch) + + # If env has a `done` method, use it. + if hasattr(self.envs[0], "done"): + dones_batch = self.envs[0].done(next_obs_batch) + # Otherwise, assume the episode does not end. + else: + dones_batch = np.asarray([False for _ in range(self.num_envs)]) + + info_batch = [{} for _ in range(self.num_envs)] + + self.cur_obs = next_obs_batch + + return list(next_obs_batch), list(rew_batch), list( + dones_batch), info_batch + + @override(VectorEnv) + def get_unwrapped(self): + return self.envs diff --git a/rllib/env/wrappers/pettingzoo_env.py b/rllib/env/wrappers/pettingzoo_env.py new file mode 100644 index 000000000..9c45b6224 --- /dev/null +++ b/rllib/env/wrappers/pettingzoo_env.py @@ -0,0 +1,185 @@ +from ray.rllib.env.multi_agent_env import MultiAgentEnv + + +class PettingZooEnv(MultiAgentEnv): + """An interface to the PettingZoo MARL environment library. + + See: https://github.com/PettingZoo-Team/PettingZoo + + Inherits from MultiAgentEnv and exposes a given AEC + (actor-environment-cycle) game from the PettingZoo project via the + MultiAgentEnv public API. + + Note that the wrapper has some important limitations: + + 1. All agents have the same action_spaces and observation_spaces. + Note: If, within your aec game, agents do not have homogeneous action / + observation spaces, apply SuperSuit wrappers + to apply padding functionality: https://github.com/PettingZoo-Team/ + SuperSuit#built-in-multi-agent-only-functions + 2. Environments are positive sum games (-> Agents are expected to cooperate + to maximize reward). This isn't a hard restriction, it just that + standard algorithms aren't expected to work well in highly competitive + games. + + Examples: + >>> from pettingzoo.butterfly import prison_v2 + >>> env = PettingZooEnv(prison_v2.env()) + >>> obs = env.reset() + >>> print(obs) + # only returns the observation for the agent which should be stepping + { + 'prisoner_0': array([[[0, 0, 0], + [0, 0, 0], + [0, 0, 0], + ..., + [0, 0, 0], + [0, 0, 0], + [0, 0, 0]]], dtype=uint8) + } + >>> obs, rewards, dones, infos = env.step({ + ... "prisoner_0": 1 + ... }) + # only returns the observation, reward, info, etc, for + # the agent who's turn is next. + >>> print(obs) + { + 'prisoner_1': array([[[0, 0, 0], + [0, 0, 0], + [0, 0, 0], + ..., + [0, 0, 0], + [0, 0, 0], + [0, 0, 0]]], dtype=uint8) + } + >>> print(rewards) + { + 'prisoner_1': 0 + } + >>> print(dones) + { + 'prisoner_1': False, '__all__': False + } + >>> print(infos) + { + 'prisoner_1': {'map_tuple': (1, 0)} + } + """ + + def __init__(self, env): + self.env = env + # agent idx list + self.agents = self.env.possible_agents + + # Get dictionaries of obs_spaces and act_spaces + self.observation_spaces = self.env.observation_spaces + self.action_spaces = self.env.action_spaces + + # Get first observation space, assuming all agents have equal space + self.observation_space = self.observation_spaces[self.agents[0]] + + # Get first action space, assuming all agents have equal space + self.action_space = self.action_spaces[self.agents[0]] + + assert all(obs_space == self.observation_space + for obs_space + in self.env.observation_spaces.values()), \ + "Observation spaces for all agents must be identical. Perhaps " \ + "SuperSuit's pad_observations wrapper can help (useage: " \ + "`supersuit.aec_wrappers.pad_observations(env)`" + + assert all(act_space == self.action_space + for act_space in self.env.action_spaces.values()), \ + "Action spaces for all agents must be identical. Perhaps " \ + "SuperSuit's pad_action_space wrapper can help (useage: " \ + "`supersuit.aec_wrappers.pad_action_space(env)`" + + self.reset() + + def reset(self): + self.env.reset() + return { + self.env.agent_selection: self.env.observe( + self.env.agent_selection) + } + + def step(self, action): + self.env.step(action[self.env.agent_selection]) + obs_d = {} + rew_d = {} + done_d = {} + info_d = {} + while self.env.agents: + obs, rew, done, info = self.env.last() + a = self.env.agent_selection + obs_d[a] = obs + rew_d[a] = rew + done_d[a] = done + info_d[a] = info + if self.env.dones[self.env.agent_selection]: + self.env.step(None) + else: + break + + all_done = not self.env.agents + done_d["__all__"] = all_done + + return obs_d, rew_d, done_d, info_d + + def close(self): + self.env.close() + + def seed(self, seed=None): + self.env.seed(seed) + + def render(self, mode="human"): + return self.env.render(mode) + + +class ParallelPettingZooEnv(MultiAgentEnv): + def __init__(self, env): + self.par_env = env + # agent idx list + self.agents = self.par_env.possible_agents + + # Get dictionaries of obs_spaces and act_spaces + self.observation_spaces = self.par_env.observation_spaces + self.action_spaces = self.par_env.action_spaces + + # Get first observation space, assuming all agents have equal space + self.observation_space = self.observation_spaces[self.agents[0]] + + # Get first action space, assuming all agents have equal space + self.action_space = self.action_spaces[self.agents[0]] + + assert all(obs_space == self.observation_space + for obs_space + in self.par_env.observation_spaces.values()), \ + "Observation spaces for all agents must be identical. Perhaps " \ + "SuperSuit's pad_observations wrapper can help (useage: " \ + "`supersuit.aec_wrappers.pad_observations(env)`" + + assert all(act_space == self.action_space + for act_space in self.par_env.action_spaces.values()), \ + "Action spaces for all agents must be identical. Perhaps " \ + "SuperSuit's pad_action_space wrapper can help (useage: " \ + "`supersuit.aec_wrappers.pad_action_space(env)`" + + self.reset() + + def reset(self): + return self.par_env.reset() + + def step(self, action_dict): + obss, rews, dones, infos = self.par_env.step(action_dict) + dones["__all__"] = all(dones.values()) + return obss, rews, dones, infos + + def close(self): + self.par_env.close() + + def seed(self, seed=None): + self.par_env.seed(seed) + + def render(self, mode="human"): + return self.par_env.render(mode) diff --git a/rllib/env/wrappers/recsim_wrapper.py b/rllib/env/wrappers/recsim_wrapper.py index ad846262a..3344cbb6e 100644 --- a/rllib/env/wrappers/recsim_wrapper.py +++ b/rllib/env/wrappers/recsim_wrapper.py @@ -5,12 +5,11 @@ Source: https://github.com/google-research/recsim """ from collections import OrderedDict -from typing import List - import gym -import numpy as np from gym import spaces +import numpy as np from recsim.environments import interest_evolution +from typing import List from ray.rllib.utils.error import UnsupportedSpaceException from ray.tune.registry import register_env diff --git a/rllib/env/wrappers/tests/test_group_agents_wrapper.py b/rllib/env/wrappers/tests/test_group_agents_wrapper.py new file mode 100644 index 000000000..3d5fcba30 --- /dev/null +++ b/rllib/env/wrappers/tests/test_group_agents_wrapper.py @@ -0,0 +1,30 @@ +import unittest + +from ray.rllib.env.wrappers.group_agents_wrapper import GroupAgentsWrapper +from ray.rllib.env.multi_agent_env import make_multi_agent + + +class TestGroupAgentsWrapper(unittest.TestCase): + def test_group_agents_wrapper(self): + MultiAgentCartPole = make_multi_agent("CartPole-v0") + grouped_ma_cartpole = GroupAgentsWrapper( + env=MultiAgentCartPole({ + "num_agents": 4 + }), + groups={ + "group1": [0, 1], + "group2": [2, 3] + }) + obs = grouped_ma_cartpole.reset() + self.assertTrue(len(obs) == 2) + self.assertTrue("group1" in obs and "group2" in obs) + self.assertTrue( + isinstance(obs["group1"], list) and len(obs["group1"]) == 2) + self.assertTrue( + isinstance(obs["group2"], list) and len(obs["group2"]) == 2) + + +if __name__ == "__main__": + import sys + import pytest + sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib/env/wrappers/tests/test_kaggle_wrapper.py b/rllib/env/wrappers/tests/test_kaggle_wrapper.py index 56300cbc7..3584e29a7 100644 --- a/rllib/env/wrappers/tests/test_kaggle_wrapper.py +++ b/rllib/env/wrappers/tests/test_kaggle_wrapper.py @@ -1,9 +1,7 @@ +from kaggle_environments.utils import structify import unittest -from kaggle_environments.utils import structify - -from ray.rllib.env.wrappers.kaggle_wrapper import \ - KaggleFootballMultiAgentEnv +from ray.rllib.env.wrappers.kaggle_wrapper import KaggleFootballMultiAgentEnv class TestKaggleFootballMultiAgentEnv(unittest.TestCase): diff --git a/rllib/env/wrappers/tests/test_recsim_wrapper.py b/rllib/env/wrappers/tests/test_recsim_wrapper.py index 3211dca71..d1b3e3b1b 100644 --- a/rllib/env/wrappers/tests/test_recsim_wrapper.py +++ b/rllib/env/wrappers/tests/test_recsim_wrapper.py @@ -1,6 +1,5 @@ -import unittest - import gym +import unittest from ray.rllib.env.wrappers.recsim_wrapper import ( MultiDiscreteToDiscreteActionWrapper, make_recsim_env) diff --git a/rllib/env/wrappers/unity3d_env.py b/rllib/env/wrappers/unity3d_env.py new file mode 100644 index 000000000..753c23443 --- /dev/null +++ b/rllib/env/wrappers/unity3d_env.py @@ -0,0 +1,275 @@ +from gym.spaces import Box, MultiDiscrete, Tuple as TupleSpace +import logging +import numpy as np +import random +import time +from typing import Callable, Optional, Tuple + +from ray.rllib.env.multi_agent_env import MultiAgentEnv +from ray.rllib.utils.annotations import override +from ray.rllib.utils.typing import MultiAgentDict, PolicyID, AgentID + +logger = logging.getLogger(__name__) + + +class Unity3DEnv(MultiAgentEnv): + """A MultiAgentEnv representing a single Unity3D game instance. + + For an example on how to use this Env with a running Unity3D editor + or with a compiled game, see: + `rllib/examples/unity3d_env_local.py` + For an example on how to use it inside a Unity game client, which + connects to an RLlib Policy server, see: + `rllib/examples/serving/unity3d_[client|server].py` + + Supports all Unity3D (MLAgents) examples, multi- or single-agent and + gets converted automatically into an ExternalMultiAgentEnv, when used + inside an RLlib PolicyClient for cloud/distributed training of Unity games. + """ + + _BASE_PORT = 5004 + + def __init__(self, + file_name: str = None, + port: Optional[int] = None, + seed: int = 0, + no_graphics: bool = False, + timeout_wait: int = 300, + episode_horizon: int = 1000): + """Initializes a Unity3DEnv object. + + Args: + file_name (Optional[str]): Name of the Unity game binary. + If None, will assume a locally running Unity3D editor + to be used, instead. + port (Optional[int]): Port number to connect to Unity environment. + seed (int): A random seed value to use for the Unity3D game. + no_graphics (bool): Whether to run the Unity3D simulator in + no-graphics mode. Default: False. + timeout_wait (int): Time (in seconds) to wait for connection from + the Unity3D instance. + episode_horizon (int): A hard horizon to abide to. After at most + this many steps (per-agent episode `step()` calls), the + Unity3D game is reset and will start again (finishing the + multi-agent episode that the game represents). + Note: The game itself may contain its own episode length + limits, which are always obeyed (on top of this value here). + """ + + super().__init__() + + if file_name is None: + print( + "No game binary provided, will use a running Unity editor " + "instead.\nMake sure you are pressing the Play (|>) button in " + "your editor to start.") + + import mlagents_envs + from mlagents_envs.environment import UnityEnvironment + + # Try connecting to the Unity3D game instance. If a port is blocked + while True: + # Sleep for random time to allow for concurrent startup of many + # environments (num_workers >> 1). Otherwise, would lead to port + # conflicts sometimes. + time.sleep(random.randint(1, 10)) + port_ = port or self._BASE_PORT + self._BASE_PORT += 1 + try: + self.unity_env = UnityEnvironment( + file_name=file_name, + worker_id=0, + base_port=port_, + seed=seed, + no_graphics=no_graphics, + timeout_wait=timeout_wait, + ) + print("Created UnityEnvironment for port {}".format(port_)) + except mlagents_envs.exception.UnityWorkerInUseException: + pass + else: + break + + # Reset entire env every this number of step calls. + self.episode_horizon = episode_horizon + # Keep track of how many times we have called `step` so far. + self.episode_timesteps = 0 + + @override(MultiAgentEnv) + def step( + self, action_dict: MultiAgentDict + ) -> Tuple[MultiAgentDict, MultiAgentDict, MultiAgentDict, MultiAgentDict]: + """Performs one multi-agent step through the game. + + Args: + action_dict (dict): Multi-agent action dict with: + keys=agent identifier consisting of + [MLagents behavior name, e.g. "Goalie?team=1"] + "_" + + [Agent index, a unique MLAgent-assigned index per single agent] + + Returns: + tuple: + - obs: Multi-agent observation dict. + Only those observations for which to get new actions are + returned. + - rewards: Rewards dict matching `obs`. + - dones: Done dict with only an __all__ multi-agent entry in + it. __all__=True, if episode is done for all agents. + - infos: An (empty) info dict. + """ + + # Set only the required actions (from the DecisionSteps) in Unity3D. + all_agents = [] + for behavior_name in self.unity_env.behavior_specs: + for agent_id in self.unity_env.get_steps(behavior_name)[ + 0].agent_id_to_index.keys(): + key = behavior_name + "_{}".format(agent_id) + all_agents.append(key) + self.unity_env.set_action_for_agent(behavior_name, agent_id, + action_dict[key]) + # Do the step. + self.unity_env.step() + + obs, rewards, dones, infos = self._get_step_results() + + # Global horizon reached? -> Return __all__ done=True, so user + # can reset. Set all agents' individual `done` to True as well. + self.episode_timesteps += 1 + if self.episode_timesteps > self.episode_horizon: + return obs, rewards, dict({ + "__all__": True + }, **{agent_id: True + for agent_id in all_agents}), infos + + return obs, rewards, dones, infos + + @override(MultiAgentEnv) + def reset(self) -> MultiAgentDict: + """Resets the entire Unity3D scene (a single multi-agent episode).""" + self.episode_timesteps = 0 + self.unity_env.reset() + obs, _, _, _ = self._get_step_results() + return obs + + def _get_step_results(self): + """Collects those agents' obs/rewards that have to act in next `step`. + + Returns: + Tuple: + obs: Multi-agent observation dict. + Only those observations for which to get new actions are + returned. + rewards: Rewards dict matching `obs`. + dones: Done dict with only an __all__ multi-agent entry in it. + __all__=True, if episode is done for all agents. + infos: An (empty) info dict. + """ + obs = {} + rewards = {} + infos = {} + for behavior_name in self.unity_env.behavior_specs: + decision_steps, terminal_steps = self.unity_env.get_steps( + behavior_name) + # Important: Only update those sub-envs that are currently + # available within _env_state. + # Loop through all envs ("agents") and fill in, whatever + # information we have. + for agent_id, idx in decision_steps.agent_id_to_index.items(): + key = behavior_name + "_{}".format(agent_id) + os = tuple(o[idx] for o in decision_steps.obs) + os = os[0] if len(os) == 1 else os + obs[key] = os + rewards[key] = decision_steps.reward[idx] # rewards vector + for agent_id, idx in terminal_steps.agent_id_to_index.items(): + key = behavior_name + "_{}".format(agent_id) + # Only overwrite rewards (last reward in episode), b/c obs + # here is the last obs (which doesn't matter anyways). + # Unless key does not exist in obs. + if key not in obs: + os = tuple(o[idx] for o in terminal_steps.obs) + obs[key] = os = os[0] if len(os) == 1 else os + rewards[key] = terminal_steps.reward[idx] # rewards vector + + # Only use dones if all agents are done, then we should do a reset. + return obs, rewards, {"__all__": False}, infos + + @staticmethod + def get_policy_configs_for_game( + game_name: str) -> Tuple[dict, Callable[[AgentID], PolicyID]]: + + # The RLlib server must know about the Spaces that the Client will be + # using inside Unity3D, up-front. + obs_spaces = { + # 3DBall. + "3DBall": Box(float("-inf"), float("inf"), (8, )), + # 3DBallHard. + "3DBallHard": Box(float("-inf"), float("inf"), (45, )), + # Pyramids. + "Pyramids": TupleSpace([ + Box(float("-inf"), float("inf"), (56, )), + Box(float("-inf"), float("inf"), (56, )), + Box(float("-inf"), float("inf"), (56, )), + Box(float("-inf"), float("inf"), (4, )), + ]), + # SoccerStrikersVsGoalie. + "Goalie": Box(float("-inf"), float("inf"), (738, )), + "Striker": TupleSpace([ + Box(float("-inf"), float("inf"), (231, )), + Box(float("-inf"), float("inf"), (63, )), + ]), + # Tennis. + "Tennis": Box(float("-inf"), float("inf"), (27, )), + # VisualHallway. + "VisualHallway": Box(float("-inf"), float("inf"), (84, 84, 3)), + # Walker. + "Walker": Box(float("-inf"), float("inf"), (212, )), + # FoodCollector. + "FoodCollector": TupleSpace([ + Box(float("-inf"), float("inf"), (49, )), + Box(float("-inf"), float("inf"), (4, )), + ]), + } + action_spaces = { + # 3DBall. + "3DBall": Box( + float("-inf"), float("inf"), (2, ), dtype=np.float32), + # 3DBallHard. + "3DBallHard": Box( + float("-inf"), float("inf"), (2, ), dtype=np.float32), + # Pyramids. + "Pyramids": MultiDiscrete([5]), + # SoccerStrikersVsGoalie. + "Goalie": MultiDiscrete([3, 3, 3]), + "Striker": MultiDiscrete([3, 3, 3]), + # Tennis. + "Tennis": Box(float("-inf"), float("inf"), (3, )), + # VisualHallway. + "VisualHallway": MultiDiscrete([5]), + # Walker. + "Walker": Box(float("-inf"), float("inf"), (39, )), + # FoodCollector. + "FoodCollector": MultiDiscrete([3, 3, 3, 2]), + } + + # Policies (Unity: "behaviors") and agent-to-policy mapping fns. + if game_name == "SoccerStrikersVsGoalie": + policies = { + "Goalie": (None, obs_spaces["Goalie"], action_spaces["Goalie"], + {}), + "Striker": (None, obs_spaces["Striker"], + action_spaces["Striker"], {}), + } + + def policy_mapping_fn(agent_id): + return "Striker" if "Striker" in agent_id else "Goalie" + + else: + policies = { + game_name: (None, obs_spaces[game_name], + action_spaces[game_name], {}), + } + + def policy_mapping_fn(agent_id): + return game_name + + return policies, policy_mapping_fn diff --git a/rllib/evaluation/rollout_worker.py b/rllib/evaluation/rollout_worker.py index 4370ee93a..d0770cdf7 100644 --- a/rllib/evaluation/rollout_worker.py +++ b/rllib/evaluation/rollout_worker.py @@ -9,13 +9,13 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Type, TypeVar, \ TYPE_CHECKING, Union import ray -from ray.rllib.env.atari_wrappers import wrap_deepmind, is_atari from ray.rllib.env.base_env import BaseEnv from ray.rllib.env.env_context import EnvContext from ray.rllib.env.external_env import ExternalEnv from ray.rllib.env.multi_agent_env import MultiAgentEnv from ray.rllib.env.external_multi_agent_env import ExternalMultiAgentEnv from ray.rllib.env.vector_env import VectorEnv +from ray.rllib.env.wrappers.atari_wrappers import wrap_deepmind, is_atari from ray.rllib.evaluation.sampler import AsyncSampler, SyncSampler from ray.rllib.evaluation.rollout_metrics import RolloutMetrics from ray.rllib.models import ModelCatalog @@ -343,7 +343,8 @@ class RolloutWorker(ParallelIteratorWorker): elif log_level == "DEBUG": enable_periodic_logging() - env_context = EnvContext(env_config or {}, worker_index) + env_context = EnvContext( + env_config or {}, worker_index, num_workers=num_workers) self.env_context = env_context self.policy_config: TrainerConfigDict = policy_config if callbacks: diff --git a/rllib/evaluation/sampler.py b/rllib/evaluation/sampler.py index 09819cc14..eb81b65de 100644 --- a/rllib/evaluation/sampler.py +++ b/rllib/evaluation/sampler.py @@ -18,7 +18,8 @@ from ray.rllib.evaluation.rollout_metrics import RolloutMetrics from ray.rllib.evaluation.sample_batch_builder import \ MultiAgentSampleBatchBuilder from ray.rllib.env.base_env import BaseEnv, ASYNC_RESET_RETURN -from ray.rllib.env.atari_wrappers import get_wrapper_by_cls, MonitorEnv +from ray.rllib.env.wrappers.atari_wrappers import get_wrapper_by_cls, \ + MonitorEnv from ray.rllib.models.preprocessors import Preprocessor from ray.rllib.offline import InputReader from ray.rllib.policy.policy import clip_action, Policy diff --git a/rllib/examples/env/dm_control_suite.py b/rllib/examples/env/dm_control_suite.py index 165344794..9ba05da3c 100644 --- a/rllib/examples/env/dm_control_suite.py +++ b/rllib/examples/env/dm_control_suite.py @@ -1,4 +1,4 @@ -from ray.rllib.env.dm_control_wrapper import DMCEnv +from ray.rllib.env.wrappers.dm_control_wrapper import DMCEnv """ 8 Environments from Deepmind Control Suite """ diff --git a/rllib/examples/env/multi_agent.py b/rllib/examples/env/multi_agent.py index 096dea205..ded842427 100644 --- a/rllib/examples/env/multi_agent.py +++ b/rllib/examples/env/multi_agent.py @@ -1,38 +1,18 @@ import gym -from ray.rllib.env.multi_agent_env import MultiAgentEnv +from ray.rllib.env.multi_agent_env import MultiAgentEnv, make_multi_agent from ray.rllib.examples.env.mock_env import MockEnv, MockEnv2 from ray.rllib.examples.env.stateless_cartpole import StatelessCartPole +from ray.rllib.utils.deprecation import deprecation_warning def make_multiagent(env_name_or_creator): - class MultiEnv(MultiAgentEnv): - def __init__(self, config): - num = config.pop("num_agents", 1) - if isinstance(env_name_or_creator, str): - self.agents = [ - gym.make(env_name_or_creator) for _ in range(num) - ] - else: - self.agents = [env_name_or_creator(config) for _ in range(num)] - self.dones = set() - self.observation_space = self.agents[0].observation_space - self.action_space = self.agents[0].action_space - - def reset(self): - self.dones = set() - return {i: a.reset() for i, a in enumerate(self.agents)} - - def step(self, action_dict): - obs, rew, done, info = {}, {}, {}, {} - for i, action in action_dict.items(): - obs[i], rew[i], done[i], info[i] = self.agents[i].step(action) - if done[i]: - self.dones.add(i) - done["__all__"] = len(self.dones) == len(self.agents) - return obs, rew, done, info - - return MultiEnv + deprecation_warning( + old="ray.rllib.examples.env.multi_agent.make_multiagent", + new="ray.rllib.env.multi_agent_env.make_multi_agent", + error=False, + ) + return make_multi_agent(env_name_or_creator) class BasicMultiAgent(MultiAgentEnv): @@ -162,8 +142,8 @@ class RoundRobinMultiAgent(MultiAgentEnv): return obs, rew, done, info -MultiAgentCartPole = make_multiagent("CartPole-v0") -MultiAgentMountainCar = make_multiagent("MountainCarContinuous-v0") -MultiAgentPendulum = make_multiagent("Pendulum-v0") -MultiAgentStatelessCartPole = make_multiagent( +MultiAgentCartPole = make_multi_agent("CartPole-v0") +MultiAgentMountainCar = make_multi_agent("MountainCarContinuous-v0") +MultiAgentPendulum = make_multi_agent("Pendulum-v0") +MultiAgentStatelessCartPole = make_multi_agent( lambda config: StatelessCartPole(config)) diff --git a/rllib/examples/multi_agent_independent_learning.py b/rllib/examples/multi_agent_independent_learning.py index 62ce00f41..a929ea2b2 100644 --- a/rllib/examples/multi_agent_independent_learning.py +++ b/rllib/examples/multi_agent_independent_learning.py @@ -1,6 +1,6 @@ from ray import tune from ray.tune.registry import register_env -from ray.rllib.env.pettingzoo_env import PettingZooEnv +from ray.rllib.env.wrappers.pettingzoo_env import PettingZooEnv from pettingzoo.sisl import waterworld_v2 # Based on code from github.com/parametersharingmadrl/parametersharingmadrl diff --git a/rllib/examples/multi_agent_parameter_sharing.py b/rllib/examples/multi_agent_parameter_sharing.py index e53bc45ec..e49d84d11 100644 --- a/rllib/examples/multi_agent_parameter_sharing.py +++ b/rllib/examples/multi_agent_parameter_sharing.py @@ -1,6 +1,6 @@ from ray import tune from ray.tune.registry import register_env -from ray.rllib.env.pettingzoo_env import PettingZooEnv +from ray.rllib.env.wrappers.pettingzoo_env import PettingZooEnv from pettingzoo.sisl import waterworld_v0 # Based on code from github.com/parametersharingmadrl/parametersharingmadrl diff --git a/rllib/examples/serving/unity3d_client.py b/rllib/examples/serving/unity3d_client.py index 4f3784aa7..8c8784ebf 100644 --- a/rllib/examples/serving/unity3d_client.py +++ b/rllib/examples/serving/unity3d_client.py @@ -31,7 +31,7 @@ $ python unity3d_client.py --inference-mode=local --game [path to game binary] import argparse from ray.rllib.env.policy_client import PolicyClient -from ray.rllib.env.unity3d_env import Unity3DEnv +from ray.rllib.env.wrappers.unity3d_env import Unity3DEnv SERVER_ADDRESS = "localhost" SERVER_PORT = 9900 diff --git a/rllib/examples/serving/unity3d_server.py b/rllib/examples/serving/unity3d_server.py index 0a39d3e6b..56c1a0089 100755 --- a/rllib/examples/serving/unity3d_server.py +++ b/rllib/examples/serving/unity3d_server.py @@ -34,7 +34,7 @@ import ray from ray.tune import register_env from ray.rllib.agents.ppo import PPOTrainer from ray.rllib.env.policy_server_input import PolicyServerInput -from ray.rllib.env.unity3d_env import Unity3DEnv +from ray.rllib.env.wrappers.unity3d_env import Unity3DEnv from ray.rllib.examples.env.random_env import RandomMultiAgentEnv SERVER_ADDRESS = "localhost" diff --git a/rllib/examples/unity3d_env_local.py b/rllib/examples/unity3d_env_local.py index 6ffcc8d2b..7dea67ad6 100644 --- a/rllib/examples/unity3d_env_local.py +++ b/rllib/examples/unity3d_env_local.py @@ -25,7 +25,7 @@ import os import ray from ray import tune -from ray.rllib.env.unity3d_env import Unity3DEnv +from ray.rllib.env.wrappers.unity3d_env import Unity3DEnv from ray.rllib.utils.test_utils import check_learning_achieved parser = argparse.ArgumentParser() diff --git a/rllib/tuned_examples/ppo/unity3d-soccer-strikers-vs-goalie-ppo.yaml b/rllib/tuned_examples/ppo/unity3d-soccer-strikers-vs-goalie-ppo.yaml index 5a089f5a6..195bf6340 100644 --- a/rllib/tuned_examples/ppo/unity3d-soccer-strikers-vs-goalie-ppo.yaml +++ b/rllib/tuned_examples/ppo/unity3d-soccer-strikers-vs-goalie-ppo.yaml @@ -8,7 +8,7 @@ # goalie's reward (-1 if goal) across all within-scene parallelized playing # fields (8 fields with each 2 strikers + 1 goalie, for the soccer env). unity3d-soccer-strikers-vs-goalie-ppo: - env: ray.rllib.env.unity3d_env.Unity3DEnv + env: ray.rllib.env.wrappers.unity3d_env.Unity3DEnv run: PPO stop: timesteps_total: 1000000