diff --git a/doc/source/rllib-algorithms.rst b/doc/source/rllib-algorithms.rst index 09d94730b..cfe8ee90c 100644 --- a/doc/source/rllib-algorithms.rst +++ b/doc/source/rllib-algorithms.rst @@ -16,6 +16,7 @@ Algorithm Frameworks Discrete Actions Continuous Actions Multi- `ES`_ tf + torch **Yes** **Yes** No `DDPG`_, `TD3`_ tf + torch No **Yes** **Yes** `APEX-DDPG`_ tf + torch No **Yes** **Yes** +`Dreamer`_ torch No **Yes** No `+RNN`_ `DQN`_, `Rainbow`_ tf + torch **Yes** `+parametric`_ No **Yes** `APEX-DQN`_ tf + torch **Yes** `+parametric`_ No **Yes** `IMPALA`_ tf + torch **Yes** `+parametric`_ **Yes** **Yes** `+RNN`_, `+LSTM auto-wrapping`_, `+Transformer`_, `+autoreg`_ @@ -35,7 +36,7 @@ Algorithm Frameworks Discrete Actions Continuous Actions Multi- .. _`+LSTM auto-wrapping`: rllib-models.html#built-in-models .. _`+parametric`: rllib-models.html#variable-length-parametric-action-spaces .. _`+RNN`: rllib-models.html#recurrent-models -.. _`+Transformer`: rllib-models.html#attention-networks-transformers +.. _`+Transformer`: rllib-models.html#attention-networks .. _`A2C, A3C`: rllib-algorithms.html#a3c .. _`APEX-DQN`: rllib-algorithms.html#apex .. _`APEX-DDPG`: rllib-algorithms.html#apex @@ -304,22 +305,16 @@ SpaceInvaders 650 1001 1025 Policy Gradients ---------------- -|pytorch| |tensorflow| An `implementation `__ of a vanilla policy gradient algorithm for TensorFlow and PyTorch. - -**Papers**: -`[1] - Policy Gradient Methods for Reinforcement Learning with Function Approximation. `__ -and -`[2] - Simple Statistical Gradient-Following Algorithms for Connectionist Reinforcement Learning. `__ - +|pytorch| |tensorflow| +`[paper] `__ `[implementation] `__ We include a vanilla policy gradients implementation as an example algorithm. .. 
figure:: a2c-arch.svg Policy gradients architecture (same as A2C) -**Tuned examples**: `CartPole-v0 `__ +Tuned examples: `CartPole-v0 `__ -**PG-specific configs**: The following updates will overwrite/be added to the -(base) Trainer config in `rllib/agents/trainer.py `__ (*COMMON_CONFIG* dict): +**PG-specific configs** (see also `common configs `__): .. literalinclude:: ../../rllib/agents/pg/pg.py :language: python @@ -435,6 +430,35 @@ Tuned examples: HalfCheetahRandDirecEnv (`Env `__ `[implementation] `__ + +Dreamer is an image-only model-based RL method that learns by imagining trajectories in the future and is evaluated on the DeepMind Control Suite `environments `__. RLlib's Dreamer is adapted from the `official Google research repo `__. + +To visualize learning, RLlib Dreamer's imagined trajectories are logged as GIFs in TensorBoard. Examples can be seen `here `__. + +Tuned examples: `DeepMind Control Environments `__ + +**DeepMind Control results @1M steps:** `more details `__ + +============= ============== ====================== +DMC env RLlib Dreamer Danijar et al Dreamer +============= ============== ====================== +Walker-Walk 920 ~930 +Cheetah-Run 640 ~800 +============= ============== ====================== + +**Dreamer-specific configs** (see also `common configs `__): + +.. 
literalinclude:: ../../rllib/agents/dreamer/dreamer.py + :language: python + :start-after: __sphinx_doc_begin__ + :end-before: __sphinx_doc_end__ + Derivative-free ~~~~~~~~~~~~~~~ diff --git a/doc/source/rllib-toc.rst b/doc/source/rllib-toc.rst index e460305f5..1a22fcda0 100644 --- a/doc/source/rllib-toc.rst +++ b/doc/source/rllib-toc.rst @@ -104,6 +104,8 @@ Algorithms - |pytorch| |tensorflow| :ref:`Deep Deterministic Policy Gradients (DDPG, TD3) ` + - |pytorch| :ref:`Dreamer ` + - |pytorch| |tensorflow| :ref:`Deep Q Networks (DQN, Rainbow, Parametric DQN) ` - |pytorch| |tensorflow| :ref:`Model-Agnostic Meta-Learning (MAML) ` diff --git a/python/ray/tune/logger.py b/python/ray/tune/logger.py index 945c25537..54cf903c4 100644 --- a/python/ray/tune/logger.py +++ b/python/ray/tune/logger.py @@ -227,6 +227,13 @@ class TBXLogger(Logger): and len(value) > 0) or (type(value) == np.ndarray and value.size > 0): valid_result[full_attr] = value + + # Must be video + if type(value) == np.ndarray and value.ndim == 5: + self._file_writer.add_video( + full_attr, value, global_step=step, fps=20) + continue + try: self._file_writer.add_histogram( full_attr, value, global_step=step) diff --git a/rllib/agents/dreamer/__init__.py b/rllib/agents/dreamer/__init__.py new file mode 100644 index 000000000..c71cd58cc --- /dev/null +++ b/rllib/agents/dreamer/__init__.py @@ -0,0 +1,6 @@ +from ray.rllib.agents.dreamer.dreamer import DREAMERTrainer, DEFAULT_CONFIG + +__all__ = [ + "DREAMERTrainer", + "DEFAULT_CONFIG", +] diff --git a/rllib/agents/dreamer/dreamer.py b/rllib/agents/dreamer/dreamer.py new file mode 100644 index 000000000..4543d1401 --- /dev/null +++ b/rllib/agents/dreamer/dreamer.py @@ -0,0 +1,267 @@ +import logging + +import random +import numpy as np + +from ray.rllib.agents import with_common_config +from ray.rllib.agents.dreamer.dreamer_torch_policy import DreamerTorchPolicy +from ray.rllib.agents.trainer_template import build_trainer +from ray.rllib.execution.common import 
STEPS_SAMPLED_COUNTER, \ + LEARNER_INFO, _get_shared_metrics +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID +from ray.rllib.evaluation.metrics import collect_metrics +from ray.rllib.agents.dreamer.dreamer_model import DreamerModel +from ray.rllib.execution.rollout_ops import ParallelRollouts +from ray.rllib.utils.typing import SampleBatchType + +logger = logging.getLogger(__name__) + +# yapf: disable +# __sphinx_doc_begin__ +DEFAULT_CONFIG = with_common_config({ + # PlaNET Model LR + "td_model_lr": 6e-4, + # Actor LR + "actor_lr": 8e-5, + # Critic LR + "critic_lr": 8e-5, + # Grad Clipping + "grad_clip": 100.0, + # Discount + "discount": 0.99, + # Lambda + "lambda": 0.95, + # Training iterations per data collection from real env + "dreamer_train_iters": 100, + # Horizon for Enviornment (1000 for Mujoco/DMC) + "horizon": 1000, + # Number of episodes to sample for Loss Calculation + "batch_size": 50, + # Length of each episode to sample for Loss Calculation + "batch_length": 50, + # Imagination Horizon for Training Actor and Critic + "imagine_horizon": 15, + # Free Nats + "free_nats": 3.0, + # KL Coeff for the Model Loss + "kl_coeff": 1.0, + # Distributed Dreamer not implemented yet + "num_workers": 0, + # Prefill Timesteps + "prefill_timesteps": 5000, + # This should be kept at 1 to preserve sample efficiency + "num_envs_per_worker": 1, + # Exploration Gaussian + "explore_noise": 0.3, + # Batch mode + "batch_mode": "complete_episodes", + # Custom Model + "dreamer_model": { + "custom_model": DreamerModel, + # RSSM/PlaNET parameters + "deter_size": 200, + "stoch_size": 30, + # CNN Decoder Encoder + "depth_size": 32, + # General Network Parameters + "hidden_size": 400, + # Action STD + "action_init_std": 5.0, + }, + + "env_config": { + # Repeats action send by policy for frame_skip times in env + "frame_skip": 2, + } +}) +# __sphinx_doc_end__ +# yapf: enable + + +class EpisodicBuffer(object): + def 
__init__(self, max_length: int = 1000, length: int = 50): + """Data structure that stores episodes and samples chunks + of size length from episodes + + Args: + max_length: Maximum episodes it can store + length: Episode chunking lengh in sample() + """ + + # Stores all episodes into a list: List[SampleBatchType] + self.episodes = [] + self.max_length = max_length + self.timesteps = 0 + self.length = length + + def add(self, batch: SampleBatchType): + """Splits a SampleBatch into episodes and adds episodes + to the episode buffer + + Args: + batch: SampleBatch to be added + """ + + self.timesteps += batch.count + episodes = batch.split_by_episode() + + for i, e in enumerate(episodes): + episodes[i] = self.preprocess_episode(e) + self.episodes.extend(episodes) + + if len(self.episodes) > self.max_length: + delta = len(self.episodes) - self.max_length + # Drop oldest episodes + self.episodes = self.episodes[delta:] + + def preprocess_episode(self, episode: SampleBatchType): + """Batch format should be in the form of (s_t, a_(t-1), r_(t-1)) + When t=0, the resetted obs is paired with action and reward of 0. 
+ + Args: + episode: SampleBatch representing an episode + """ + obs = episode["obs"] + new_obs = episode["new_obs"] + action = episode["actions"] + reward = episode["rewards"] + + act_shape = action.shape + act_reset = np.array([0.0] * act_shape[-1])[None] + rew_reset = np.array(0.0)[None] + obs_end = np.array(new_obs[act_shape[0] - 1])[None] + + batch_obs = np.concatenate([obs, obs_end], axis=0) + batch_action = np.concatenate([act_reset, action], axis=0) + batch_rew = np.concatenate([rew_reset, reward], axis=0) + + new_batch = { + "obs": batch_obs, + "rewards": batch_rew, + "actions": batch_action + } + return SampleBatch(new_batch) + + def sample(self, batch_size: int): + """Samples [batch_size, length] from the list of episodes + + Args: + batch_size: batch_size to be sampled + """ + episodes_buffer = [] + while len(episodes_buffer) < batch_size: + rand_index = random.randint(0, len(self.episodes) - 1) + episode = self.episodes[rand_index] + if episode.count < self.length: + continue + available = episode.count - self.length + index = int(random.randint(0, available)) + episodes_buffer.append(episode.slice(index, index + self.length)) + + batch = {} + for k in episodes_buffer[0].keys(): + batch[k] = np.stack([e[k] for e in episodes_buffer], axis=0) + + return SampleBatch(batch) + + +def total_sampled_timesteps(worker): + return worker.policy_map[DEFAULT_POLICY_ID].global_timestep + + +class DreamerIteration: + def __init__(self, worker, episode_buffer, dreamer_train_iters, batch_size, + act_repeat): + self.worker = worker + self.episode_buffer = episode_buffer + self.dreamer_train_iters = dreamer_train_iters + self.repeat = act_repeat + self.batch_size = batch_size + + def __call__(self, samples): + + # Dreamer Training Loop + for n in range(self.dreamer_train_iters): + logger.debug("Dreamer train iter %d", n) + batch = self.episode_buffer.sample(self.batch_size) + if n == self.dreamer_train_iters - 1: + batch["log_gif"] = True + fetches = self.worker.learn_on_batch(batch) + + # Custom 
Logging + policy_fetches = self.policy_stats(fetches) + if "log_gif" in policy_fetches: + gif = policy_fetches["log_gif"] + policy_fetches["log_gif"] = self.postprocess_gif(gif) + + # Metrics Calculation + metrics = _get_shared_metrics() + metrics.info[LEARNER_INFO] = fetches + metrics.counters[STEPS_SAMPLED_COUNTER] = self.episode_buffer.timesteps + metrics.counters[STEPS_SAMPLED_COUNTER] *= self.repeat + res = collect_metrics(local_worker=self.worker) + res["info"] = metrics.info + res["info"].update(metrics.counters) + res["timesteps_total"] = metrics.counters[STEPS_SAMPLED_COUNTER] + + self.episode_buffer.add(samples) + return res + + def postprocess_gif(self, gif: np.ndarray): + gif = np.clip(255 * gif, 0, 255).astype(np.uint8) + B, T, C, H, W = gif.shape + frames = gif.transpose((1, 2, 3, 0, 4)).reshape((1, T, C, H, B * W)) + return frames + + def policy_stats(self, fetches): + return fetches["default_policy"]["learner_stats"] + + +def execution_plan(workers, config): + # Special Replay Buffer for Dreamer agent + episode_buffer = EpisodicBuffer(length=config["batch_length"]) + + local_worker = workers.local_worker() + + # Prefill episode buffer with initial exploration (uniform sampling) + while total_sampled_timesteps(local_worker) < config["prefill_timesteps"]: + samples = local_worker.sample() + episode_buffer.add(samples) + + batch_size = config["batch_size"] + dreamer_train_iters = config["dreamer_train_iters"] + act_repeat = config["action_repeat"] + + rollouts = ParallelRollouts(workers) + rollouts = rollouts.for_each( + DreamerIteration(local_worker, episode_buffer, dreamer_train_iters, + batch_size, act_repeat)) + return rollouts + + +def get_policy_class(config): + return DreamerTorchPolicy + + +def validate_config(config): + config["action_repeat"] = config["env_config"]["frame_skip"] + if config["framework"] != "torch": + raise ValueError("Dreamer not supported in Tensorflow yet!") + if config["batch_mode"] != "complete_episodes": + raise 
ValueError("truncate_episodes not supported") + if config["num_workers"] != 0: + raise ValueError("Distributed Dreamer not supported yet!") + if config["clip_actions"]: + raise ValueError("Clipping is done inherently via policy tanh!") + if config["action_repeat"] > 1: + config["horizon"] = config["horizon"] / config["action_repeat"] + + +DREAMERTrainer = build_trainer( + name="Dreamer", + default_config=DEFAULT_CONFIG, + default_policy=DreamerTorchPolicy, + get_policy_class=get_policy_class, + execution_plan=execution_plan, + validate_config=validate_config) diff --git a/rllib/agents/dreamer/dreamer_model.py b/rllib/agents/dreamer/dreamer_model.py new file mode 100644 index 000000000..2daeee643 --- /dev/null +++ b/rllib/agents/dreamer/dreamer_model.py @@ -0,0 +1,559 @@ +import numpy as np +from typing import Any, List, Tuple +from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.utils.framework import TensorType + +torch, nn = try_import_torch() +if torch: + from torch import distributions as td + from ray.rllib.agents.dreamer.utils import Linear, Conv2d, \ + ConvTranspose2d, GRUCell, TanhBijector + +ActFunc = Any + +# Encoder, part of PlaNET +if torch: + + class ConvEncoder(nn.Module): + """Standard Convolutional Encoder for Dreamer. This encoder is used + to encode images frm an enviornment into a latent state for the + RSSM model in PlaNET. 
+ """ + + def __init__(self, + depth: int = 32, + act: ActFunc = None, + shape: List = [3, 64, 64]): + """Initializes Conv Encoder + + Args: + depth (int): Number of channels in the first conv layer + act (Any): Activation for Encoder, default ReLU + shape (List): Shape of observation input + """ + super().__init__() + self.act = act + if not act: + self.act = nn.ReLU + self.depth = depth + self.shape = shape + + init_channels = self.shape[0] + self.layers = [ + Conv2d(init_channels, self.depth, 4, stride=2), + self.act(), + Conv2d(self.depth, 2 * self.depth, 4, stride=2), + self.act(), + Conv2d(2 * self.depth, 4 * self.depth, 4, stride=2), + self.act(), + Conv2d(4 * self.depth, 8 * self.depth, 4, stride=2), + self.act(), + ] + self.model = nn.Sequential(*self.layers) + + def forward(self, x): + # Flatten to [batch*horizon, 3, 64, 64] in loss function + orig_shape = list(x.size()) + x = x.view(-1, *(orig_shape[-3:])) + x = self.model(x) + + new_shape = orig_shape[:-3] + [32 * self.depth] + x = x.view(*new_shape) + return x + + +if torch: + + class Reshape(nn.Module): + """Standard module that reshapes/views a tensor + """ + + def __init__(self, shape: List): + super().__init__() + self.shape = shape + + def forward(self, x): + return x.view(*self.shape) + + +# Decoder, part of PlaNET +if torch: + + class ConvDecoder(nn.Module): + """Standard Convolutional Decoder for Dreamer. This decoder is used + to decoder images from the latent state generated by the transition + dynamics model. This is used in calulating loss and logging gifs for + imagine trajectories. 
+ """ + + def __init__(self, + input_size: int, + depth: int = 32, + act: ActFunc = None, + shape: List = [3, 64, 64]): + """Initializes Conv Decoder + + Args: + input_size (int): Input size, usually feature size output from RSSM + depth (int): Number of channels in the first conv layer + act (Any): Activation for Encoder, default ReLU + shape (List): Shape of observation input + """ + super().__init__() + self.act = act + if not act: + self.act = nn.ReLU + self.depth = depth + self.shape = shape + + self.layers = [ + Linear(input_size, 32 * self.depth), + Reshape((-1, 32 * self.depth, 1, 1)), + ConvTranspose2d(32 * self.depth, 4 * self.depth, 5, stride=2), + self.act(), + ConvTranspose2d(4 * self.depth, 2 * self.depth, 5, stride=2), + self.act(), + ConvTranspose2d(2 * self.depth, self.depth, 6, stride=2), + self.act(), + ConvTranspose2d(self.depth, self.shape[0], 6, stride=2), + ] + self.model = nn.Sequential(*self.layers) + + def forward(self, x): + # x is [batch, hor_length, input_size] + orig_shape = list(x.size()) + x = self.model(x) + + reshape_size = orig_shape[:-1] + self.shape + mean = x.view(*reshape_size) + + # Equivalent to making a multivariate diag + return td.Independent(td.Normal(mean, 1), len(self.shape)) + + +# Reward Model (PlaNET), and Value Function +if torch: + + class DenseDecoder(nn.Module): + """Fully Connected network that outputs a distribution for calculating log_prob + later in DreamerLoss + """ + + def __init__(self, + input_size: int, + output_size: int, + layers: int, + units: int, + dist: str = "normal", + act: ActFunc = None): + """Initializes FC network + + Args: + input_size (int): Input size to network + output_size (int): Output size to network + layers (int): Number of layers in network + units (int): Size of the hidden layers + dist (str): Output distribution, parameterized by FC output logits + act (Any): Activation function + """ + super().__init__() + self.layrs = layers + self.units = units + self.act = act + if not act: 
+ self.act = nn.ELU + self.dist = dist + self.input_size = input_size + self.output_size = output_size + self.layers = [] + cur_size = input_size + for _ in range(self.layrs): + self.layers.extend([Linear(cur_size, self.units), self.act()]) + cur_size = units + self.layers.append(Linear(cur_size, output_size)) + self.model = nn.Sequential(*self.layers) + + def forward(self, x): + x = self.model(x) + if self.output_size == 1: + x = torch.squeeze(x) + if self.dist == "normal": + output_dist = td.Normal(x, 1) + elif self.dist == "binary": + output_dist = td.Bernoulli(logits=x) + else: + raise NotImplementedError("Distribution type not implemented!") + return td.Independent(output_dist, 0) + + +# Represents dreamer policy +if torch: + + class ActionDecoder(nn.Module): + """ActionDecoder is the policy module in Dreamer. It outputs a distribution + parameterized by mean and std, later to be transformed by a custom + TanhBijector in utils.py for Dreamer. + """ + + def __init__(self, + input_size: int, + action_size: int, + layers: int, + units: int, + dist: str = "tanh_normal", + act: ActFunc = None, + min_std: float = 1e-4, + init_std: float = 5.0, + mean_scale: float = 5.0): + """Initializes Policy + + Args: + input_size (int): Input size to network + action_size (int): Action space size + layers (int): Number of layers in network + units (int): Size of the hidden layers + dist (str): Output distribution, with tanh_normal implemented + act (Any): Activation function + min_std (float): Minimum std for output distribution + init_std (float): Intitial std + mean_scale (float): Augmenting mean output from FC network + """ + super().__init__() + self.layrs = layers + self.units = units + self.dist = dist + self.act = act + if not act: + self.act = nn.ReLU + self.min_std = min_std + self.init_std = init_std + self.mean_scale = mean_scale + self.action_size = action_size + + self.layers = [] + self.softplus = nn.Softplus() + + # MLP Construction + cur_size = input_size + for _ 
in range(self.layrs): + self.layers.extend([Linear(cur_size, self.units), self.act()]) + cur_size = self.units + if self.dist == "tanh_normal": + self.layers.append(Linear(cur_size, 2 * action_size)) + elif self.dist == "onehot": + self.layers.append(Linear(cur_size, action_size)) + self.model = nn.Sequential(*self.layers) + + # Returns distribution + def forward(self, x): + raw_init_std = np.log(np.exp(self.init_std) - 1) + x = self.model(x) + if self.dist == "tanh_normal": + mean, std = torch.chunk(x, 2, dim=-1) + mean = self.mean_scale * torch.tanh(mean / self.mean_scale) + std = self.softplus(std + raw_init_std) + self.min_std + dist = td.Normal(mean, std) + transforms = [TanhBijector()] + dist = td.transformed_distribution.TransformedDistribution( + dist, transforms) + dist = td.Independent(dist, 1) + elif self.dist == "onehot": + dist = td.OneHotCategorical(logits=x) + raise NotImplementedError("Atari not implemented yet!") + return dist + + +# Represents TD model in PlaNET +if torch: + + class RSSM(nn.Module): + """RSSM is the core recurrent part of the PlaNET module. It consists of + two networks, one (obs) to calculate posterior beliefs and states and + the second (img) to calculate prior beliefs and states. The prior network + takes in the previous state and action, while the posterior network takes + in the previous state, action, and a latent embedding of the most recent + observation. 
+ """ + + def __init__(self, + action_size: int, + embed_size: int, + stoch: int = 30, + deter: int = 200, + hidden: int = 200, + act: ActFunc = None): + """Initializes RSSM + + Args: + action_size (int): Action space size + embed_size (int): Size of ConvEncoder embedding + stoch (int): Size of the distributional hidden state + deter (int): Size of the deterministic hidden state + hidden (int): General size of hidden layers + act (Any): Activation function + """ + super().__init__() + self.stoch_size = stoch + self.deter_size = deter + self.hidden_size = hidden + self.act = act + if act is None: + self.act = nn.ELU + + self.obs1 = Linear(embed_size + deter, hidden) + self.obs2 = Linear(hidden, 2 * stoch) + + self.cell = GRUCell(self.hidden_size, hidden_size=self.deter_size) + self.img1 = Linear(stoch + action_size, hidden) + self.img2 = Linear(deter, hidden) + self.img3 = Linear(hidden, 2 * stoch) + + self.softplus = nn.Softplus + + self.device = (torch.device("cuda") if torch.cuda.is_available() + else torch.device("cpu")) + + def get_initial_state(self, batch_size: int) -> List[TensorType]: + """Returns the inital state for the RSSM, which consists of mean, std + for the stochastic state, the sampled stochastic hidden state + (from mean, std), and the deterministic hidden state, which is pushed + through the GRUCell. + + Args: + batch_size (int): Batch size for initial state + + Returns: + List of tensors + """ + return [ + torch.zeros(batch_size, self.stoch_size).to(self.device), + torch.zeros(batch_size, self.stoch_size).to(self.device), + torch.zeros(batch_size, self.stoch_size).to(self.device), + torch.zeros(batch_size, self.deter_size).to(self.device), + ] + + def observe(self, + embed: TensorType, + action: TensorType, + state: List[TensorType] = None + ) -> Tuple[List[TensorType], List[TensorType]]: + """Returns the corresponding states from the embedding from ConvEncoder + and actions. 
This is accomplished by rolling out the RNN from the + starting state through eacn index of embed and action, saving all + intermediate states between. + + Args: + embed (TensorType): ConvEncoder embedding + action (TensorType): Actions + state (List[TensorType]): Initial state before rollout + + Returns: + Posterior states and prior states (both List[TensorType]) + """ + if state is None: + state = self.get_initial_state(action.size()[0]) + + embed = embed.permute(1, 0, 2) + action = action.permute(1, 0, 2) + + priors = [[] for i in range(len(state))] + posts = [[] for i in range(len(state))] + last = (state, state) + for index in range(len(action)): + # Tuple of post and prior + last = self.obs_step(last[0], action[index], embed[index]) + [o.append(l) for l, o in zip(last[0], posts)] + [o.append(l) for l, o in zip(last[1], priors)] + + prior = [torch.stack(x, dim=0) for x in priors] + post = [torch.stack(x, dim=0) for x in posts] + + prior = [e.permute(1, 0, 2) for e in prior] + post = [e.permute(1, 0, 2) for e in post] + + return post, prior + + def imagine(self, action: TensorType, + state: List[TensorType] = None) -> List[TensorType]: + """Imagines the trajectory starting from state through a list of actions. + Similar to observe(), requires rolling out the RNN for each timestep. 
+ + Args: + action (TensorType): Actions + state (List[TensorType]): Starting state before rollout + + Returns: + Prior states + """ + if state is None: + state = self.get_initial_state(action.size()[0]) + + action = action.permute(1, 0, 2) + + indices = range(len(action)) + priors = [[] for _ in range(len(state))] + last = state + for index in indices: + last = self.img_step(last, action[index]) + [o.append(l) for l, o in zip(last, priors)] + + prior = [torch.stack(x, dim=0) for x in priors] + prior = [e.permute(1, 0, 2) for e in prior] + return prior + + def obs_step(self, prev_state: TensorType, prev_action: TensorType, + embed: TensorType + ) -> Tuple[List[TensorType], List[TensorType]]: + """Runs through the posterior model and returns the posterior state + + Args: + prev_state (TensorType): The previous state + prev_action (TensorType): The previous action + embed (TensorType): Embedding from ConvEncoder + + Returns: + Post and Prior state + """ + prior = self.img_step(prev_state, prev_action) + x = torch.cat([prior[3], embed], dim=-1) + x = self.obs1(x) + x = self.act()(x) + x = self.obs2(x) + mean, std = torch.chunk(x, 2, dim=-1) + std = self.softplus()(std) + 0.1 + stoch = self.get_dist(mean, std).rsample() + post = [mean, std, stoch, prior[3]] + return post, prior + + def img_step(self, prev_state: TensorType, + prev_action: TensorType) -> List[TensorType]: + """Runs through the prior model and returns the prior state + + Args: + prev_state (TensorType): The previous state + prev_action (TensorType): The previous action + + Returns: + Prior state + """ + x = torch.cat([prev_state[2], prev_action], dim=-1) + x = self.img1(x) + x = self.act()(x) + deter = self.cell(x, prev_state[3]) + x = deter + x = self.img2(x) + x = self.act()(x) + x = self.img3(x) + mean, std = torch.chunk(x, 2, dim=-1) + std = self.softplus()(std) + 0.1 + stoch = self.get_dist(mean, std).rsample() + return [mean, std, stoch, deter] + + def get_feature(self, state: List[TensorType]) -> 
TensorType: + # Constructs feature for input to reward, decoder, actor, critic + return torch.cat([state[2], state[3]], dim=-1) + + def get_dist(self, mean: TensorType, std: TensorType) -> TensorType: + return td.Normal(mean, std) + + +# Represents all models in Dreamer, unifies them all into a single interface +if torch: + + class DreamerModel(TorchModelV2, nn.Module): + def __init__(self, obs_space, action_space, num_outputs, model_config, + name): + super().__init__(obs_space, action_space, num_outputs, + model_config, name) + + nn.Module.__init__(self) + self.depth = model_config["depth_size"] + self.deter_size = model_config["deter_size"] + self.stoch_size = model_config["stoch_size"] + self.hidden_size = model_config["hidden_size"] + + self.action_size = action_space.shape[0] + + self.encoder = ConvEncoder(self.depth) + self.decoder = ConvDecoder( + self.stoch_size + self.deter_size, depth=self.depth) + self.reward = DenseDecoder(self.stoch_size + self.deter_size, 1, 2, + self.hidden_size) + self.dynamics = RSSM( + self.action_size, + 32 * self.depth, + stoch=self.stoch_size, + deter=self.deter_size) + self.actor = ActionDecoder(self.stoch_size + self.deter_size, + self.action_size, 4, self.hidden_size) + self.value = DenseDecoder(self.stoch_size + self.deter_size, 1, 3, + self.hidden_size) + self.state = None + + self.device = (torch.device("cuda") if torch.cuda.is_available() + else torch.device("cpu")) + + def policy(self, + obs: TensorType, + state: List[TensorType], + explore=True + ) -> Tuple[TensorType, List[float], List[TensorType]]: + """Returns the action. Runs through the encoder, recurrent model, + and policy to obtain action. 
+ """ + if state is None: + self.get_initial_state() + else: + self.state = state + post = self.state[:4] + action = self.state[4] + + embed = self.encoder(obs) + post, _ = self.dynamics.obs_step(post, action, embed) + feat = self.dynamics.get_feature(post) + + action_dist = self.actor(feat) + if explore: + action = action_dist.sample() + else: + action = action_dist.mean + logp = action_dist.log_prob(action) + + self.state = post + [action] + return action, logp, self.state + + def imagine_ahead(self, state: List[TensorType], + horizon: int) -> TensorType: + """Given a batch of states, rolls out more state of length horizon. + """ + start = [] + for s in state: + s = s.contiguous().detach() + shpe = [-1] + list(s.size())[2:] + start.append(s.view(*shpe)) + + def next_state(state): + feature = self.dynamics.get_feature(state).detach() + action = self.actor(feature).rsample() + next_state = self.dynamics.img_step(state, action) + return next_state + + last = start + outputs = [[] for i in range(len(start))] + for _ in range(horizon): + last = next_state(last) + [o.append(l) for l, o in zip(last, outputs)] + outputs = [torch.stack(x, dim=0) for x in outputs] + + imag_feat = self.dynamics.get_feature(outputs) + return imag_feat + + def get_initial_state(self) -> List[TensorType]: + self.state = self.dynamics.get_initial_state(1) + [ + torch.zeros(1, self.action_space.shape[0]).to(self.device) + ] + return self.state + + def value_function(self) -> TensorType: + return None diff --git a/rllib/agents/dreamer/dreamer_torch_policy.py b/rllib/agents/dreamer/dreamer_torch_policy.py new file mode 100644 index 000000000..f9abd10c8 --- /dev/null +++ b/rllib/agents/dreamer/dreamer_torch_policy.py @@ -0,0 +1,247 @@ +import logging + +import ray +from ray.rllib.policy.torch_policy_template import build_torch_policy +from ray.rllib.agents.a3c.a3c_torch_policy import apply_grad_clipping +from ray.rllib.utils.framework import try_import_torch +from ray.rllib.models.catalog import 
# This is the computation graph for workers (inner adaptation steps)
def compute_dreamer_loss(obs,
                         action,
                         reward,
                         model,
                         imagine_horizon,
                         discount=0.99,
                         lambda_=0.95,
                         kl_coeff=1.0,
                         free_nats=3.0,
                         log=False):
    """Constructs loss for the Dreamer objective

    Args:
        obs (TensorType): Observations (o_t)
        action (TensorType): Actions (a_(t-1))
        reward (TensorType): Rewards (r_(t-1))
        model (TorchModelV2): DreamerModel, encompassing all other models
        imagine_horizon (int): Imagine horizon for actor and critic loss
        discount (float): Discount
        lambda_ (float): Lambda, like in GAE
        kl_coeff (float): KL Coefficient for Divergence loss in model loss
        free_nats (float): Threshold for minimum divergence in model loss
        log (bool): If log, generate gifs

    Returns:
        dict: The entries "model_loss", "reward_loss", "image_loss",
            "divergence", "actor_loss", "critic_loss", "prior_ent" and
            "post_ent"; additionally "log_gif" when `log` is set.
    """
    critic_weights = list(model.value.parameters())
    # Everything that belongs to the world model (all but actor and critic).
    model_weights = (list(model.encoder.parameters()) +
                     list(model.decoder.parameters()) +
                     list(model.reward.parameters()) +
                     list(model.dynamics.parameters()))

    # PlaNET Model Loss
    latent = model.encoder(obs)
    post, prior = model.dynamics.observe(latent, action)
    features = model.dynamics.get_feature(post)
    image_pred = model.decoder(features)
    reward_pred = model.reward(features)
    image_loss = -torch.mean(image_pred.log_prob(obs))
    reward_loss = -torch.mean(reward_pred.log_prob(reward))
    prior_dist = model.dynamics.get_dist(prior[0], prior[1])
    post_dist = model.dynamics.get_dist(post[0], post[1])
    div = torch.mean(
        torch.distributions.kl_divergence(post_dist, prior_dist).sum(dim=2))
    # The (mean) divergence is never pushed below `free_nats`.
    div = torch.clamp(div, min=free_nats)
    model_loss = kl_coeff * div + reward_loss + image_loss

    # Actor Loss
    # [imagine_horizon, batch_length*batch_size, feature_size]
    with torch.no_grad():
        actor_states = [v.detach() for v in post]
    # Gradients of the actor loss must not update the world model ...
    with FreezeParameters(model_weights):
        imag_feat = model.imagine_ahead(actor_states, imagine_horizon)
    # ... nor the critic.
    with FreezeParameters(model_weights + critic_weights):
        imag_reward = model.reward(imag_feat).mean
        value = model.value(imag_feat).mean
    pcont = discount * torch.ones_like(imag_reward)
    returns = lambda_return(imag_reward[:-1], value[:-1], pcont[:-1],
                            value[-1], lambda_)
    # Cumulative discount per imagined step (first step is weighted 1.0).
    # `ones_like` inherits device and dtype from `pcont`, so this also works
    # when the model lives on CPU although CUDA is available, or on a
    # non-default GPU.
    discount_weights = torch.cumprod(
        torch.cat([torch.ones_like(pcont[:1]), pcont[:-2]], dim=0), dim=0)
    actor_loss = -torch.mean(discount_weights * returns)

    # Critic Loss
    with torch.no_grad():
        val_feat = imag_feat.detach()[:-1]
        target = returns.detach()
        val_discount = discount_weights.detach()
    val_pred = model.value(val_feat)
    critic_loss = -torch.mean(val_discount * val_pred.log_prob(target))

    # Logging purposes
    prior_ent = torch.mean(prior_dist.entropy())
    post_ent = torch.mean(post_dist.entropy())

    return_dict = {
        "model_loss": model_loss,
        "reward_loss": reward_loss,
        "image_loss": image_loss,
        "divergence": div,
        "actor_loss": actor_loss,
        "critic_loss": critic_loss,
        "prior_ent": prior_ent,
        "post_ent": post_ent,
    }

    if log:
        log_gif = log_summary(obs, action, latent, image_pred, model)
        if log_gif is not None:
            return_dict["log_gif"] = log_gif
    return return_dict


# Similar to GAE-Lambda, calculate value targets
def lambda_return(reward, value, pcont, bootstrap, lambda_):
    """Computes lambda-returns by walking backwards along dim 0.

    With ``next_value`` being `value` shifted by one step (and `bootstrap`
    appended), the recursion evaluated from the last step to the first is

        R_t = reward_t + pcont_t * ((1 - lambda_) * next_value_t
                                    + lambda_ * R_(t+1))

    where the return following the last step is `bootstrap`.

    Args:
        reward (TensorType): Rewards, indexed by step along dim 0.
        value (TensorType): Value estimates, same leading length as reward.
        pcont (TensorType): Per-step discount factors.
        bootstrap (TensorType): Value used after the last step.
        lambda_ (float): Mixing coefficient between the one-step target
            and the recursive return.

    Returns:
        TensorType: The per-step returns stacked along dim 0.
    """
    next_values = torch.cat([value[1:], bootstrap[None]], dim=0)
    inputs = reward + pcont * next_values * (1 - lambda_)

    last = bootstrap
    returns = []
    for i in reversed(range(len(inputs))):
        last = inputs[i] + pcont[i] * lambda_ * last
        returns.append(last)

    # Collected last-to-first; flip back into step order.
    returns.reverse()
    return torch.stack(returns, dim=0)


# Creates gif
def log_summary(obs, action, embed, image_pred, model):
    """Builds the tensor that is logged as a gif.

    Only the first 6 entries along dim 0 are used. The dynamics model
    observes the first 5 entries along dim 1, then imagines forward using
    the remaining actions; the decoded imagination is appended to the first
    5 reconstructions.
    NOTE(review): dim 0 is presumably the batch and dim 1 the time axis -
    confirm against the caller.

    Args:
        obs (TensorType): Observations (shifted by +0.5 for display).
        action (TensorType): Actions matching `obs`.
        embed (TensorType): Encoder output for `obs`.
        image_pred: Decoder output distribution for the posterior features.
        model (TorchModelV2): DreamerModel.

    Returns:
        TensorType: Ground truth, model output and their rescaled
            difference, concatenated along dim 3.
    """
    truth = obs[:6] + 0.5
    recon = image_pred.mean[:6]
    init, _ = model.dynamics.observe(embed[:6, :5], action[:6, :5])
    # Continue from the state after the last observed step.
    init = [itm[:, -1] for itm in init]
    prior = model.dynamics.imagine(action[:6, 5:], init)
    openl = model.decoder(model.dynamics.get_feature(prior)).mean

    mod = torch.cat([recon[:, :5] + 0.5, openl + 0.5], 1)
    error = (mod - truth + 1.0) / 2.0
    return torch.cat([truth, mod, error], 3)


def dreamer_loss(policy, model, dist_class, train_batch):
    """Loss function handed to `build_torch_policy`.

    Computes all Dreamer losses on `train_batch`, keeps the full result in
    `policy.stats_dict` (read back by `dreamer_stats`) and returns the three
    losses in the order (model, actor, critic).
    """
    # The presence of the key alone requests gif logging.
    log_gif = "log_gif" in train_batch

    policy.stats_dict = compute_dreamer_loss(
        train_batch["obs"],
        train_batch["actions"],
        train_batch["rewards"],
        policy.model,
        policy.config["imagine_horizon"],
        policy.config["discount"],
        policy.config["lambda"],
        policy.config["kl_coeff"],
        policy.config["free_nats"],
        log_gif,
    )

    loss_dict = policy.stats_dict

    return (loss_dict["model_loss"], loss_dict["actor_loss"],
            loss_dict["critic_loss"])


def build_dreamer_model(policy, obs_space, action_space, config):
    """Creates the DreamerModel through the ModelCatalog.

    The model is built from `config["dreamer_model"]` with `num_outputs=1`
    and stored on the policy as `policy.model`; its variables are stored as
    `policy.model_variables`.

    Returns:
        The created model.
    """
    policy.model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        1,
        config["dreamer_model"],
        name="DreamerModel",
        framework="torch")

    policy.model_variables = policy.model.variables()

    return policy.model


def action_sampler_fn(policy, model, input_dict, state, explore, timestep):
    """Action sampler function has two phases. During the prefill phase,
    actions are sampled uniformly [-1, 1]. During training phase, actions
    are evaluated through DreamerPolicy and an additive gaussian is added
    to incentivize exploration.

    Returns:
        Tuple of (action, logp, state).
    """
    obs = input_dict["obs"]

    # Custom Exploration
    if timestep <= policy.config["prefill_timesteps"]:
        logp = [0.0]
        # Random action in space [-1.0, 1.0]
        action = 2.0 * torch.rand(1, model.action_space.shape[0]) - 1.0
        state = model.get_initial_state()
    else:
        # Weird RLLib Handling, this happens when env resets
        if len(state[0].size()) == 3:
            # Very hacky, but works on all envs
            state = model.get_initial_state()
        action, logp, state = model.policy(obs, state, explore)
        action = td.Normal(action, policy.config["explore_noise"]).sample()
        action = torch.clamp(action, min=-1.0, max=1.0)

    # Each sampled action is repeated `action_repeat` times in the env.
    policy.global_timestep += policy.config["action_repeat"]

    return action, logp, state


def dreamer_stats(policy, train_batch):
    """Returns the stats dict stored by the last `dreamer_loss` call."""
    return policy.stats_dict


def dreamer_optimizer_fn(policy, config):
    """Creates one Adam optimizer each for world model, actor and critic.

    The world-model optimizer covers encoder, decoder, reward and dynamics
    parameters and uses `config["td_model_lr"]`; actor and critic use
    `config["actor_lr"]` and `config["critic_lr"]`.

    Returns:
        Tuple of (model_opt, actor_opt, critic_opt), the same order in which
        `dreamer_loss` returns its losses.
    """
    model = policy.model
    encoder_weights = list(model.encoder.parameters())
    decoder_weights = list(model.decoder.parameters())
    reward_weights = list(model.reward.parameters())
    dynamics_weights = list(model.dynamics.parameters())
    actor_weights = list(model.actor.parameters())
    critic_weights = list(model.value.parameters())
    model_opt = torch.optim.Adam(
        encoder_weights + decoder_weights + reward_weights + dynamics_weights,
        lr=config["td_model_lr"])
    actor_opt = torch.optim.Adam(actor_weights, lr=config["actor_lr"])
    critic_opt = torch.optim.Adam(critic_weights, lr=config["critic_lr"])

    return (model_opt, actor_opt, critic_opt)


DreamerTorchPolicy = build_torch_policy(
    name="DreamerTorchPolicy",
    get_default_config=lambda: ray.rllib.agents.dreamer.dreamer.DEFAULT_CONFIG,
    action_sampler_fn=action_sampler_fn,
    loss_fn=dreamer_loss,
    stats_fn=dreamer_stats,
    make_model=build_dreamer_model,
    optimizer_fn=dreamer_optimizer_fn,
    extra_grad_process_fn=apply_grad_clipping)
# Custom initialization for different types of layers
if torch:

    def _xavier_weight_zero_bias(module):
        """Xavier-uniform init of `module.weight`; zeroes the bias if any."""
        nn.init.xavier_uniform_(module.weight)
        if module.bias is not None:
            nn.init.zeros_(module.bias)

    class Linear(nn.Linear):
        """nn.Linear with Xavier-uniform weights and zero-initialized bias."""

        def reset_parameters(self):
            _xavier_weight_zero_bias(self)

    class Conv2d(nn.Conv2d):
        """nn.Conv2d with Xavier-uniform weights and zero-initialized bias."""

        def reset_parameters(self):
            _xavier_weight_zero_bias(self)

    class ConvTranspose2d(nn.ConvTranspose2d):
        """nn.ConvTranspose2d with Xavier-uniform weights and zero bias."""

        def reset_parameters(self):
            _xavier_weight_zero_bias(self)

    class GRUCell(nn.GRUCell):
        """nn.GRUCell with Xavier-uniform input-to-hidden weights, orthogonal
        hidden-to-hidden weights and zero-initialized biases."""

        def reset_parameters(self):
            nn.init.xavier_uniform_(self.weight_ih)
            nn.init.orthogonal_(self.weight_hh)
            # Both biases are None when the cell is built with bias=False.
            for bias in (self.bias_ih, self.bias_hh):
                if bias is not None:
                    nn.init.zeros_(bias)


# Custom Tanh Bijector due to big gradients through Dreamer Actor
if torch:

    class TanhBijector(torch.distributions.Transform):
        """Tanh transform whose inverse clamps its input before `atanh`."""

        # Attributes read by torch.distributions (e.g. by
        # TransformedDistribution) from every Transform.
        bijective = True
        domain = torch.distributions.constraints.real
        codomain = torch.distributions.constraints.interval(-1.0, 1.0)
        # tanh is monotonically increasing. torch reads `sign` as a plain
        # number, not as a method.
        sign = +1

        def __init__(self):
            super().__init__()

        def atanh(self, x):
            """Inverse hyperbolic tangent, 0.5 * log((1 + x) / (1 - x))."""
            return 0.5 * torch.log((1 + x) / (1 - x))

        def _call(self, x):
            return torch.tanh(x)

        def _inverse(self, y):
            # Pull values inside [-1, 1] slightly away from the borders so
            # that atanh stays finite; values outside are left untouched.
            y = torch.where((torch.abs(y) <= 1.),
                            torch.clamp(y, -0.99999997, 0.99999997), y)
            y = self.atanh(y)
            return y

        def log_abs_det_jacobian(self, x, y):
            # Equals log(1 - tanh(x)^2), written via softplus.
            return 2. * (np.log(2) - x - nn.functional.softplus(-2. * x))


# Modified from https://github.com/juliusfrost/dreamer-pytorch
class FreezeParameters:
    """Context manager that disables `requires_grad` on the given parameters
    and restores the previous flags on exit.

    Args:
        parameters: Iterable of objects with a `requires_grad` attribute.
            It is copied into a list, so one-shot iterables such as
            `module.parameters()` are supported.
    """

    def __init__(self, parameters):
        # Materialize once: the collection is walked on enter AND on exit.
        self.parameters = list(parameters)
        self.param_states = [p.requires_grad for p in self.parameters]

    def __enter__(self):
        # Snapshot on entry so that flag changes made between construction
        # and entry are restored correctly.
        self.param_states = [p.requires_grad for p in self.parameters]
        for param in self.parameters:
            param.requires_grad = False
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        for param, state in zip(self.parameters, self.param_states):
            param.requires_grad = state
"""
9 Environments from Deepmind Control Suite
"""


def _make_dmc_env(domain, task, from_pixels, height, width, frame_skip,
                  channels_first):
    """Builds the DMCEnv for `domain`/`task` with the given render settings."""
    return DMCEnv(
        domain,
        task,
        from_pixels=from_pixels,
        height=height,
        width=width,
        frame_skip=frame_skip,
        channels_first=channels_first)


def acrobot_swingup(from_pixels=True,
                    height=64,
                    width=64,
                    frame_skip=2,
                    channels_first=True):
    """Returns a DMCEnv for domain "acrobot", task "swingup"."""
    return _make_dmc_env("acrobot", "swingup", from_pixels, height, width,
                         frame_skip, channels_first)


def walker_walk(from_pixels=True,
                height=64,
                width=64,
                frame_skip=2,
                channels_first=True):
    """Returns a DMCEnv for domain "walker", task "walk"."""
    return _make_dmc_env("walker", "walk", from_pixels, height, width,
                         frame_skip, channels_first)


def hopper_hop(from_pixels=True,
               height=64,
               width=64,
               frame_skip=2,
               channels_first=True):
    """Returns a DMCEnv for domain "hopper", task "hop"."""
    return _make_dmc_env("hopper", "hop", from_pixels, height, width,
                         frame_skip, channels_first)


def hopper_stand(from_pixels=True,
                 height=64,
                 width=64,
                 frame_skip=2,
                 channels_first=True):
    """Returns a DMCEnv for domain "hopper", task "stand"."""
    return _make_dmc_env("hopper", "stand", from_pixels, height, width,
                         frame_skip, channels_first)


def cheetah_run(from_pixels=True,
                height=64,
                width=64,
                frame_skip=2,
                channels_first=True):
    """Returns a DMCEnv for domain "cheetah", task "run"."""
    return _make_dmc_env("cheetah", "run", from_pixels, height, width,
                         frame_skip, channels_first)


def walker_run(from_pixels=True,
               height=64,
               width=64,
               frame_skip=2,
               channels_first=True):
    """Returns a DMCEnv for domain "walker", task "run"."""
    return _make_dmc_env("walker", "run", from_pixels, height, width,
                         frame_skip, channels_first)


def pendulum_swingup(from_pixels=True,
                     height=64,
                     width=64,
                     frame_skip=2,
                     channels_first=True):
    """Returns a DMCEnv for domain "pendulum", task "swingup"."""
    return _make_dmc_env("pendulum", "swingup", from_pixels, height, width,
                         frame_skip, channels_first)


def cartpole_swingup(from_pixels=True,
                     height=64,
                     width=64,
                     frame_skip=2,
                     channels_first=True):
    """Returns a DMCEnv for domain "cartpole", task "swingup"."""
    return _make_dmc_env("cartpole", "swingup", from_pixels, height, width,
                         frame_skip, channels_first)


def humanoid_walk(from_pixels=True,
                  height=64,
                  width=64,
                  frame_skip=2,
                  channels_first=True):
    """Returns a DMCEnv for domain "humanoid", task "walk"."""
    return _make_dmc_env("humanoid", "walk", from_pixels, height, width,
                         frame_skip, channels_first)
+ # Step the optimizer + for i, opt in enumerate(self._optimizers): opt.step() grad_info["allreduce_latency"] /= len(self._optimizers) diff --git a/rllib/tuned_examples/dreamer/dreamer-deepmind-control.yaml b/rllib/tuned_examples/dreamer/dreamer-deepmind-control.yaml new file mode 100644 index 000000000..71d28e7fe --- /dev/null +++ b/rllib/tuned_examples/dreamer/dreamer-deepmind-control.yaml @@ -0,0 +1,26 @@ +dmc-dreamer: + run: DREAMER + env: + grid_search: + - ray.rllib.examples.env.dm_control_suite.walker_walk + - ray.rllib.examples.env.dm_control_suite.cheetah_run + - ray.rllib.examples.env.dm_control_suite.hopper_hop + stop: + timesteps_total: 1000000 + config: + framework: torch + td_model_lr: 0.0006 + actor_lr: 0.00008 + critic_lr: 0.00008 + discount: 0.99 + lambda: 0.95 + dreamer_train_iters: 100 + horizon: 1000 + batch_size: 50 + batch_length: 50 + imagine_horizon: 15 + free_nats: 3.0 + batch_mode: complete_episodes + num_gpus: 1 + num_workers: 0 + clip_actions: False