mirror of
https://github.com/wassname/ray.git
synced 2026-07-02 15:05:53 +08:00
[RLlib] Curiosity (intrinsic motivation) Exploration module. (#9912)
This commit is contained in:
@@ -342,6 +342,10 @@ class TorchPolicy(Policy):
|
||||
# Call Model's custom-loss with Policy loss outputs and train_batch.
|
||||
if self.model:
|
||||
loss_out = self.model.custom_loss(loss_out, train_batch)
|
||||
# Modifies the loss as specified by the Exploration strategy.
|
||||
if hasattr(self, "exploration"):
|
||||
loss_out = self.exploration.get_exploration_loss(
|
||||
loss_out, train_batch)
|
||||
assert len(loss_out) == len(self._optimizers)
|
||||
# assert not any(torch.isnan(l) for l in loss_out)
|
||||
fetches = self.extra_compute_grad_fetches()
|
||||
|
||||
@@ -9,7 +9,7 @@ from ray.rllib.policy.policy import Policy, LEARNER_STATS_KEY
|
||||
from ray.rllib.policy.sample_batch import SampleBatch
|
||||
from ray.rllib.policy.torch_policy import TorchPolicy
|
||||
from ray.rllib.policy.view_requirement import ViewRequirement
|
||||
from ray.rllib.utils import add_mixins
|
||||
from ray.rllib.utils import add_mixins, force_list
|
||||
from ray.rllib.utils.annotations import override, DeveloperAPI
|
||||
from ray.rllib.utils.framework import try_import_torch
|
||||
from ray.rllib.utils.torch_ops import convert_to_non_torch_type
|
||||
@@ -294,9 +294,15 @@ def build_torch_policy(
|
||||
@override(TorchPolicy)
|
||||
def optimizer(self):
|
||||
if optimizer_fn:
|
||||
return optimizer_fn(self, self.config)
|
||||
optimizers = optimizer_fn(self, self.config)
|
||||
else:
|
||||
return TorchPolicy.optimizer(self)
|
||||
optimizers = TorchPolicy.optimizer(self)
|
||||
optimizers = force_list(optimizers)
|
||||
if hasattr(self, "exploration"):
|
||||
exploration_optimizers = force_list(
|
||||
self.exploration.get_exploration_optimizer(self.config))
|
||||
optimizers.extend(exploration_optimizers)
|
||||
return optimizers
|
||||
|
||||
@override(TorchPolicy)
|
||||
def extra_grad_info(self, train_batch):
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
from ray.rllib.utils.exploration.curiosity import Curiosity
|
||||
from ray.rllib.utils.exploration.exploration import Exploration
|
||||
from ray.rllib.utils.exploration.epsilon_greedy import EpsilonGreedy
|
||||
from ray.rllib.utils.exploration.gaussian_noise import GaussianNoise
|
||||
@@ -16,6 +17,7 @@ from ray.rllib.utils.exploration.stochastic_sampling import \
|
||||
StochasticSampling
|
||||
|
||||
__all__ = [
|
||||
"Curiosity",
|
||||
"Exploration",
|
||||
"EpsilonGreedy",
|
||||
"GaussianNoise",
|
||||
|
||||
@@ -0,0 +1,268 @@
|
||||
"""
|
||||
Curiosity-driven Exploration by Self-supervised Prediction - Pathak, Agrawal,
|
||||
Efros, and Darrell - UC Berkeley - ICML 2017.
|
||||
|
||||
This implements the curiosty-based loss function from
|
||||
https://arxiv.org/pdf/1705.05363.pdf. We learn a simplified model of the
|
||||
environment based on three networks:
|
||||
1) embedding states into latent space (the "features" network)
|
||||
2) predicting the next embedded state, given a state and action (the
|
||||
"forwards" network)
|
||||
3) predicting the action, given two consecutive embedded state (the
|
||||
"inverse" network)
|
||||
|
||||
If the agent was unable to successfully predict the state-action-next_state
|
||||
sequence, we modify the standard reward with a penalty. Therefore, if a state
|
||||
transition was unexpected, the agent becomes "curious" and further explores
|
||||
this transition.
|
||||
|
||||
This is tailored for sparse reward environments, as it generates an intrinsic
|
||||
reward.
|
||||
"""
|
||||
from gym.spaces import Space
|
||||
from typing import Union, Optional
|
||||
|
||||
from ray.rllib.models.action_dist import ActionDistribution
|
||||
from ray.rllib.models.torch.misc import SlimFC
|
||||
from ray.rllib.utils.exploration.exploration import Exploration
|
||||
from ray.rllib.utils.framework import try_import_torch, TensorType
|
||||
from ray.rllib.utils.from_config import from_config
|
||||
from ray.rllib.utils.types import SampleBatchType, TrainerConfigDict
|
||||
|
||||
torch, nn = try_import_torch()
|
||||
|
||||
# TODO: (tanay) how to test if action space is discrete
|
||||
"""
|
||||
Example Configuration
|
||||
|
||||
config = ppo.DEFAULT_CONFIG
|
||||
env = "CartPole-v0"
|
||||
config["framework"] = "torch"
|
||||
config["exploration_config"] = {
|
||||
"type": "ray.rllib.utils.exploration.curiosity_exploration.Curiosity",
|
||||
"forward_net_hiddens": [64],
|
||||
"inverse_net_hiddens": [32,4],
|
||||
"feature_net_hiddens": [16,8],
|
||||
"feature_dim": 8,
|
||||
"forward_activation": "relu",
|
||||
"inverse_activation": "relu",
|
||||
"feature_activation": "relu",
|
||||
"submodule": "EpsilonGreedy",
|
||||
}
|
||||
trainer = ppo.PPOTrainer(config=config, env=env)
|
||||
trainer.train()
|
||||
"""
|
||||
|
||||
|
||||
class Curiosity(Exploration):
|
||||
def __init__(self, action_space: Space, *, framework: str, **kwargs):
|
||||
"""
|
||||
Args:
|
||||
action_space (Space): The action space in which to explore.
|
||||
framework (str): One of "tf" or "torch". Currently only torch is
|
||||
supported.
|
||||
"""
|
||||
if framework != "torch":
|
||||
raise NotImplementedError("only torch is currently supported for "
|
||||
"curiosity")
|
||||
|
||||
# Parse the curiosity-specific arguments
|
||||
# If it was not specified in the config, assign the given default
|
||||
def extract_from_kwargs(key, default):
|
||||
if key in kwargs:
|
||||
temp = kwargs[key]
|
||||
del kwargs[key]
|
||||
return temp
|
||||
else:
|
||||
return default
|
||||
|
||||
# Casts a single int to a list, else leaves it unchanged
|
||||
def cast_to_list(l):
|
||||
if type(l) == int:
|
||||
return [l]
|
||||
else:
|
||||
return l
|
||||
|
||||
submodule_type = extract_from_kwargs("submodule", "StochasticSampling")
|
||||
self.feature_dim = extract_from_kwargs("feature_dim", 32)
|
||||
|
||||
forward_activation = extract_from_kwargs("forward_activation", nn.ReLU)
|
||||
inverse_activation = extract_from_kwargs("inverse_activation", nn.ReLU)
|
||||
feature_activation = extract_from_kwargs("feature_activation", nn.ReLU)
|
||||
|
||||
feature_net_hiddens = cast_to_list(
|
||||
extract_from_kwargs("feature_net_hiddens", [64]))
|
||||
inverse_net_hiddens = cast_to_list(
|
||||
extract_from_kwargs("inverse_net_hiddens", [64]))
|
||||
forward_net_hiddens = cast_to_list(
|
||||
extract_from_kwargs("forward_net_hiddens", [64]))
|
||||
|
||||
super().__init__(
|
||||
action_space=action_space, framework=framework, **kwargs)
|
||||
|
||||
# TODO: what should this look like for multidimensional obs spaces
|
||||
self.obs_space_dim = kwargs["model"].obs_space.shape[0]
|
||||
# TODO can we always assume 1
|
||||
self.action_space_dim = 1
|
||||
|
||||
# Given a list of layer dimensions, create a FC ReLU net.
|
||||
# If layer_dims is [4,8,6] we'll have a two layer net: 4->8 and 8->6
|
||||
def create_fc_net(layer_dims, activation):
|
||||
layers = []
|
||||
for i in range(len(layer_dims) - 1):
|
||||
layers.append(
|
||||
SlimFC(
|
||||
in_size=layer_dims[i],
|
||||
out_size=layer_dims[i + 1],
|
||||
use_bias=False,
|
||||
activation_fn=activation))
|
||||
return nn.Sequential(*layers)
|
||||
|
||||
# List of dimension of each layer. Appends the hidden dims.
|
||||
feature_dims = [self.obs_space_dim
|
||||
] + feature_net_hiddens + [self.feature_dim]
|
||||
inverse_dims = [2 * self.feature_dim
|
||||
] + inverse_net_hiddens + [self.action_space_dim]
|
||||
forward_dims = [self.feature_dim + self.action_space_dim] + \
|
||||
forward_net_hiddens + [self.feature_dim]
|
||||
|
||||
# Creates actual models
|
||||
self.feature_model = create_fc_net(feature_dims, feature_activation)
|
||||
self.inverse_model = create_fc_net(inverse_dims, inverse_activation)
|
||||
self.forward_model = create_fc_net(forward_dims, forward_activation)
|
||||
|
||||
# Convenient reductions
|
||||
self.criterion = torch.nn.MSELoss(reduction="none")
|
||||
self.criterion_reduced = torch.nn.MSELoss(reduction="sum")
|
||||
|
||||
# This is only used to select the correct action
|
||||
self.exploration_submodule = from_config(
|
||||
cls=Exploration,
|
||||
config={
|
||||
"type": submodule_type,
|
||||
"action_space": action_space,
|
||||
"framework": framework,
|
||||
"policy_config": self.policy_config,
|
||||
"model": self.model,
|
||||
"num_workers": self.num_workers,
|
||||
"worker_index": self.worker_index
|
||||
})
|
||||
|
||||
def get_exploration_action(self,
|
||||
*,
|
||||
action_distribution: ActionDistribution,
|
||||
timestep: Union[int, TensorType],
|
||||
explore: bool = True):
|
||||
"""
|
||||
Returns the action to take next
|
||||
|
||||
Args:
|
||||
action_distribution (ActionDistribution): The probabilistic
|
||||
distribution we sample actions from
|
||||
timestep (Union[int, TensorType]):
|
||||
explore (bool): If true, uses the submodule strategy to select the
|
||||
next action
|
||||
"""
|
||||
return self.exploration_submodule.get_exploration_action(
|
||||
action_distribution=action_distribution, timestep=timestep)
|
||||
|
||||
def get_exploration_loss(self, policy_loss, sample_batch: SampleBatchType):
|
||||
"""
|
||||
Returns the intrinsic reward associated to the explorations strategy
|
||||
policy_loss (TensorType): The loss from the policy, not associated
|
||||
to the exploration strategy, which we will modify
|
||||
sample_batch (SampleBatchType): The SampleBatch of observations, to
|
||||
which we will associate an intrinsic loss.
|
||||
"""
|
||||
|
||||
# Cast to torch tensors, to be fed into the model
|
||||
obs_list = sample_batch["obs"].float()
|
||||
next_obs_list = sample_batch["new_obs"].float()
|
||||
emb_next_obs_list = self._get_latent_vector(next_obs_list).float()
|
||||
actions_list = sample_batch["actions"].float()
|
||||
|
||||
actions_pred = self._predict_action(obs_list, next_obs_list)
|
||||
embedding_pred = self._predict_next_obs(obs_list, actions_list)
|
||||
|
||||
# L2 losses for predicted action and next state
|
||||
embedding_loss = self.criterion_reduced(emb_next_obs_list,
|
||||
embedding_pred)
|
||||
actions_loss = self.criterion_reduced(
|
||||
actions_pred.squeeze(1), actions_list)
|
||||
return policy_loss + [embedding_loss + actions_loss]
|
||||
|
||||
def _get_latent_vector(self, obs: TensorType) -> TensorType:
|
||||
"""
|
||||
Returns the embedded vector phi(state)
|
||||
obs (TensorType): a batch of states
|
||||
"""
|
||||
return self.feature_model(obs)
|
||||
|
||||
def get_exploration_optimizers(self, config: TrainerConfigDict):
|
||||
"""Returns optimizer (or list) for environmental dynamics networks.
|
||||
"""
|
||||
forward_params = list(self.forward_model.parameters())
|
||||
inverse_params = list(self.inverse_model.parameters())
|
||||
feature_params = list(self.feature_model.parameters())
|
||||
|
||||
return torch.optim.Adam(
|
||||
forward_params + inverse_params + feature_params, lr=1e-3)
|
||||
|
||||
def postprocess_trajectory(self,
|
||||
policy,
|
||||
sample_batch: SampleBatchType,
|
||||
tf_sess: Optional["tf.Session"] = None):
|
||||
"""Calculates intrinsic rewards and adds them to "rewards" in batch.
|
||||
|
||||
Calculations are based on difference between predicted and actually
|
||||
observed next observations.
|
||||
"""
|
||||
|
||||
# Extract the relevant data from the SampleBatch, and cast to Tensors
|
||||
obs_list = torch.from_numpy(sample_batch["obs"]).float()
|
||||
next_obs_list = torch.from_numpy(sample_batch["new_obs"]).float()
|
||||
emb_next_obs_list = self._get_latent_vector(next_obs_list).float()
|
||||
actions_list = torch.from_numpy(sample_batch["actions"]).float()
|
||||
|
||||
# Equation (2) in paper.
|
||||
actions_pred = self._predict_action(obs_list, next_obs_list)
|
||||
embedding_pred = self._predict_next_obs(obs_list, actions_list)
|
||||
|
||||
# A vector of L2 losses corresponding to each observation,
|
||||
# Equation (7) in paper.
|
||||
embedding_loss = torch.sum(
|
||||
self.criterion(emb_next_obs_list, embedding_pred), dim=-1)
|
||||
|
||||
# Equation (3) in paper. TODO discrete action space
|
||||
actions_loss = self.criterion(actions_pred.squeeze(1), actions_list)
|
||||
|
||||
# Modifies environment rewards by subtracting intrinsic rewards
|
||||
sample_batch["rewards"] = sample_batch["rewards"] - \
|
||||
embedding_loss.clone().detach().numpy() - \
|
||||
actions_loss.clone().detach().numpy()
|
||||
|
||||
def _predict_action(self, obs: TensorType, next_obs: TensorType):
|
||||
"""
|
||||
Returns the predicted action, given two states. This is the inverse
|
||||
dynamics model.
|
||||
|
||||
obs (TensorType): Observed state at time t.
|
||||
next_obs (TensorType): Observed state at time t+1
|
||||
"""
|
||||
return self.inverse_model(
|
||||
torch.cat(
|
||||
(self._get_latent_vector(obs),
|
||||
self._get_latent_vector(next_obs)),
|
||||
axis=-1))
|
||||
|
||||
# raw obs (not embedded)
|
||||
def _predict_next_obs(self, obs: TensorType, action: TensorType):
|
||||
"""
|
||||
Returns the predicted next state, given an action and state.
|
||||
|
||||
obs (TensorType): Observed state at time t.
|
||||
action (TensorType): Action taken at time t
|
||||
"""
|
||||
return self.forward_model(
|
||||
torch.cat(
|
||||
(self._get_latent_vector(obs), action.unsqueeze(1)), dim=-1))
|
||||
@@ -1,10 +1,10 @@
|
||||
from gym.spaces import Space
|
||||
from typing import Union
|
||||
|
||||
from ray.rllib.utils.framework import try_import_torch, TensorType
|
||||
from ray.rllib.models.action_dist import ActionDistribution
|
||||
from ray.rllib.models.modelv2 import ModelV2
|
||||
from ray.rllib.utils.annotations import DeveloperAPI
|
||||
from ray.rllib.utils.framework import try_import_torch, TensorType
|
||||
|
||||
torch, nn = try_import_torch()
|
||||
|
||||
@@ -90,6 +90,28 @@ class Exploration:
|
||||
"""
|
||||
pass
|
||||
|
||||
@DeveloperAPI
|
||||
def get_exploration_loss(self, policy_loss, sample_batch):
|
||||
"""Modifies the policy loss with a loss associated to the exploration
|
||||
strategy.
|
||||
|
||||
Args:
|
||||
policy_loss (TODO): Loss from the Policy
|
||||
sample_batch (SampleBatch): The SampleBatch object to post-process.
|
||||
"""
|
||||
return policy_loss
|
||||
|
||||
@DeveloperAPI
|
||||
def get_exploration_optimizer(self, config=None):
|
||||
"""
|
||||
Returns: an optimizer for the loss from get_exploration_loss (in case
|
||||
the exploration strategy has trainable components)
|
||||
|
||||
Args:
|
||||
config: configuration for an optimizer
|
||||
"""
|
||||
return []
|
||||
|
||||
@DeveloperAPI
|
||||
def on_episode_start(self,
|
||||
policy,
|
||||
|
||||
@@ -0,0 +1,69 @@
|
||||
import numpy as np
|
||||
import ray
|
||||
import sys
|
||||
import unittest
|
||||
|
||||
from ray.rllib.utils import check
|
||||
import ray.rllib.agents.ppo as ppo
|
||||
|
||||
|
||||
class TestCuriosity(unittest.TestCase):
|
||||
|
||||
# Sets up a single ray environment for every test.
|
||||
|
||||
@classmethod
|
||||
def setUpClass(cls):
|
||||
ray.init(local_mode=True)
|
||||
|
||||
@classmethod
|
||||
def tearDownClass(cls):
|
||||
ray.shutdown()
|
||||
|
||||
def test_no_curiosity(self):
|
||||
config = ppo.DEFAULT_CONFIG
|
||||
env = "CartPole-v0"
|
||||
dummy_obs = np.array([0.0, 0.1, 0.0, 0.0])
|
||||
prev_a = np.array(0)
|
||||
config["framework"] = "torch"
|
||||
config["exploration_config"] = {"type": "ParameterNoise"}
|
||||
|
||||
trainer = ppo.PPOTrainer(config=config, env=env)
|
||||
trainer.train()
|
||||
|
||||
# Make sure all actions drawn are the same, given same
|
||||
# observations. Tests the explorations API.
|
||||
|
||||
actions = []
|
||||
for _ in range(5):
|
||||
actions.append(
|
||||
trainer.compute_action(
|
||||
observation=dummy_obs,
|
||||
explore=False,
|
||||
prev_action=prev_a,
|
||||
prev_reward=1.0 if prev_a is not None else None))
|
||||
check(actions[-1], actions[0])
|
||||
print(actions)
|
||||
|
||||
def test_curiosity(self):
|
||||
config = ppo.DEFAULT_CONFIG
|
||||
|
||||
env = "CartPole-v0"
|
||||
config["framework"] = "torch"
|
||||
config["exploration_config"] = {
|
||||
"type": "ray.rllib.utils.exploration.curiosity.Curiosity",
|
||||
"forward_net_hiddens": [64],
|
||||
"inverse_net_hiddens": [32, 4],
|
||||
"feature_net_hiddens": [16, 8],
|
||||
"feature_dim": 8,
|
||||
"forward_activation": "relu",
|
||||
"inverse_activation": "relu",
|
||||
"feature_activation": "relu",
|
||||
"submodule": "EpsilonGreedy",
|
||||
}
|
||||
trainer = ppo.PPOTrainer(config=config, env=env)
|
||||
trainer.train()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
Reference in New Issue
Block a user