mirror of
https://github.com/wassname/ray.git
synced 2026-07-04 00:33:25 +08:00
[rllib] dqn/ddpg policy customization (#2445)
* dqn policy update - more customization * docs for custom DQN graph * Update rllib-training.rst * Update rllib-models.rst * Update rllib.rst * Update rllib-training.rst * Update rllib-concepts.rst * yapf codestyle
This commit is contained in:
committed by
Eric Liang
parent
68660453e4
commit
05490b8cb9
@@ -15,7 +15,7 @@ Most interaction with deep learning frameworks is isolated to the `PolicyGraph i
|
||||
Policy Evaluation
|
||||
-----------------
|
||||
|
||||
Given an environment and policy graph, policy evaluation produces `batches <https://github.com/ray-project/ray/blob/master/python/ray/rllib/evaluation/sample_batch.py>`__ of experiences. This is your classic "environment interaction loop". Efficient policy evaluation can be burdensome to get right, especially when leveraging vectorization, RNNs, or when operating in a multi-agent environment. RLlib provides a `PolicyEvaluator <https://github.com/ray-project/ray/blob/master/python/ray/rllib/evaluation/policy_evaluator.py>`__ class that manages all of this, and this class is used in most RLlib algorithm.
|
||||
Given an environment and policy graph, policy evaluation produces `batches <https://github.com/ray-project/ray/blob/master/python/ray/rllib/evaluation/sample_batch.py>`__ of experiences. This is your classic "environment interaction loop". Efficient policy evaluation can be burdensome to get right, especially when leveraging vectorization, RNNs, or when operating in a multi-agent environment. RLlib provides a `PolicyEvaluator <https://github.com/ray-project/ray/blob/master/python/ray/rllib/evaluation/policy_evaluator.py>`__ class that manages all of this, and this class is used in most RLlib algorithms.
|
||||
|
||||
You can also use policy evaluation standalone to produce batches of experiences. This can be done by calling ``ev.sample()`` on an evaluator instance, or ``ev.sample.remote()`` in parallel on evaluator instances created as Ray actors (see ``PolicyEvaluator.as_remote()``).
|
||||
|
||||
|
||||
@@ -75,3 +75,53 @@ Similarly, custom preprocessors should subclass the RLlib `preprocessor class <h
|
||||
"custom_options": {}, # extra options to pass to your preprocessor
|
||||
},
|
||||
})
|
||||
|
||||
|
||||
Customizing Policy Graphs
|
||||
-------------------------
|
||||
|
||||
For deeper customization of algorithms, you can modify the policy graphs of the agent classes. Here's an example of extending the DDPG policy graph to specify custom sub-network modules:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.agents.ddpg.ddpg_policy_graph import DDPGPolicyGraph as BaseDDPGPolicyGraph
|
||||
|
||||
class CustomPNetwork(object):
|
||||
def __init__(self, dim_actions, hiddens, activation):
|
||||
action_out = ...
|
||||
# Use sigmoid layer to bound values within (0, 1)
|
||||
# shape of action_scores is [batch_size, dim_actions]
|
||||
self.action_scores = layers.fully_connected(
|
||||
action_out, num_outputs=dim_actions, activation_fn=tf.nn.sigmoid)
|
||||
|
||||
class CustomQNetwork(object):
|
||||
def __init__(self, action_inputs, hiddens, activation):
|
||||
q_out = ...
|
||||
self.value = layers.fully_connected(
|
||||
q_out, num_outputs=1, activation_fn=None)
|
||||
|
||||
class CustomDDPGPolicyGraph(BaseDDPGPolicyGraph):
|
||||
def _build_p_network(self, obs):
|
||||
return CustomPNetwork(
|
||||
self.dim_actions,
|
||||
self.config["actor_hiddens"],
|
||||
self.config["actor_hidden_activation"]).action_scores
|
||||
|
||||
def _build_q_network(self, obs, actions):
|
||||
return CustomQNetwork(
|
||||
actions,
|
||||
self.config["critic_hiddens"],
|
||||
self.config["critic_hidden_activation"]).value
|
||||
|
||||
Then, you can create an agent with your custom policy graph by:
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
from ray.rllib.agents.ddpg.ddpg import DDPGAgent
|
||||
from custom_policy_graph import CustomDDPGPolicyGraph
|
||||
|
||||
DDPGAgent._policy_graph = CustomDDPGPolicyGraph
|
||||
agent = DDPGAgent(...)
|
||||
|
||||
That's it. In this example we overrode two methods of the existing DDPG policy graph, ``_build_p_network`` and ``_build_q_network``. The other builder methods, ``_build_action_network`` and ``_build_actor_critic_loss``, can be overridden in the same way, and you can also replace the graph class entirely.
|
||||
|
||||
@@ -56,6 +56,7 @@ Models and Preprocessors
|
||||
* `Built-in Models and Preprocessors <rllib-models.html#built-in-models-and-preprocessors>`__
|
||||
* `Custom Models <rllib-models.html#custom-models>`__
|
||||
* `Custom Preprocessors <rllib-models.html#custom-preprocessors>`__
|
||||
* `Customizing Policy Graphs <rllib-models.html#customizing-policy-graphs>`__
|
||||
|
||||
RLlib Concepts
|
||||
--------------
|
||||
|
||||
@@ -133,31 +133,14 @@ class DDPGPolicyGraph(TFPolicyGraph):
|
||||
|
||||
self.config = config
|
||||
self.cur_epsilon = 1.0
|
||||
dim_actions = action_space.shape[0]
|
||||
low_action = action_space.low
|
||||
high_action = action_space.high
|
||||
self.dim_actions = action_space.shape[0]
|
||||
self.low_action = action_space.low
|
||||
self.high_action = action_space.high
|
||||
self.actor_optimizer = tf.train.AdamOptimizer(
|
||||
learning_rate=config["actor_lr"])
|
||||
self.critic_optimizer = tf.train.AdamOptimizer(
|
||||
learning_rate=config["critic_lr"])
|
||||
|
||||
def _build_q_network(obs, actions):
|
||||
return QNetwork(
|
||||
ModelCatalog.get_model(obs, 1, config["model"]), actions,
|
||||
config["critic_hiddens"],
|
||||
config["critic_hidden_activation"]).value
|
||||
|
||||
def _build_p_network(obs):
|
||||
return PNetwork(
|
||||
ModelCatalog.get_model(obs, 1, config["model"]), dim_actions,
|
||||
config["actor_hiddens"],
|
||||
config["actor_hidden_activation"]).action_scores
|
||||
|
||||
def _build_action_network(p_values, stochastic, eps):
|
||||
return ActionNetwork(p_values, low_action, high_action, stochastic,
|
||||
eps, config["exploration_theta"],
|
||||
config["exploration_sigma"]).actions
|
||||
|
||||
# Action inputs
|
||||
self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
|
||||
self.eps = tf.placeholder(tf.float32, (), name="eps")
|
||||
@@ -166,18 +149,18 @@ class DDPGPolicyGraph(TFPolicyGraph):
|
||||
|
||||
# Actor: P (policy) network
|
||||
with tf.variable_scope(P_SCOPE) as scope:
|
||||
p_values = _build_p_network(self.cur_observations)
|
||||
p_values = self._build_p_network(self.cur_observations)
|
||||
self.p_func_vars = _scope_vars(scope.name)
|
||||
|
||||
# Action outputs
|
||||
with tf.variable_scope(A_SCOPE):
|
||||
self.output_actions = _build_action_network(
|
||||
self.output_actions = self._build_action_network(
|
||||
p_values, self.stochastic, self.eps)
|
||||
|
||||
with tf.variable_scope(A_SCOPE, reuse=True):
|
||||
exploration_sample = tf.get_variable(name="ornstein_uhlenbeck")
|
||||
self.reset_noise_op = tf.assign(exploration_sample,
|
||||
dim_actions * [.0])
|
||||
self.dim_actions * [.0])
|
||||
|
||||
# Replay inputs
|
||||
self.obs_t = tf.placeholder(
|
||||
@@ -195,39 +178,37 @@ class DDPGPolicyGraph(TFPolicyGraph):
|
||||
|
||||
# p network evaluation
|
||||
with tf.variable_scope(P_SCOPE, reuse=True) as scope:
|
||||
self.p_t = _build_p_network(self.obs_t)
|
||||
self.p_t = self._build_p_network(self.obs_t)
|
||||
|
||||
# target p network evaluation
|
||||
with tf.variable_scope(P_TARGET_SCOPE) as scope:
|
||||
p_tp1 = _build_p_network(self.obs_tp1)
|
||||
p_tp1 = self._build_p_network(self.obs_tp1)
|
||||
target_p_func_vars = _scope_vars(scope.name)
|
||||
|
||||
# Action outputs
|
||||
with tf.variable_scope(A_SCOPE, reuse=True):
|
||||
deterministic_flag = tf.constant(value=False, dtype=tf.bool)
|
||||
zero_eps = tf.constant(value=.0, dtype=tf.float32)
|
||||
output_actions = _build_action_network(
|
||||
output_actions = self._build_action_network(
|
||||
self.p_t, deterministic_flag, zero_eps)
|
||||
|
||||
output_actions_estimated = _build_action_network(
|
||||
output_actions_estimated = self._build_action_network(
|
||||
p_tp1, deterministic_flag, zero_eps)
|
||||
|
||||
# q network evaluation
|
||||
with tf.variable_scope(Q_SCOPE) as scope:
|
||||
q_t = _build_q_network(self.obs_t, self.act_t)
|
||||
q_t = self._build_q_network(self.obs_t, self.act_t)
|
||||
self.q_func_vars = _scope_vars(scope.name)
|
||||
with tf.variable_scope(Q_SCOPE, reuse=True):
|
||||
q_tp0 = _build_q_network(self.obs_t, output_actions)
|
||||
q_tp0 = self._build_q_network(self.obs_t, output_actions)
|
||||
|
||||
# target q network evaluation
|
||||
with tf.variable_scope(Q_TARGET_SCOPE) as scope:
|
||||
q_tp1 = _build_q_network(self.obs_tp1, output_actions_estimated)
|
||||
q_tp1 = self._build_q_network(self.obs_tp1,
|
||||
output_actions_estimated)
|
||||
target_q_func_vars = _scope_vars(scope.name)
|
||||
|
||||
self.loss = ActorCriticLoss(
|
||||
q_t, q_tp1, q_tp0, self.importance_weights, self.rew_t,
|
||||
self.done_mask, config["gamma"], config["n_step"],
|
||||
config["use_huber"], config["huber_threshold"])
|
||||
self.loss = self._build_actor_critic_loss(q_t, q_tp1, q_tp0)
|
||||
|
||||
if config["l2_reg"] is not None:
|
||||
for var in self.p_func_vars:
|
||||
@@ -286,6 +267,29 @@ class DDPGPolicyGraph(TFPolicyGraph):
|
||||
# Hard initial update
|
||||
self.update_target(tau=1.0)
|
||||
|
||||
def _build_q_network(self, obs, actions):
|
||||
return QNetwork(
|
||||
ModelCatalog.get_model(obs, 1, self.config["model"]), actions,
|
||||
self.config["critic_hiddens"],
|
||||
self.config["critic_hidden_activation"]).value
|
||||
|
||||
def _build_p_network(self, obs):
|
||||
return PNetwork(
|
||||
ModelCatalog.get_model(obs, 1, self.config["model"]),
|
||||
self.dim_actions, self.config["actor_hiddens"],
|
||||
self.config["actor_hidden_activation"]).action_scores
|
||||
|
||||
def _build_action_network(self, p_values, stochastic, eps):
|
||||
return ActionNetwork(p_values, self.low_action, self.high_action,
|
||||
stochastic, eps, self.config["exploration_theta"],
|
||||
self.config["exploration_sigma"]).actions
|
||||
|
||||
def _build_actor_critic_loss(self, q_t, q_tp1, q_tp0):
|
||||
return ActorCriticLoss(
|
||||
q_t, q_tp1, q_tp0, self.importance_weights, self.rew_t,
|
||||
self.done_mask, self.config["gamma"], self.config["n_step"],
|
||||
self.config["use_huber"], self.config["huber_threshold"])
|
||||
|
||||
def gradients(self, optimizer):
|
||||
if self.config["grad_norm_clipping"] is not None:
|
||||
actor_grads_and_vars = _minimize_and_clip(
|
||||
|
||||
@@ -71,7 +71,6 @@ class QLoss(object):
|
||||
done_mask,
|
||||
gamma=0.99,
|
||||
n_step=1):
|
||||
|
||||
q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best
|
||||
|
||||
# compute RHS of bellman equation
|
||||
@@ -93,12 +92,7 @@ class DQNPolicyGraph(TFPolicyGraph):
|
||||
|
||||
self.config = config
|
||||
self.cur_epsilon = 1.0
|
||||
num_actions = action_space.n
|
||||
|
||||
def _build_q_network(obs):
|
||||
return QNetwork(
|
||||
ModelCatalog.get_model(obs, 1, config["model"]), num_actions,
|
||||
config["dueling"], config["hiddens"]).value
|
||||
self.num_actions = action_space.n
|
||||
|
||||
# Action inputs
|
||||
self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
|
||||
@@ -108,13 +102,11 @@ class DQNPolicyGraph(TFPolicyGraph):
|
||||
|
||||
# Action Q network
|
||||
with tf.variable_scope(Q_SCOPE) as scope:
|
||||
q_values = _build_q_network(self.cur_observations)
|
||||
q_values = self._build_q_network(self.cur_observations)
|
||||
self.q_func_vars = _scope_vars(scope.name)
|
||||
|
||||
# Action outputs
|
||||
self.output_actions = QValuePolicy(q_values, self.cur_observations,
|
||||
num_actions, self.stochastic,
|
||||
self.eps).action
|
||||
self.output_actions = self._build_q_value_policy(q_values)
|
||||
|
||||
# Replay inputs
|
||||
self.obs_t = tf.placeholder(
|
||||
@@ -129,31 +121,29 @@ class DQNPolicyGraph(TFPolicyGraph):
|
||||
|
||||
# q network evaluation
|
||||
with tf.variable_scope(Q_SCOPE, reuse=True):
|
||||
q_t = _build_q_network(self.obs_t)
|
||||
q_t = self._build_q_network(self.obs_t)
|
||||
|
||||
# target q network evaluation
|
||||
with tf.variable_scope(Q_TARGET_SCOPE) as scope:
|
||||
q_tp1 = _build_q_network(self.obs_tp1)
|
||||
q_tp1 = self._build_q_network(self.obs_tp1)
|
||||
self.target_q_func_vars = _scope_vars(scope.name)
|
||||
|
||||
# q scores for actions which we know were selected in the given state.
|
||||
q_t_selected = tf.reduce_sum(q_t * tf.one_hot(self.act_t, num_actions),
|
||||
1)
|
||||
q_t_selected = tf.reduce_sum(
|
||||
q_t * tf.one_hot(self.act_t, self.num_actions), 1)
|
||||
|
||||
# compute estimate of best possible value starting from state at t + 1
|
||||
if config["double_q"]:
|
||||
with tf.variable_scope(Q_SCOPE, reuse=True):
|
||||
q_tp1_using_online_net = _build_q_network(self.obs_tp1)
|
||||
q_tp1_using_online_net = self._build_q_network(self.obs_tp1)
|
||||
q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
|
||||
q_tp1_best = tf.reduce_sum(
|
||||
q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions),
|
||||
1)
|
||||
q_tp1 * tf.one_hot(q_tp1_best_using_online_net,
|
||||
self.num_actions), 1)
|
||||
else:
|
||||
q_tp1_best = tf.reduce_max(q_tp1, 1)
|
||||
|
||||
self.loss = QLoss(q_t_selected, q_tp1_best, self.importance_weights,
|
||||
self.rew_t, self.done_mask, config["gamma"],
|
||||
config["n_step"])
|
||||
self.loss = self._build_q_loss(q_t_selected, q_tp1_best)
|
||||
|
||||
# update_target_fn will be called periodically to copy Q network to
|
||||
# target Q network
|
||||
@@ -185,6 +175,21 @@ class DQNPolicyGraph(TFPolicyGraph):
|
||||
loss_inputs=self.loss_inputs)
|
||||
self.sess.run(tf.global_variables_initializer())
|
||||
|
||||
def _build_q_network(self, obs):
|
||||
return QNetwork(
|
||||
ModelCatalog.get_model(obs, 1,
|
||||
self.config["model"]), self.num_actions,
|
||||
self.config["dueling"], self.config["hiddens"]).value
|
||||
|
||||
def _build_q_value_policy(self, q_values):
|
||||
return QValuePolicy(q_values, self.cur_observations, self.num_actions,
|
||||
self.stochastic, self.eps).action
|
||||
|
||||
def _build_q_loss(self, q_t_selected, q_tp1_best):
|
||||
return QLoss(q_t_selected, q_tp1_best, self.importance_weights,
|
||||
self.rew_t, self.done_mask, self.config["gamma"],
|
||||
self.config["n_step"])
|
||||
|
||||
def optimizer(self):
|
||||
return tf.train.AdamOptimizer(learning_rate=self.config["lr"])
|
||||
|
||||
|
||||
Reference in New Issue
Block a user