[rllib] dqn/ddpg policy customization (#2445)

* dqn policy update - more customization

* docs for custom DQN graph

* Update rllib-training.rst

* Update rllib-models.rst

* Update rllib.rst

* Update rllib-training.rst

* Update rllib-concepts.rst

* yapf codestyle
This commit is contained in:
Sergey Kolesnikov
2018-07-23 00:47:14 +03:00
committed by Eric Liang
parent 68660453e4
commit 05490b8cb9
5 changed files with 116 additions and 56 deletions
+1 -1
View File
@@ -15,7 +15,7 @@ Most interaction with deep learning frameworks is isolated to the `PolicyGraph i
Policy Evaluation
-----------------
Given an environment and policy graph, policy evaluation produces `batches <https://github.com/ray-project/ray/blob/master/python/ray/rllib/evaluation/sample_batch.py>`__ of experiences. This is your classic "environment interaction loop". Efficient policy evaluation can be burdensome to get right, especially when leveraging vectorization, RNNs, or when operating in a multi-agent environment. RLlib provides a `PolicyEvaluator <https://github.com/ray-project/ray/blob/master/python/ray/rllib/evaluation/policy_evaluator.py>`__ class that manages all of this, and this class is used in most RLlib algorithm.
Given an environment and policy graph, policy evaluation produces `batches <https://github.com/ray-project/ray/blob/master/python/ray/rllib/evaluation/sample_batch.py>`__ of experiences. This is your classic "environment interaction loop". Efficient policy evaluation can be burdensome to get right, especially when leveraging vectorization, RNNs, or when operating in a multi-agent environment. RLlib provides a `PolicyEvaluator <https://github.com/ray-project/ray/blob/master/python/ray/rllib/evaluation/policy_evaluator.py>`__ class that manages all of this, and this class is used in most RLlib algorithms.
You can also use policy evaluation standalone to produce batches of experiences. This can be done by calling ``ev.sample()`` on an evaluator instance, or ``ev.sample.remote()`` in parallel on evaluator instances created as Ray actors (see ``PolicyEvaluator.as_remote()``).
+50
View File
@@ -75,3 +75,53 @@ Similarly, custom preprocessors should subclass the RLlib `preprocessor class <h
"custom_options": {}, # extra options to pass to your preprocessor
},
})
Customizing Policy Graphs
-------------------------
For deeper customization of algorithms, you can modify the policy graphs of the agent classes. Here's an example of extending the DDPG policy graph to specify custom sub-network modules:
.. code-block:: python
from ray.rllib.models import ModelCatalog
from ray.rllib.agents.ddpg.ddpg_policy_graph import DDPGPolicyGraph as BaseDDPGPolicyGraph
class CustomPNetwork(object):
def __init__(self, dim_actions, hiddens, activation):
action_out = ...
# Use sigmoid layer to bound values within (0, 1)
# shape of action_scores is [batch_size, dim_actions]
self.action_scores = layers.fully_connected(
action_out, num_outputs=dim_actions, activation_fn=tf.nn.sigmoid)
class CustomQNetwork(object):
def __init__(self, action_inputs, hiddens, activation):
q_out = ...
self.value = layers.fully_connected(
q_out, num_outputs=1, activation_fn=None)
class CustomDDPGPolicyGraph(BaseDDPGPolicyGraph):
def _build_p_network(self, obs):
return CustomPNetwork(
self.dim_actions,
self.config["actor_hiddens"],
self.config["actor_hidden_activation"]).action_scores
def _build_q_network(self, obs, actions):
return CustomQNetwork(
actions,
self.config["critic_hiddens"],
self.config["critic_hidden_activation"]).value
Then, you can create an agent with your custom policy graph by:
.. code-block:: python
from ray.rllib.agents.ddpg.ddpg import DDPGAgent
from custom_policy_graph import CustomDDPGPolicyGraph
DDPGAgent._policy_graph = CustomDDPGPolicyGraph
agent = DDPGAgent(...)
That's it. In this example we overrode two methods of the existing DDPG policy graph, ``_build_p_network`` and ``_build_q_network``; ``_build_action_network`` and ``_build_actor_critic_loss`` can be overridden in the same way. You can also replace the graph class entirely.
+1
View File
@@ -56,6 +56,7 @@ Models and Preprocessors
* `Built-in Models and Preprocessors <rllib-models.html#built-in-models-and-preprocessors>`__
* `Custom Models <rllib-models.html#custom-models>`__
* `Custom Preprocessors <rllib-models.html#custom-preprocessors>`__
* `Customizing Policy Graphs <rllib-models.html#customizing-policy-graphs>`__
RLlib Concepts
--------------
@@ -133,31 +133,14 @@ class DDPGPolicyGraph(TFPolicyGraph):
self.config = config
self.cur_epsilon = 1.0
dim_actions = action_space.shape[0]
low_action = action_space.low
high_action = action_space.high
self.dim_actions = action_space.shape[0]
self.low_action = action_space.low
self.high_action = action_space.high
self.actor_optimizer = tf.train.AdamOptimizer(
learning_rate=config["actor_lr"])
self.critic_optimizer = tf.train.AdamOptimizer(
learning_rate=config["critic_lr"])
def _build_q_network(obs, actions):
return QNetwork(
ModelCatalog.get_model(obs, 1, config["model"]), actions,
config["critic_hiddens"],
config["critic_hidden_activation"]).value
def _build_p_network(obs):
return PNetwork(
ModelCatalog.get_model(obs, 1, config["model"]), dim_actions,
config["actor_hiddens"],
config["actor_hidden_activation"]).action_scores
def _build_action_network(p_values, stochastic, eps):
return ActionNetwork(p_values, low_action, high_action, stochastic,
eps, config["exploration_theta"],
config["exploration_sigma"]).actions
# Action inputs
self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
self.eps = tf.placeholder(tf.float32, (), name="eps")
@@ -166,18 +149,18 @@ class DDPGPolicyGraph(TFPolicyGraph):
# Actor: P (policy) network
with tf.variable_scope(P_SCOPE) as scope:
p_values = _build_p_network(self.cur_observations)
p_values = self._build_p_network(self.cur_observations)
self.p_func_vars = _scope_vars(scope.name)
# Action outputs
with tf.variable_scope(A_SCOPE):
self.output_actions = _build_action_network(
self.output_actions = self._build_action_network(
p_values, self.stochastic, self.eps)
with tf.variable_scope(A_SCOPE, reuse=True):
exploration_sample = tf.get_variable(name="ornstein_uhlenbeck")
self.reset_noise_op = tf.assign(exploration_sample,
dim_actions * [.0])
self.dim_actions * [.0])
# Replay inputs
self.obs_t = tf.placeholder(
@@ -195,39 +178,37 @@ class DDPGPolicyGraph(TFPolicyGraph):
# p network evaluation
with tf.variable_scope(P_SCOPE, reuse=True) as scope:
self.p_t = _build_p_network(self.obs_t)
self.p_t = self._build_p_network(self.obs_t)
# target p network evaluation
with tf.variable_scope(P_TARGET_SCOPE) as scope:
p_tp1 = _build_p_network(self.obs_tp1)
p_tp1 = self._build_p_network(self.obs_tp1)
target_p_func_vars = _scope_vars(scope.name)
# Action outputs
with tf.variable_scope(A_SCOPE, reuse=True):
deterministic_flag = tf.constant(value=False, dtype=tf.bool)
zero_eps = tf.constant(value=.0, dtype=tf.float32)
output_actions = _build_action_network(
output_actions = self._build_action_network(
self.p_t, deterministic_flag, zero_eps)
output_actions_estimated = _build_action_network(
output_actions_estimated = self._build_action_network(
p_tp1, deterministic_flag, zero_eps)
# q network evaluation
with tf.variable_scope(Q_SCOPE) as scope:
q_t = _build_q_network(self.obs_t, self.act_t)
q_t = self._build_q_network(self.obs_t, self.act_t)
self.q_func_vars = _scope_vars(scope.name)
with tf.variable_scope(Q_SCOPE, reuse=True):
q_tp0 = _build_q_network(self.obs_t, output_actions)
q_tp0 = self._build_q_network(self.obs_t, output_actions)
# target q network evaluation
with tf.variable_scope(Q_TARGET_SCOPE) as scope:
q_tp1 = _build_q_network(self.obs_tp1, output_actions_estimated)
q_tp1 = self._build_q_network(self.obs_tp1,
output_actions_estimated)
target_q_func_vars = _scope_vars(scope.name)
self.loss = ActorCriticLoss(
q_t, q_tp1, q_tp0, self.importance_weights, self.rew_t,
self.done_mask, config["gamma"], config["n_step"],
config["use_huber"], config["huber_threshold"])
self.loss = self._build_actor_critic_loss(q_t, q_tp1, q_tp0)
if config["l2_reg"] is not None:
for var in self.p_func_vars:
@@ -286,6 +267,29 @@ class DDPGPolicyGraph(TFPolicyGraph):
# Hard initial update
self.update_target(tau=1.0)
def _build_q_network(self, obs, actions):
    """Build the critic (Q) network and return its ``value`` output.

    This is an overridable hook: a subclass can return the output of its
    own critic module instead.

    Args:
        obs: observation input passed to ``ModelCatalog.get_model``.
        actions: action input forwarded to ``QNetwork``.

    Returns:
        The ``value`` attribute of the constructed ``QNetwork``.
    """
    cfg = self.config
    # The catalog model is created first and then handed to QNetwork.
    base_model = ModelCatalog.get_model(obs, 1, cfg["model"])
    critic = QNetwork(base_model, actions, cfg["critic_hiddens"],
                      cfg["critic_hidden_activation"])
    return critic.value
def _build_p_network(self, obs):
    """Build the actor (policy) network and return its ``action_scores``.

    This is an overridable hook: a subclass can return the output of its
    own actor module instead.

    Args:
        obs: observation input passed to ``ModelCatalog.get_model``.

    Returns:
        The ``action_scores`` attribute of the constructed ``PNetwork``.
    """
    cfg = self.config
    # The catalog model is created first and then handed to PNetwork.
    base_model = ModelCatalog.get_model(obs, 1, cfg["model"])
    actor = PNetwork(base_model, self.dim_actions, cfg["actor_hiddens"],
                     cfg["actor_hidden_activation"])
    return actor.action_scores
def _build_action_network(self, p_values, stochastic, eps):
    """Build the action-output network and return its ``actions``.

    Args:
        p_values: output of the policy network.
        stochastic: flag forwarded unchanged to ``ActionNetwork``.
        eps: exploration epsilon forwarded unchanged to ``ActionNetwork``.

    Returns:
        The ``actions`` attribute of the constructed ``ActionNetwork``.
    """
    cfg = self.config
    # Action bounds come from the instance; noise parameters from config.
    action_net = ActionNetwork(
        p_values, self.low_action, self.high_action, stochastic, eps,
        cfg["exploration_theta"], cfg["exploration_sigma"])
    return action_net.actions
def _build_actor_critic_loss(self, q_t, q_tp1, q_tp0):
    """Build and return the ``ActorCriticLoss`` object for this graph.

    Reads ``self.importance_weights``, ``self.rew_t`` and
    ``self.done_mask``, so those attributes must already be set when this
    hook is called.

    Args:
        q_t: first Q-value argument of ``ActorCriticLoss``.
        q_tp1: second Q-value argument of ``ActorCriticLoss``.
        q_tp0: third Q-value argument of ``ActorCriticLoss``.

    Returns:
        The constructed ``ActorCriticLoss`` instance.
    """
    cfg = self.config
    weights = self.importance_weights
    rewards = self.rew_t
    dones = self.done_mask
    return ActorCriticLoss(
        q_t, q_tp1, q_tp0, weights, rewards, dones,
        cfg["gamma"], cfg["n_step"],
        cfg["use_huber"], cfg["huber_threshold"])
def gradients(self, optimizer):
if self.config["grad_norm_clipping"] is not None:
actor_grads_and_vars = _minimize_and_clip(
+26 -21
View File
@@ -71,7 +71,6 @@ class QLoss(object):
done_mask,
gamma=0.99,
n_step=1):
q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best
# compute RHS of bellman equation
@@ -93,12 +92,7 @@ class DQNPolicyGraph(TFPolicyGraph):
self.config = config
self.cur_epsilon = 1.0
num_actions = action_space.n
def _build_q_network(obs):
return QNetwork(
ModelCatalog.get_model(obs, 1, config["model"]), num_actions,
config["dueling"], config["hiddens"]).value
self.num_actions = action_space.n
# Action inputs
self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
@@ -108,13 +102,11 @@ class DQNPolicyGraph(TFPolicyGraph):
# Action Q network
with tf.variable_scope(Q_SCOPE) as scope:
q_values = _build_q_network(self.cur_observations)
q_values = self._build_q_network(self.cur_observations)
self.q_func_vars = _scope_vars(scope.name)
# Action outputs
self.output_actions = QValuePolicy(q_values, self.cur_observations,
num_actions, self.stochastic,
self.eps).action
self.output_actions = self._build_q_value_policy(q_values)
# Replay inputs
self.obs_t = tf.placeholder(
@@ -129,31 +121,29 @@ class DQNPolicyGraph(TFPolicyGraph):
# q network evaluation
with tf.variable_scope(Q_SCOPE, reuse=True):
q_t = _build_q_network(self.obs_t)
q_t = self._build_q_network(self.obs_t)
# target q network evaluation
with tf.variable_scope(Q_TARGET_SCOPE) as scope:
q_tp1 = _build_q_network(self.obs_tp1)
q_tp1 = self._build_q_network(self.obs_tp1)
self.target_q_func_vars = _scope_vars(scope.name)
# q scores for actions which we know were selected in the given state.
q_t_selected = tf.reduce_sum(q_t * tf.one_hot(self.act_t, num_actions),
1)
q_t_selected = tf.reduce_sum(
q_t * tf.one_hot(self.act_t, self.num_actions), 1)
# compute estimate of best possible value starting from state at t + 1
if config["double_q"]:
with tf.variable_scope(Q_SCOPE, reuse=True):
q_tp1_using_online_net = _build_q_network(self.obs_tp1)
q_tp1_using_online_net = self._build_q_network(self.obs_tp1)
q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
q_tp1_best = tf.reduce_sum(
q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions),
1)
q_tp1 * tf.one_hot(q_tp1_best_using_online_net,
self.num_actions), 1)
else:
q_tp1_best = tf.reduce_max(q_tp1, 1)
self.loss = QLoss(q_t_selected, q_tp1_best, self.importance_weights,
self.rew_t, self.done_mask, config["gamma"],
config["n_step"])
self.loss = self._build_q_loss(q_t_selected, q_tp1_best)
# update_target_fn will be called periodically to copy Q network to
# target Q network
@@ -185,6 +175,21 @@ class DQNPolicyGraph(TFPolicyGraph):
loss_inputs=self.loss_inputs)
self.sess.run(tf.global_variables_initializer())
def _build_q_network(self, obs):
    """Build the Q network for ``obs`` and return its ``value`` output.

    This is an overridable hook: a subclass can return the output of its
    own Q module instead.

    Args:
        obs: observation input passed to ``ModelCatalog.get_model``.

    Returns:
        The ``value`` attribute of the constructed ``QNetwork``.
    """
    cfg = self.config
    # The catalog model is created first and then handed to QNetwork.
    base_model = ModelCatalog.get_model(obs, 1, cfg["model"])
    q_net = QNetwork(base_model, self.num_actions, cfg["dueling"],
                     cfg["hiddens"])
    return q_net.value
def _build_q_value_policy(self, q_values):
    """Build the action-selection policy over ``q_values``.

    Reads ``self.cur_observations``, ``self.stochastic`` and ``self.eps``,
    so those attributes must already be set when this hook is called.

    Args:
        q_values: output of the Q network.

    Returns:
        The ``action`` attribute of the constructed ``QValuePolicy``.
    """
    policy = QValuePolicy(
        q_values,
        self.cur_observations,
        self.num_actions,
        self.stochastic,
        self.eps,
    )
    return policy.action
def _build_q_loss(self, q_t_selected, q_tp1_best):
    """Build and return the ``QLoss`` object for this graph.

    Reads ``self.importance_weights``, ``self.rew_t`` and
    ``self.done_mask``, so those attributes must already be set when this
    hook is called.

    Args:
        q_t_selected: Q values of the actions that were taken.
        q_tp1_best: estimate of the best next-state value.

    Returns:
        The constructed ``QLoss`` instance.
    """
    cfg = self.config
    weights = self.importance_weights
    rewards = self.rew_t
    dones = self.done_mask
    return QLoss(q_t_selected, q_tp1_best, weights, rewards, dones,
                 cfg["gamma"], cfg["n_step"])
def optimizer(self):
return tf.train.AdamOptimizer(learning_rate=self.config["lr"])