From 05490b8cb9b321ed12e4e3e7df75545957a8766f Mon Sep 17 00:00:00 2001
From: Sergey Kolesnikov <scitator@gmail.com>
Date: Mon, 23 Jul 2018 00:47:14 +0300
Subject: [PATCH] [rllib] dqn/ddpg policy customization (#2445)

* dqn policy update - more customization

* docs for custom DQN graph

* Update rllib-training.rst

* Update rllib-models.rst

* Update rllib.rst

* Update rllib-training.rst

* Update rllib-concepts.rst

* yapf codestyle
---
 doc/source/rllib-concepts.rst                 |  2 +-
 doc/source/rllib-models.rst                   | 50 +++++++++++++
 doc/source/rllib.rst                          |  1 +
 .../rllib/agents/ddpg/ddpg_policy_graph.py    | 72 ++++++++++---------
 .../ray/rllib/agents/dqn/dqn_policy_graph.py  | 47 ++++++------
 5 files changed, 116 insertions(+), 56 deletions(-)

diff --git a/doc/source/rllib-concepts.rst b/doc/source/rllib-concepts.rst
index dc8f50688..f752279cb 100644
--- a/doc/source/rllib-concepts.rst
+++ b/doc/source/rllib-concepts.rst
@@ -15,7 +15,7 @@ Most interaction with deep learning frameworks is isolated to the `PolicyGraph i
 Policy Evaluation
 -----------------
 
-Given an environment and policy graph, policy evaluation produces `batches <https://github.com/ray-project/ray/blob/master/python/ray/rllib/evaluation/sample_batch.py>`__ of experiences. This is your classic "environment interaction loop". Efficient policy evaluation can be burdensome to get right, especially when leveraging vectorization, RNNs, or when operating in a multi-agent environment. RLlib provides a `PolicyEvaluator <https://github.com/ray-project/ray/blob/master/python/ray/rllib/evaluation/policy_evaluator.py>`__ class that manages all of this, and this class is used in most RLlib algorithm.
+Given an environment and policy graph, policy evaluation produces `batches <https://github.com/ray-project/ray/blob/master/python/ray/rllib/evaluation/sample_batch.py>`__ of experiences. This is your classic "environment interaction loop". Efficient policy evaluation can be burdensome to get right, especially when leveraging vectorization, RNNs, or when operating in a multi-agent environment. RLlib provides a `PolicyEvaluator <https://github.com/ray-project/ray/blob/master/python/ray/rllib/evaluation/policy_evaluator.py>`__ class that manages all of this, and this class is used in most RLlib algorithms.
 
 You can also use policy evaluation standalone to produce batches of experiences. This can be done by calling ``ev.sample()`` on an evaluator instance, or ``ev.sample.remote()`` in parallel on evaluator instances created as Ray actors (see ``PolicyEvalutor.as_remote()``).
 
diff --git a/doc/source/rllib-models.rst b/doc/source/rllib-models.rst
index 938824ec5..098cfb6ec 100644
--- a/doc/source/rllib-models.rst
+++ b/doc/source/rllib-models.rst
@@ -75,3 +75,53 @@ Similarly, custom preprocessors should subclass the RLlib `preprocessor class <h
             "custom_options": {},  # extra options to pass to your preprocessor
         },
     })
+
+
+Customizing Policy Graphs
+-------------------------
+
+For deeper customization of algorithms, you can modify the policy graphs of the agent classes. Here's an example of extending the DDPG policy graph to specify custom sub-network modules:
+
+.. code-block:: python
+
+    from ray.rllib.models import ModelCatalog
+    from ray.rllib.agents.ddpg.ddpg_policy_graph import DDPGPolicyGraph as BaseDDPGPolicyGraph
+
+    class CustomPNetwork(object):
+        def __init__(self, dim_actions, hiddens, activation):
+            action_out = ...
+            # Use sigmoid layer to bound values within (0, 1)
+            # shape of action_scores is [batch_size, dim_actions]
+            self.action_scores = layers.fully_connected(
+                action_out, num_outputs=dim_actions, activation_fn=tf.nn.sigmoid)
+
+    class CustomQNetwork(object):
+        def __init__(self, action_inputs, hiddens, activation):
+            q_out = ...
+            self.value = layers.fully_connected(
+                q_out, num_outputs=1, activation_fn=None)
+
+    class CustomDDPGPolicyGraph(BaseDDPGPolicyGraph):
+        def _build_p_network(self, obs):
+            return CustomPNetwork(
+                self.dim_actions,
+                self.config["actor_hiddens"],
+                self.config["actor_hidden_activation"]).action_scores
+
+        def _build_q_network(self, obs, actions):
+            return CustomQNetwork(
+                actions,
+                self.config["critic_hiddens"],
+                self.config["critic_hidden_activation"]).value
+
+Then, you can create an agent that uses your custom policy graph as follows:
+
+.. code-block:: python
+
+    from ray.rllib.agents.ddpg.ddpg import DDPGAgent
+    from custom_policy_graph import CustomDDPGPolicyGraph
+
+    DDPGAgent._policy_graph = CustomDDPGPolicyGraph
+    agent = DDPGAgent(...)
+
+That's it. In this example we overrode two methods of the existing DDPG policy graph, ``_build_p_network`` and ``_build_q_network``. The other builder methods, ``_build_action_network`` and ``_build_actor_critic_loss``, can be overridden in the same way, or you can replace the graph class entirely.
diff --git a/doc/source/rllib.rst b/doc/source/rllib.rst
index 1e1d0d915..5f9d94681 100644
--- a/doc/source/rllib.rst
+++ b/doc/source/rllib.rst
@@ -56,6 +56,7 @@ Models and Preprocessors
 * `Built-in Models and Preprocessors <rllib-models.html#built-in-models-and-preprocessors>`__
 * `Custom Models <rllib-models.html#custom-models>`__
 * `Custom Preprocessors <rllib-models.html#custom-preprocessors>`__
+* `Customizing Policy Graphs <rllib-models.html#customizing-policy-graphs>`__
 
 RLlib Concepts
 --------------
diff --git a/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py b/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py
index ceae0d0f0..a6f26885f 100644
--- a/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py
+++ b/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py
@@ -133,31 +133,14 @@ class DDPGPolicyGraph(TFPolicyGraph):
 
         self.config = config
         self.cur_epsilon = 1.0
-        dim_actions = action_space.shape[0]
-        low_action = action_space.low
-        high_action = action_space.high
+        self.dim_actions = action_space.shape[0]
+        self.low_action = action_space.low
+        self.high_action = action_space.high
         self.actor_optimizer = tf.train.AdamOptimizer(
             learning_rate=config["actor_lr"])
         self.critic_optimizer = tf.train.AdamOptimizer(
             learning_rate=config["critic_lr"])
 
-        def _build_q_network(obs, actions):
-            return QNetwork(
-                ModelCatalog.get_model(obs, 1, config["model"]), actions,
-                config["critic_hiddens"],
-                config["critic_hidden_activation"]).value
-
-        def _build_p_network(obs):
-            return PNetwork(
-                ModelCatalog.get_model(obs, 1, config["model"]), dim_actions,
-                config["actor_hiddens"],
-                config["actor_hidden_activation"]).action_scores
-
-        def _build_action_network(p_values, stochastic, eps):
-            return ActionNetwork(p_values, low_action, high_action, stochastic,
-                                 eps, config["exploration_theta"],
-                                 config["exploration_sigma"]).actions
-
         # Action inputs
         self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
         self.eps = tf.placeholder(tf.float32, (), name="eps")
@@ -166,18 +149,18 @@ class DDPGPolicyGraph(TFPolicyGraph):
 
         # Actor: P (policy) network
         with tf.variable_scope(P_SCOPE) as scope:
-            p_values = _build_p_network(self.cur_observations)
+            p_values = self._build_p_network(self.cur_observations)
             self.p_func_vars = _scope_vars(scope.name)
 
         # Action outputs
         with tf.variable_scope(A_SCOPE):
-            self.output_actions = _build_action_network(
+            self.output_actions = self._build_action_network(
                 p_values, self.stochastic, self.eps)
 
         with tf.variable_scope(A_SCOPE, reuse=True):
             exploration_sample = tf.get_variable(name="ornstein_uhlenbeck")
             self.reset_noise_op = tf.assign(exploration_sample,
-                                            dim_actions * [.0])
+                                            self.dim_actions * [.0])
 
         # Replay inputs
         self.obs_t = tf.placeholder(
@@ -195,39 +178,37 @@ class DDPGPolicyGraph(TFPolicyGraph):
 
         # p network evaluation
         with tf.variable_scope(P_SCOPE, reuse=True) as scope:
-            self.p_t = _build_p_network(self.obs_t)
+            self.p_t = self._build_p_network(self.obs_t)
 
         # target p network evaluation
         with tf.variable_scope(P_TARGET_SCOPE) as scope:
-            p_tp1 = _build_p_network(self.obs_tp1)
+            p_tp1 = self._build_p_network(self.obs_tp1)
             target_p_func_vars = _scope_vars(scope.name)
 
         # Action outputs
         with tf.variable_scope(A_SCOPE, reuse=True):
             deterministic_flag = tf.constant(value=False, dtype=tf.bool)
             zero_eps = tf.constant(value=.0, dtype=tf.float32)
-            output_actions = _build_action_network(
+            output_actions = self._build_action_network(
                 self.p_t, deterministic_flag, zero_eps)
 
-            output_actions_estimated = _build_action_network(
+            output_actions_estimated = self._build_action_network(
                 p_tp1, deterministic_flag, zero_eps)
 
         # q network evaluation
         with tf.variable_scope(Q_SCOPE) as scope:
-            q_t = _build_q_network(self.obs_t, self.act_t)
+            q_t = self._build_q_network(self.obs_t, self.act_t)
             self.q_func_vars = _scope_vars(scope.name)
         with tf.variable_scope(Q_SCOPE, reuse=True):
-            q_tp0 = _build_q_network(self.obs_t, output_actions)
+            q_tp0 = self._build_q_network(self.obs_t, output_actions)
 
         # target q network evalution
         with tf.variable_scope(Q_TARGET_SCOPE) as scope:
-            q_tp1 = _build_q_network(self.obs_tp1, output_actions_estimated)
+            q_tp1 = self._build_q_network(self.obs_tp1,
+                                          output_actions_estimated)
             target_q_func_vars = _scope_vars(scope.name)
 
-        self.loss = ActorCriticLoss(
-            q_t, q_tp1, q_tp0, self.importance_weights, self.rew_t,
-            self.done_mask, config["gamma"], config["n_step"],
-            config["use_huber"], config["huber_threshold"])
+        self.loss = self._build_actor_critic_loss(q_t, q_tp1, q_tp0)
 
         if config["l2_reg"] is not None:
             for var in self.p_func_vars:
@@ -286,6 +267,29 @@ class DDPGPolicyGraph(TFPolicyGraph):
         # Hard initial update
         self.update_target(tau=1.0)
 
+    def _build_q_network(self, obs, actions):
+        return QNetwork(
+            ModelCatalog.get_model(obs, 1, self.config["model"]), actions,
+            self.config["critic_hiddens"],
+            self.config["critic_hidden_activation"]).value
+
+    def _build_p_network(self, obs):
+        return PNetwork(
+            ModelCatalog.get_model(obs, 1, self.config["model"]),
+            self.dim_actions, self.config["actor_hiddens"],
+            self.config["actor_hidden_activation"]).action_scores
+
+    def _build_action_network(self, p_values, stochastic, eps):
+        return ActionNetwork(p_values, self.low_action, self.high_action,
+                             stochastic, eps, self.config["exploration_theta"],
+                             self.config["exploration_sigma"]).actions
+
+    def _build_actor_critic_loss(self, q_t, q_tp1, q_tp0):
+        return ActorCriticLoss(
+            q_t, q_tp1, q_tp0, self.importance_weights, self.rew_t,
+            self.done_mask, self.config["gamma"], self.config["n_step"],
+            self.config["use_huber"], self.config["huber_threshold"])
+
     def gradients(self, optimizer):
         if self.config["grad_norm_clipping"] is not None:
             actor_grads_and_vars = _minimize_and_clip(
diff --git a/python/ray/rllib/agents/dqn/dqn_policy_graph.py b/python/ray/rllib/agents/dqn/dqn_policy_graph.py
index f553ad325..6dccdede1 100644
--- a/python/ray/rllib/agents/dqn/dqn_policy_graph.py
+++ b/python/ray/rllib/agents/dqn/dqn_policy_graph.py
@@ -71,7 +71,6 @@ class QLoss(object):
                  done_mask,
                  gamma=0.99,
                  n_step=1):
-
         q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best
 
         # compute RHS of bellman equation
@@ -93,12 +92,7 @@ class DQNPolicyGraph(TFPolicyGraph):
 
         self.config = config
         self.cur_epsilon = 1.0
-        num_actions = action_space.n
-
-        def _build_q_network(obs):
-            return QNetwork(
-                ModelCatalog.get_model(obs, 1, config["model"]), num_actions,
-                config["dueling"], config["hiddens"]).value
+        self.num_actions = action_space.n
 
         # Action inputs
         self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
@@ -108,13 +102,11 @@ class DQNPolicyGraph(TFPolicyGraph):
 
         # Action Q network
         with tf.variable_scope(Q_SCOPE) as scope:
-            q_values = _build_q_network(self.cur_observations)
+            q_values = self._build_q_network(self.cur_observations)
             self.q_func_vars = _scope_vars(scope.name)
 
         # Action outputs
-        self.output_actions = QValuePolicy(q_values, self.cur_observations,
-                                           num_actions, self.stochastic,
-                                           self.eps).action
+        self.output_actions = self._build_q_value_policy(q_values)
 
         # Replay inputs
         self.obs_t = tf.placeholder(
@@ -129,31 +121,29 @@ class DQNPolicyGraph(TFPolicyGraph):
 
         # q network evaluation
         with tf.variable_scope(Q_SCOPE, reuse=True):
-            q_t = _build_q_network(self.obs_t)
+            q_t = self._build_q_network(self.obs_t)
 
         # target q network evalution
         with tf.variable_scope(Q_TARGET_SCOPE) as scope:
-            q_tp1 = _build_q_network(self.obs_tp1)
+            q_tp1 = self._build_q_network(self.obs_tp1)
             self.target_q_func_vars = _scope_vars(scope.name)
 
         # q scores for actions which we know were selected in the given state.
-        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(self.act_t, num_actions),
-                                     1)
+        q_t_selected = tf.reduce_sum(
+            q_t * tf.one_hot(self.act_t, self.num_actions), 1)
 
         # compute estimate of best possible value starting from state at t + 1
         if config["double_q"]:
             with tf.variable_scope(Q_SCOPE, reuse=True):
-                q_tp1_using_online_net = _build_q_network(self.obs_tp1)
+                q_tp1_using_online_net = self._build_q_network(self.obs_tp1)
             q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
             q_tp1_best = tf.reduce_sum(
-                q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions),
-                1)
+                q_tp1 * tf.one_hot(q_tp1_best_using_online_net,
+                                   self.num_actions), 1)
         else:
             q_tp1_best = tf.reduce_max(q_tp1, 1)
 
-        self.loss = QLoss(q_t_selected, q_tp1_best, self.importance_weights,
-                          self.rew_t, self.done_mask, config["gamma"],
-                          config["n_step"])
+        self.loss = self._build_q_loss(q_t_selected, q_tp1_best)
 
         # update_target_fn will be called periodically to copy Q network to
         # target Q network
@@ -185,6 +175,21 @@ class DQNPolicyGraph(TFPolicyGraph):
             loss_inputs=self.loss_inputs)
         self.sess.run(tf.global_variables_initializer())
 
+    def _build_q_network(self, obs):
+        return QNetwork(
+            ModelCatalog.get_model(obs, 1,
+                                   self.config["model"]), self.num_actions,
+            self.config["dueling"], self.config["hiddens"]).value
+
+    def _build_q_value_policy(self, q_values):
+        return QValuePolicy(q_values, self.cur_observations, self.num_actions,
+                            self.stochastic, self.eps).action
+
+    def _build_q_loss(self, q_t_selected, q_tp1_best):
+        return QLoss(q_t_selected, q_tp1_best, self.importance_weights,
+                     self.rew_t, self.done_mask, self.config["gamma"],
+                     self.config["n_step"])
+
     def optimizer(self):
         return tf.train.AdamOptimizer(learning_rate=self.config["lr"])