From 03fe760616a70e7255f944168265b52efed27c6e Mon Sep 17 00:00:00 2001
From: Eric Liang <ekhliang@gmail.com>
Date: Fri, 4 Jan 2019 22:30:35 -0800
Subject: [PATCH] [rllib] Model self loss isn't included in all algorithms
 (#3679)

---
 doc/source/rllib-offline.rst                  |  4 +--
 doc/source/rllib.rst                          | 12 ++++----
 .../rllib/agents/a3c/a3c_tf_policy_graph.py   |  2 +-
 .../rllib/agents/ddpg/ddpg_policy_graph.py    | 30 +++++++++++--------
 .../ray/rllib/agents/dqn/dqn_policy_graph.py  |  2 +-
 .../agents/impala/vtrace_policy_graph.py      |  2 +-
 .../rllib/tuned_examples/pendulum-ddpg.yaml   |  2 +-
 7 files changed, 30 insertions(+), 24 deletions(-)

diff --git a/doc/source/rllib-offline.rst b/doc/source/rllib-offline.rst
index 00a6f08b7..4742d9b15 100644
--- a/doc/source/rllib-offline.rst
+++ b/doc/source/rllib-offline.rst
@@ -1,5 +1,5 @@
-RLlib Offline Data Input / Output
-=================================
+RLlib Offline Datasets
+======================
 
 Working with Offline Datasets
 -----------------------------
diff --git a/doc/source/rllib.rst b/doc/source/rllib.rst
index 2e4c249fe..1bd20ad29 100644
--- a/doc/source/rllib.rst
+++ b/doc/source/rllib.rst
@@ -83,22 +83,22 @@ Models and Preprocessors
 * `Variable-length / Parametric Action Spaces <rllib-models.html#variable-length-parametric-action-spaces>`__
 * `Model-Based Rollouts <rllib-models.html#model-based-rollouts>`__
 
-Offline Data Input / Output
----------------------------
+Offline Datasets
+----------------
 * `Working with Offline Datasets <rllib-offline.html>`__
 * `Input API <rllib-offline.html#input-api>`__
 * `Output API <rllib-offline.html#output-api>`__
 
-RLlib Development
------------------
+Development
+-----------
 
 * `Development Install <rllib-dev.html#development-install>`__
 * `Features <rllib-dev.html#feature-development>`__
 * `Benchmarks <rllib-dev.html#benchmarks>`__
 * `Contributing Algorithms <rllib-dev.html#contributing-algorithms>`__
 
-RLlib Concepts
---------------
+Concepts
+--------
 * `Policy Graphs <rllib-concepts.html>`__
 * `Policy Evaluation <rllib-concepts.html#policy-evaluation>`__
 * `Policy Optimization <rllib-concepts.html#policy-optimization>`__
diff --git a/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py b/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py
index 50258f58a..90ba87dae 100644
--- a/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py
+++ b/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py
@@ -142,7 +142,7 @@ class A3CPolicyGraph(LearningRateSchedule, TFPolicyGraph):
 
     @override(TFPolicyGraph)
     def gradients(self, optimizer):
-        grads = tf.gradients(self.loss.total_loss, self.var_list)
+        grads = tf.gradients(self._loss, self.var_list)
         self.grads, _ = tf.clip_by_global_norm(grads, self.config["grad_clip"])
         clipped_grads = list(zip(self.grads, self.var_list))
         return clipped_grads
diff --git a/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py b/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py
index b8b625734..ca3fae559 100644
--- a/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py
+++ b/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py
@@ -40,6 +40,7 @@ class PNetwork(object):
         # shape of action_scores is [batch_size, dim_actions]
         self.action_scores = layers.fully_connected(
             action_out, num_outputs=dim_actions, activation_fn=tf.nn.sigmoid)
+        self.model = model
 
 
 class ActionNetwork(object):
@@ -177,8 +178,6 @@ class ActorCriticLoss(object):
         self.actor_loss = (-1.0 * actor_loss_coeff * policy_delay_mask *
                            tf.reduce_mean(q_tp0))
 
-        self.total_loss = self.actor_loss + self.critic_loss
-
 
 class DDPGPolicyGraph(TFPolicyGraph):
     def __init__(self, observation_space, action_space, config):
@@ -207,8 +206,8 @@ class DDPGPolicyGraph(TFPolicyGraph):
 
         # Actor: P (policy) network
         with tf.variable_scope(P_SCOPE) as scope:
-            p_values = self._build_p_network(self.cur_observations,
-                                             observation_space)
+            p_values, self.p_model = self._build_p_network(
+                self.cur_observations, observation_space)
             self.p_func_vars = _scope_vars(scope.name)
 
         # Action outputs
@@ -241,14 +240,14 @@ class DDPGPolicyGraph(TFPolicyGraph):
         # p network evaluation
         with tf.variable_scope(P_SCOPE, reuse=True) as scope:
             prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
-            self.p_t = self._build_p_network(self.obs_t, observation_space)
+            self.p_t, _ = self._build_p_network(self.obs_t, observation_space)
             p_batchnorm_update_ops = list(
                 set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) -
                 prev_update_ops)
 
         # target p network evaluation
         with tf.variable_scope(P_TARGET_SCOPE) as scope:
-            p_tp1 = self._build_p_network(self.obs_tp1, observation_space)
+            p_tp1, _ = self._build_p_network(self.obs_tp1, observation_space)
             target_p_func_vars = _scope_vars(scope.name)
 
         # Action outputs
@@ -267,15 +266,15 @@ class DDPGPolicyGraph(TFPolicyGraph):
         # q network evaluation
         prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
         with tf.variable_scope(Q_SCOPE) as scope:
-            q_t, model = self._build_q_network(self.obs_t, observation_space,
-                                               self.act_t)
+            q_t, self.q_model = self._build_q_network(
+                self.obs_t, observation_space, self.act_t)
             self.q_func_vars = _scope_vars(scope.name)
         with tf.variable_scope(Q_SCOPE, reuse=True):
             q_tp0, _ = self._build_q_network(self.obs_t, observation_space,
                                              output_actions)
         if self.config["twin_q"]:
             with tf.variable_scope(TWIN_Q_SCOPE) as scope:
-                twin_q_t, twin_model = self._build_q_network(
+                twin_q_t, self.twin_q_model = self._build_q_network(
                     self.obs_t, observation_space, self.act_t)
                 self.twin_q_func_vars = _scope_vars(scope.name)
         q_batchnorm_update_ops = list(
@@ -313,6 +312,12 @@ class DDPGPolicyGraph(TFPolicyGraph):
                         self.loss.critic_loss += (
                             config["l2_reg"] * 0.5 * tf.nn.l2_loss(var))
 
+        # Model self-supervised losses: P model -> actor, Q model(s) -> critic
+        self.loss.actor_loss += self.p_model.loss()
+        self.loss.critic_loss += self.q_model.loss()
+        if self.config["twin_q"]:
+            self.loss.critic_loss += self.twin_q_model.loss()
+
         # update_target_fn will be called periodically to copy Q network to
         # target Q network
         self.tau_value = config.get("tau")
@@ -355,7 +360,7 @@ class DDPGPolicyGraph(TFPolicyGraph):
             self.sess,
             obs_input=self.cur_observations,
             action_sampler=self.output_actions,
-            loss=model.loss() + self.loss.total_loss,
+            loss=self.loss.actor_loss + self.loss.critic_loss,
             loss_inputs=self.loss_inputs,
             update_ops=q_batchnorm_update_ops + p_batchnorm_update_ops)
         self.sess.run(tf.global_variables_initializer())
@@ -448,13 +453,14 @@ class DDPGPolicyGraph(TFPolicyGraph):
         return q_net.value, q_net.model
 
     def _build_p_network(self, obs, obs_space):
-        return PNetwork(
+        policy_net = PNetwork(
             ModelCatalog.get_model({
                 "obs": obs,
                 "is_training": self._get_is_training_placeholder(),
             }, obs_space, 1, self.config["model"]), self.dim_actions,
             self.config["actor_hiddens"],
-            self.config["actor_hidden_activation"]).action_scores
+            self.config["actor_hidden_activation"])
+        return policy_net.action_scores, policy_net.model
 
     def _build_action_network(self, p_values, stochastic, eps,
                               is_target=False):
diff --git a/python/ray/rllib/agents/dqn/dqn_policy_graph.py b/python/ray/rllib/agents/dqn/dqn_policy_graph.py
index 625e577ff..f9e15299a 100644
--- a/python/ray/rllib/agents/dqn/dqn_policy_graph.py
+++ b/python/ray/rllib/agents/dqn/dqn_policy_graph.py
@@ -403,7 +403,7 @@ class DQNPolicyGraph(TFPolicyGraph):
         if self.config["grad_norm_clipping"] is not None:
             grads_and_vars = _minimize_and_clip(
                 optimizer,
-                self.loss.loss,
+                self._loss,
                 var_list=self.q_func_vars,
                 clip_val=self.config["grad_norm_clipping"])
         else:
diff --git a/python/ray/rllib/agents/impala/vtrace_policy_graph.py b/python/ray/rllib/agents/impala/vtrace_policy_graph.py
index 12c0c30fb..d18dade5c 100644
--- a/python/ray/rllib/agents/impala/vtrace_policy_graph.py
+++ b/python/ray/rllib/agents/impala/vtrace_policy_graph.py
@@ -263,7 +263,7 @@ class VTracePolicyGraph(LearningRateSchedule, TFPolicyGraph):
 
     @override(TFPolicyGraph)
     def gradients(self, optimizer):
-        grads = tf.gradients(self.loss.total_loss, self.var_list)
+        grads = tf.gradients(self._loss, self.var_list)
         self.grads, _ = tf.clip_by_global_norm(grads, self.config["grad_clip"])
         clipped_grads = list(zip(self.grads, self.var_list))
         return clipped_grads
diff --git a/python/ray/rllib/tuned_examples/pendulum-ddpg.yaml b/python/ray/rllib/tuned_examples/pendulum-ddpg.yaml
index e28eee3e8..38b93ea72 100644
--- a/python/ray/rllib/tuned_examples/pendulum-ddpg.yaml
+++ b/python/ray/rllib/tuned_examples/pendulum-ddpg.yaml
@@ -4,7 +4,7 @@ pendulum-ddpg:
     run: DDPG
     stop:
         episode_reward_mean: -160
-        time_total_s: 600 # 10 minutes
+        timesteps_total: 100000
     config:
         # === Model ===
         actor_hiddens: [64, 64]