From 03fe760616a70e7255f944168265b52efed27c6e Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 4 Jan 2019 22:30:35 -0800 Subject: [PATCH] [rllib] Model self loss isn't included in all algorithms (#3679) --- doc/source/rllib-offline.rst | 4 +-- doc/source/rllib.rst | 12 ++++---- .../rllib/agents/a3c/a3c_tf_policy_graph.py | 2 +- .../rllib/agents/ddpg/ddpg_policy_graph.py | 30 +++++++++++-------- .../ray/rllib/agents/dqn/dqn_policy_graph.py | 2 +- .../agents/impala/vtrace_policy_graph.py | 2 +- .../rllib/tuned_examples/pendulum-ddpg.yaml | 2 +- 7 files changed, 30 insertions(+), 24 deletions(-) diff --git a/doc/source/rllib-offline.rst b/doc/source/rllib-offline.rst index 00a6f08b7..4742d9b15 100644 --- a/doc/source/rllib-offline.rst +++ b/doc/source/rllib-offline.rst @@ -1,5 +1,5 @@ -RLlib Offline Data Input / Output -================================= +RLlib Offline Datasets +====================== Working with Offline Datasets ----------------------------- diff --git a/doc/source/rllib.rst b/doc/source/rllib.rst index 2e4c249fe..1bd20ad29 100644 --- a/doc/source/rllib.rst +++ b/doc/source/rllib.rst @@ -83,22 +83,22 @@ Models and Preprocessors * `Variable-length / Parametric Action Spaces `__ * `Model-Based Rollouts `__ -Offline Data Input / Output ---------------------------- +Offline Datasets +---------------- * `Working with Offline Datasets `__ * `Input API `__ * `Output API `__ -RLlib Development ------------------ +Development +----------- * `Development Install `__ * `Features `__ * `Benchmarks `__ * `Contributing Algorithms `__ -RLlib Concepts --------------- +Concepts +-------- * `Policy Graphs `__ * `Policy Evaluation `__ * `Policy Optimization `__ diff --git a/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py b/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py index 50258f58a..90ba87dae 100644 --- a/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py +++ b/python/ray/rllib/agents/a3c/a3c_tf_policy_graph.py @@ -142,7 +142,7 @@ class A3CPolicyGraph(LearningRateSchedule, TFPolicyGraph): @override(TFPolicyGraph) def gradients(self, optimizer): - grads = tf.gradients(self.loss.total_loss, self.var_list) + grads = tf.gradients(self._loss, self.var_list) self.grads, _ = tf.clip_by_global_norm(grads, self.config["grad_clip"]) clipped_grads = list(zip(self.grads, self.var_list)) return clipped_grads diff --git a/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py b/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py index b8b625734..ca3fae559 100644 --- a/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py +++ b/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py @@ -40,6 +40,7 @@ class PNetwork(object): # shape of action_scores is [batch_size, dim_actions] self.action_scores = layers.fully_connected( action_out, num_outputs=dim_actions, activation_fn=tf.nn.sigmoid) + self.model = model class ActionNetwork(object): @@ -177,8 +178,6 @@ class ActorCriticLoss(object): self.actor_loss = (-1.0 * actor_loss_coeff * policy_delay_mask * tf.reduce_mean(q_tp0)) - self.total_loss = self.actor_loss + self.critic_loss - class DDPGPolicyGraph(TFPolicyGraph): def __init__(self, observation_space, action_space, config): @@ -207,8 +206,8 @@ class DDPGPolicyGraph(TFPolicyGraph): # Actor: P (policy) network with tf.variable_scope(P_SCOPE) as scope: - p_values = self._build_p_network(self.cur_observations, - observation_space) + p_values, self.p_model = self._build_p_network( + self.cur_observations, observation_space) self.p_func_vars = _scope_vars(scope.name) # Action outputs @@ -241,14 +240,14 @@ class DDPGPolicyGraph(TFPolicyGraph): # p network evaluation with tf.variable_scope(P_SCOPE, reuse=True) as scope: prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - self.p_t = self._build_p_network(self.obs_t, observation_space) + self.p_t, _ = self._build_p_network(self.obs_t, observation_space) p_batchnorm_update_ops = list( set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops) # target p network evaluation with tf.variable_scope(P_TARGET_SCOPE) as scope: - p_tp1 = self._build_p_network(self.obs_tp1, observation_space) + p_tp1, _ = self._build_p_network(self.obs_tp1, observation_space) target_p_func_vars = _scope_vars(scope.name) # Action outputs @@ -267,15 +266,15 @@ class DDPGPolicyGraph(TFPolicyGraph): # q network evaluation prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) with tf.variable_scope(Q_SCOPE) as scope: - q_t, model = self._build_q_network(self.obs_t, observation_space, - self.act_t) + q_t, self.q_model = self._build_q_network( + self.obs_t, observation_space, self.act_t) self.q_func_vars = _scope_vars(scope.name) with tf.variable_scope(Q_SCOPE, reuse=True): q_tp0, _ = self._build_q_network(self.obs_t, observation_space, output_actions) if self.config["twin_q"]: with tf.variable_scope(TWIN_Q_SCOPE) as scope: - twin_q_t, twin_model = self._build_q_network( + twin_q_t, self.twin_q_model = self._build_q_network( self.obs_t, observation_space, self.act_t) self.twin_q_func_vars = _scope_vars(scope.name) q_batchnorm_update_ops = list( @@ -313,6 +312,12 @@ class DDPGPolicyGraph(TFPolicyGraph): self.loss.critic_loss += ( config["l2_reg"] * 0.5 * tf.nn.l2_loss(var)) + # Model self-supervised losses + self.loss.actor_loss += self.p_model.loss() + self.loss.critic_loss += self.q_model.loss() + if self.config["twin_q"]: + self.loss.critic_loss += self.twin_q_model.loss() + # update_target_fn will be called periodically to copy Q network to # target Q network self.tau_value = config.get("tau") @@ -355,7 +360,7 @@ class DDPGPolicyGraph(TFPolicyGraph): self.sess, obs_input=self.cur_observations, action_sampler=self.output_actions, - loss=model.loss() + self.loss.total_loss, + loss=self.loss.actor_loss + self.loss.critic_loss, loss_inputs=self.loss_inputs, update_ops=q_batchnorm_update_ops + p_batchnorm_update_ops) self.sess.run(tf.global_variables_initializer()) @@ -448,13 +453,14 @@ class DDPGPolicyGraph(TFPolicyGraph): return q_net.value, q_net.model def _build_p_network(self, obs, obs_space): - return PNetwork( + policy_net = PNetwork( ModelCatalog.get_model({ "obs": obs, "is_training": self._get_is_training_placeholder(), }, obs_space, 1, self.config["model"]), self.dim_actions, self.config["actor_hiddens"], - self.config["actor_hidden_activation"]).action_scores + self.config["actor_hidden_activation"]) + return policy_net.action_scores, policy_net.model def _build_action_network(self, p_values, stochastic, eps, is_target=False): diff --git a/python/ray/rllib/agents/dqn/dqn_policy_graph.py b/python/ray/rllib/agents/dqn/dqn_policy_graph.py index 625e577ff..f9e15299a 100644 --- a/python/ray/rllib/agents/dqn/dqn_policy_graph.py +++ b/python/ray/rllib/agents/dqn/dqn_policy_graph.py @@ -403,7 +403,7 @@ class DQNPolicyGraph(TFPolicyGraph): if self.config["grad_norm_clipping"] is not None: grads_and_vars = _minimize_and_clip( optimizer, - self.loss.loss, + self._loss, var_list=self.q_func_vars, clip_val=self.config["grad_norm_clipping"]) else: diff --git a/python/ray/rllib/agents/impala/vtrace_policy_graph.py b/python/ray/rllib/agents/impala/vtrace_policy_graph.py index 12c0c30fb..d18dade5c 100644 --- a/python/ray/rllib/agents/impala/vtrace_policy_graph.py +++ b/python/ray/rllib/agents/impala/vtrace_policy_graph.py @@ -263,7 +263,7 @@ class VTracePolicyGraph(LearningRateSchedule, TFPolicyGraph): @override(TFPolicyGraph) def gradients(self, optimizer): - grads = tf.gradients(self.loss.total_loss, self.var_list) + grads = tf.gradients(self._loss, self.var_list) self.grads, _ = tf.clip_by_global_norm(grads, self.config["grad_clip"]) clipped_grads = list(zip(self.grads, self.var_list)) return clipped_grads diff --git a/python/ray/rllib/tuned_examples/pendulum-ddpg.yaml b/python/ray/rllib/tuned_examples/pendulum-ddpg.yaml index e28eee3e8..38b93ea72 100644 --- a/python/ray/rllib/tuned_examples/pendulum-ddpg.yaml +++ b/python/ray/rllib/tuned_examples/pendulum-ddpg.yaml @@ -4,7 +4,7 @@ pendulum-ddpg: run: DDPG stop: episode_reward_mean: -160 - time_total_s: 600 # 10 minutes + timesteps_total: 100000 config: # === Model === actor_hiddens: [64, 64]