[rllib] dqn/ddpg policy customization (#2445)

* dqn policy update - more customization
* docs for custom DQN graph
* Update rllib-training.rst
* Update rllib-models.rst
* Update rllib.rst
* Update rllib-training.rst
* Update rllib-concepts.rst
* yapf codestyle
2026-06-28 03:18:59 +08:00 · 2018-07-23 00:47:14 +03:00
parent 68660453e4
commit 05490b8cb9
5 changed files with 116 additions and 56 deletions
@@ -133,31 +133,14 @@ class DDPGPolicyGraph(TFPolicyGraph):

        self.config = config
        self.cur_epsilon = 1.0
-        dim_actions = action_space.shape[0]
-        low_action = action_space.low
-        high_action = action_space.high
+        self.dim_actions = action_space.shape[0]
+        self.low_action = action_space.low
+        self.high_action = action_space.high
        self.actor_optimizer = tf.train.AdamOptimizer(
            learning_rate=config["actor_lr"])
        self.critic_optimizer = tf.train.AdamOptimizer(
            learning_rate=config["critic_lr"])

-        def _build_q_network(obs, actions):
-            return QNetwork(
-                ModelCatalog.get_model(obs, 1, config["model"]), actions,
-                config["critic_hiddens"],
-                config["critic_hidden_activation"]).value
-
-        def _build_p_network(obs):
-            return PNetwork(
-                ModelCatalog.get_model(obs, 1, config["model"]), dim_actions,
-                config["actor_hiddens"],
-                config["actor_hidden_activation"]).action_scores
-
-        def _build_action_network(p_values, stochastic, eps):
-            return ActionNetwork(p_values, low_action, high_action, stochastic,
-                                 eps, config["exploration_theta"],
-                                 config["exploration_sigma"]).actions
-
        # Action inputs
        self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
        self.eps = tf.placeholder(tf.float32, (), name="eps")
@@ -166,18 +149,18 @@ class DDPGPolicyGraph(TFPolicyGraph):

        # Actor: P (policy) network
        with tf.variable_scope(P_SCOPE) as scope:
-            p_values = _build_p_network(self.cur_observations)
+            p_values = self._build_p_network(self.cur_observations)
            self.p_func_vars = _scope_vars(scope.name)

        # Action outputs
        with tf.variable_scope(A_SCOPE):
-            self.output_actions = _build_action_network(
+            self.output_actions = self._build_action_network(
                p_values, self.stochastic, self.eps)

        with tf.variable_scope(A_SCOPE, reuse=True):
            exploration_sample = tf.get_variable(name="ornstein_uhlenbeck")
            self.reset_noise_op = tf.assign(exploration_sample,
-                                            dim_actions * [.0])
+                                            self.dim_actions * [.0])

        # Replay inputs
        self.obs_t = tf.placeholder(
@@ -195,39 +178,37 @@ class DDPGPolicyGraph(TFPolicyGraph):

        # p network evaluation
        with tf.variable_scope(P_SCOPE, reuse=True) as scope:
-            self.p_t = _build_p_network(self.obs_t)
+            self.p_t = self._build_p_network(self.obs_t)

        # target p network evaluation
        with tf.variable_scope(P_TARGET_SCOPE) as scope:
-            p_tp1 = _build_p_network(self.obs_tp1)
+            p_tp1 = self._build_p_network(self.obs_tp1)
            target_p_func_vars = _scope_vars(scope.name)

        # Action outputs
        with tf.variable_scope(A_SCOPE, reuse=True):
            deterministic_flag = tf.constant(value=False, dtype=tf.bool)
            zero_eps = tf.constant(value=.0, dtype=tf.float32)
-            output_actions = _build_action_network(
+            output_actions = self._build_action_network(
                self.p_t, deterministic_flag, zero_eps)

-            output_actions_estimated = _build_action_network(
+            output_actions_estimated = self._build_action_network(
                p_tp1, deterministic_flag, zero_eps)

        # q network evaluation
        with tf.variable_scope(Q_SCOPE) as scope:
-            q_t = _build_q_network(self.obs_t, self.act_t)
+            q_t = self._build_q_network(self.obs_t, self.act_t)
            self.q_func_vars = _scope_vars(scope.name)
        with tf.variable_scope(Q_SCOPE, reuse=True):
-            q_tp0 = _build_q_network(self.obs_t, output_actions)
+            q_tp0 = self._build_q_network(self.obs_t, output_actions)

        # target q network evaluation
        with tf.variable_scope(Q_TARGET_SCOPE) as scope:
-            q_tp1 = _build_q_network(self.obs_tp1, output_actions_estimated)
+            q_tp1 = self._build_q_network(self.obs_tp1,
+                                          output_actions_estimated)
            target_q_func_vars = _scope_vars(scope.name)

-        self.loss = ActorCriticLoss(
-            q_t, q_tp1, q_tp0, self.importance_weights, self.rew_t,
-            self.done_mask, config["gamma"], config["n_step"],
-            config["use_huber"], config["huber_threshold"])
+        self.loss = self._build_actor_critic_loss(q_t, q_tp1, q_tp0)

        if config["l2_reg"] is not None:
            for var in self.p_func_vars:
@@ -286,6 +267,29 @@ class DDPGPolicyGraph(TFPolicyGraph):
        # Hard initial update
        self.update_target(tau=1.0)

+    def _build_q_network(self, obs, actions):
+        return QNetwork(
+            ModelCatalog.get_model(obs, 1, self.config["model"]), actions,
+            self.config["critic_hiddens"],
+            self.config["critic_hidden_activation"]).value
+
+    def _build_p_network(self, obs):
+        return PNetwork(
+            ModelCatalog.get_model(obs, 1, self.config["model"]),
+            self.dim_actions, self.config["actor_hiddens"],
+            self.config["actor_hidden_activation"]).action_scores
+
+    def _build_action_network(self, p_values, stochastic, eps):
+        return ActionNetwork(p_values, self.low_action, self.high_action,
+                             stochastic, eps, self.config["exploration_theta"],
+                             self.config["exploration_sigma"]).actions
+
+    def _build_actor_critic_loss(self, q_t, q_tp1, q_tp0):
+        return ActorCriticLoss(
+            q_t, q_tp1, q_tp0, self.importance_weights, self.rew_t,
+            self.done_mask, self.config["gamma"], self.config["n_step"],
+            self.config["use_huber"], self.config["huber_threshold"])
+
    def gradients(self, optimizer):
        if self.config["grad_norm_clipping"] is not None:
            actor_grads_and_vars = _minimize_and_clip(
@@ -71,7 +71,6 @@ class QLoss(object):
                 done_mask,
                 gamma=0.99,
                 n_step=1):
-
        q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best

        # compute RHS of bellman equation
@@ -93,12 +92,7 @@ class DQNPolicyGraph(TFPolicyGraph):

        self.config = config
        self.cur_epsilon = 1.0
-        num_actions = action_space.n
-
-        def _build_q_network(obs):
-            return QNetwork(
-                ModelCatalog.get_model(obs, 1, config["model"]), num_actions,
-                config["dueling"], config["hiddens"]).value
+        self.num_actions = action_space.n

        # Action inputs
        self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
@@ -108,13 +102,11 @@ class DQNPolicyGraph(TFPolicyGraph):

        # Action Q network
        with tf.variable_scope(Q_SCOPE) as scope:
-            q_values = _build_q_network(self.cur_observations)
+            q_values = self._build_q_network(self.cur_observations)
            self.q_func_vars = _scope_vars(scope.name)

        # Action outputs
-        self.output_actions = QValuePolicy(q_values, self.cur_observations,
-                                           num_actions, self.stochastic,
-                                           self.eps).action
+        self.output_actions = self._build_q_value_policy(q_values)

        # Replay inputs
        self.obs_t = tf.placeholder(
@@ -129,31 +121,29 @@ class DQNPolicyGraph(TFPolicyGraph):

        # q network evaluation
        with tf.variable_scope(Q_SCOPE, reuse=True):
-            q_t = _build_q_network(self.obs_t)
+            q_t = self._build_q_network(self.obs_t)

        # target q network evaluation
        with tf.variable_scope(Q_TARGET_SCOPE) as scope:
-            q_tp1 = _build_q_network(self.obs_tp1)
+            q_tp1 = self._build_q_network(self.obs_tp1)
            self.target_q_func_vars = _scope_vars(scope.name)

        # q scores for actions which we know were selected in the given state.
-        q_t_selected = tf.reduce_sum(q_t * tf.one_hot(self.act_t, num_actions),
-                                     1)
+        q_t_selected = tf.reduce_sum(
+            q_t * tf.one_hot(self.act_t, self.num_actions), 1)

        # compute estimate of best possible value starting from state at t + 1
        if config["double_q"]:
            with tf.variable_scope(Q_SCOPE, reuse=True):
-                q_tp1_using_online_net = _build_q_network(self.obs_tp1)
+                q_tp1_using_online_net = self._build_q_network(self.obs_tp1)
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(
-                q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions),
-                1)
+                q_tp1 * tf.one_hot(q_tp1_best_using_online_net,
+                                   self.num_actions), 1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)

-        self.loss = QLoss(q_t_selected, q_tp1_best, self.importance_weights,
-                          self.rew_t, self.done_mask, config["gamma"],
-                          config["n_step"])
+        self.loss = self._build_q_loss(q_t_selected, q_tp1_best)

        # update_target_fn will be called periodically to copy Q network to
        # target Q network
@@ -185,6 +175,21 @@ class DQNPolicyGraph(TFPolicyGraph):
            loss_inputs=self.loss_inputs)
        self.sess.run(tf.global_variables_initializer())

+    def _build_q_network(self, obs):
+        return QNetwork(
+            ModelCatalog.get_model(obs, 1,
+                                   self.config["model"]), self.num_actions,
+            self.config["dueling"], self.config["hiddens"]).value
+
+    def _build_q_value_policy(self, q_values):
+        return QValuePolicy(q_values, self.cur_observations, self.num_actions,
+                            self.stochastic, self.eps).action
+
+    def _build_q_loss(self, q_t_selected, q_tp1_best):
+        return QLoss(q_t_selected, q_tp1_best, self.importance_weights,
+                     self.rew_t, self.done_mask, self.config["gamma"],
+                     self.config["n_step"])
+
    def optimizer(self):
        return tf.train.AdamOptimizer(learning_rate=self.config["lr"])