[rllib] Add noisy network and distributional Q-learning to implement Rainbow (#2737)

* add noisy network * distributional q-learning in dev * add distributional q-learning * validated rainbow module * add some comments * supply some comments * remove redundant argument to pass CI test * async replay optimizer does NOT need annealing beta * ignore rainbow specific arguments for DDPG and Apex * formatted by yapf * Update dqn_policy_graph.py * Update dqn_policy_graph.py
2026-06-28 19:49:04 +08:00 · 2018-08-25 14:17:14 -07:00
parent 6201a6d1c7
commit 982cde664f
4 changed files with 307 additions and 48 deletions
@@ -17,12 +17,24 @@ from ray.tune.trial import Resources

 OPTIMIZER_SHARED_CONFIGS = [
    "buffer_size", "prioritized_replay", "prioritized_replay_alpha",
-    "prioritized_replay_beta", "prioritized_replay_eps", "sample_batch_size",
-    "train_batch_size", "learning_starts"
+    "prioritized_replay_beta", "schedule_max_timesteps",
+    "beta_annealing_fraction", "final_prioritized_replay_beta",
+    "prioritized_replay_eps", "sample_batch_size", "train_batch_size",
+    "learning_starts"
 ]

 DEFAULT_CONFIG = with_common_config({
    # === Model ===
+    # Number of atoms for representing the distribution of return. When
+    # this is greater than 1, distributional Q-learning is used.
+    # The discrete support is bounded by v_min and v_max.
+    "num_atoms": 1,
+    "v_min": -10.0,
+    "v_max": 10.0,
+    # Whether to use noisy network
+    "noisy": False,
+    # Controls the initial value of the noise parameters in noisy nets
+    "sigma0": 0.5,
    # Whether to use dueling dqn
    "dueling": True,
    # Whether to use double dqn
@@ -59,6 +71,11 @@ DEFAULT_CONFIG = with_common_config({
    "prioritized_replay_alpha": 0.6,
    # Beta parameter for sampling from prioritized replay buffer.
    "prioritized_replay_beta": 0.4,
+    # Fraction of entire training period over which the beta parameter is
+    # annealed
+    "beta_annealing_fraction": 0.2,
+    # Final value of beta
+    "final_prioritized_replay_beta": 0.4,
    # Epsilon to add to the TD errors when updating priorities.
    "prioritized_replay_eps": 1e-6,
    # Whether to LZ4 compress observations
@@ -67,6 +84,8 @@ DEFAULT_CONFIG = with_common_config({
    # === Optimization ===
    # Learning rate for adam optimizer
    "lr": 5e-4,
+    # Adam epsilon hyper parameter
+    "adam_epsilon": 1e-8,
    # If not None, clip gradients during optimization at this value
    "grad_norm_clipping": 40,
    # How many steps of the model to sample before learning starts.
@@ -130,6 +149,12 @@ class DQNAgent(Agent):
        ]

        for k in OPTIMIZER_SHARED_CONFIGS:
+            if self._agent_name != "DQN" and k in [
+                    "schedule_max_timesteps", "beta_annealing_fraction",
+                    "final_prioritized_replay_beta"
+            ]:
+                # Only DQN (Rainbow) needs to anneal prioritized_replay_beta
+                continue
            if k not in self.config["optimizer"]:
                self.config["optimizer"][k] = self.config[k]

@@ -149,6 +174,7 @@ class DQNAgent(Agent):
        else:
            # Hack to workaround https://github.com/ray-project/ray/issues/2541
            self.remote_evaluators = None
+
        self.optimizer = getattr(optimizers, self.config["optimizer_class"])(
            self.local_evaluator, self.remote_evaluators,
            self.config["optimizer"])
@@ -18,32 +18,158 @@ Q_TARGET_SCOPE = "target_q_func"


 class QNetwork(object):
-    def __init__(self, model, num_actions, dueling=False, hiddens=[256]):
+    def __init__(self,
+                 model,
+                 num_actions,
+                 dueling=False,
+                 hiddens=[256],
+                 use_noisy=False,
+                 num_atoms=1,
+                 v_min=-10.0,
+                 v_max=10.0,
+                 sigma0=0.5):
        with tf.variable_scope("action_value"):
            action_out = model.last_layer
-            for hidden in hiddens:
-                action_out = layers.fully_connected(
-                    action_out, num_outputs=hidden, activation_fn=tf.nn.relu)
-            action_scores = layers.fully_connected(
-                action_out, num_outputs=num_actions, activation_fn=None)
+            for i in range(len(hiddens)):
+                if use_noisy:
+                    action_out = self.noisy_layer("hidden_%d" % i, action_out,
+                                                  hiddens[i], sigma0)
+                else:
+                    action_out = layers.fully_connected(
+                        action_out,
+                        num_outputs=hiddens[i],
+                        activation_fn=tf.nn.relu)
+            if use_noisy:
+                action_scores = self.noisy_layer(
+                    "output",
+                    action_out,
+                    num_actions * num_atoms,
+                    sigma0,
+                    non_linear=False)
+            else:
+                action_scores = layers.fully_connected(
+                    action_out,
+                    num_outputs=num_actions * num_atoms,
+                    activation_fn=None)
+            if num_atoms > 1:
+                # Distributional Q-learning uses a discrete support z
+                # to represent the action value distribution
+                z = tf.range(num_atoms, dtype=tf.float32)
+                z = v_min + z * (v_max - v_min) / float(num_atoms - 1)
+                support_logits_per_action = tf.reshape(
+                    tensor=action_scores, shape=(-1, num_actions, num_atoms))
+                support_prob_per_action = tf.nn.softmax(
+                    logits=support_logits_per_action)
+                action_scores = tf.reduce_sum(
+                    input_tensor=z * support_prob_per_action, axis=-1)
+                self.logits = support_logits_per_action
+                self.dist = support_prob_per_action
+            else:
+                self.logits = tf.expand_dims(tf.ones_like(action_scores), -1)
+                self.dist = tf.expand_dims(tf.ones_like(action_scores), -1)

        if dueling:
            with tf.variable_scope("state_value"):
                state_out = model.last_layer
-                for hidden in hiddens:
-                    state_out = layers.fully_connected(
+                for i in range(len(hiddens)):
+                    if use_noisy:
+                        state_out = self.noisy_layer("dueling_hidden_%d" % i,
+                                                     state_out, hiddens[i],
+                                                     sigma0)
+                    else:
+                        state_out = layers.fully_connected(
+                            state_out,
+                            num_outputs=hiddens[i],
+                            activation_fn=tf.nn.relu)
+                if use_noisy:
+                    state_score = self.noisy_layer(
+                        "dueling_output",
                        state_out,
-                        num_outputs=hidden,
-                        activation_fn=tf.nn.relu)
-                state_score = layers.fully_connected(
-                    state_out, num_outputs=1, activation_fn=None)
-            action_scores_mean = tf.reduce_mean(action_scores, 1)
-            action_scores_centered = action_scores - tf.expand_dims(
-                action_scores_mean, 1)
-            self.value = state_score + action_scores_centered
+                        num_atoms,
+                        sigma0,
+                        non_linear=False)
+                else:
+                    state_score = layers.fully_connected(
+                        state_out, num_outputs=num_atoms, activation_fn=None)
+            if num_atoms > 1:
+                support_logits_per_action_mean = tf.reduce_mean(
+                    support_logits_per_action, 1)
+                support_logits_per_action_centered = (
+                    support_logits_per_action - tf.expand_dims(
+                        support_logits_per_action_mean, 1))
+                support_logits_per_action = tf.expand_dims(
+                    state_score, 1) + support_logits_per_action_centered
+                support_prob_per_action = tf.nn.softmax(
+                    logits=support_logits_per_action)
+                self.value = tf.reduce_sum(
+                    input_tensor=z * support_prob_per_action, axis=-1)
+                self.logits = support_logits_per_action
+                self.dist = support_prob_per_action
+            else:
+                action_scores_mean = tf.reduce_mean(action_scores, 1)
+                action_scores_centered = action_scores - tf.expand_dims(
+                    action_scores_mean, 1)
+                self.value = state_score + action_scores_centered
        else:
            self.value = action_scores

+    def f_epsilon(self, x):
+        return tf.sign(x) * tf.sqrt(tf.abs(x))
+
+    def noisy_layer(self, prefix, action_in, out_size, sigma0,
+                    non_linear=True):
+        """
+        A common dense layer: y = w^{T}x + b
+        A noisy layer:
+            y = (w + epsilon_w * sigma_w)^{T}x + (b + epsilon_b * sigma_b)
+        where the epsilon terms are random variables sampled from factorized
+        normal distributions and the sigma terms are trainable variables
+        that are expected to vanish over the course of training.
+        """
+        in_size = int(action_in.shape[1])
+
+        epsilon_in = tf.random_normal(shape=[in_size])
+        epsilon_out = tf.random_normal(shape=[out_size])
+        epsilon_in = self.f_epsilon(epsilon_in)
+        epsilon_out = self.f_epsilon(epsilon_out)
+        epsilon_w = tf.matmul(
+            a=tf.expand_dims(epsilon_in, -1), b=tf.expand_dims(epsilon_out, 0))
+        epsilon_b = epsilon_out
+        sigma_w = tf.get_variable(
+            name=prefix + "_sigma_w",
+            shape=[in_size, out_size],
+            dtype=tf.float32,
+            initializer=tf.random_uniform_initializer(
+                minval=-1.0 / np.sqrt(float(in_size)),
+                maxval=1.0 / np.sqrt(float(in_size))))
+        # TF noise generation can be unreliable on GPU
+        # If generating the noise on the CPU,
+        # lowering sigma0 to 0.1 may be helpful
+        sigma_b = tf.get_variable(
+            name=prefix + "_sigma_b",
+            shape=[out_size],
+            dtype=tf.float32,  # 0.5~GPU, 0.1~CPU
+            initializer=tf.constant_initializer(
+                sigma0 / np.sqrt(float(in_size))))
+
+        w = tf.get_variable(
+            name=prefix + "_fc_w",
+            shape=[in_size, out_size],
+            dtype=tf.float32,
+            initializer=layers.xavier_initializer())
+        b = tf.get_variable(
+            name=prefix + "_fc_b",
+            shape=[out_size],
+            dtype=tf.float32,
+            initializer=tf.zeros_initializer())
+
+        action_activation = tf.nn.xw_plus_b(action_in, w + sigma_w * epsilon_w,
+                                            b + sigma_b * epsilon_b)
+
+        if not non_linear:
+            return action_activation
+        return tf.nn.relu(action_activation)
+

 class QValuePolicy(object):
    def __init__(self, q_values, observations, num_actions, stochastic, eps):
@@ -65,21 +191,67 @@ class QValuePolicy(object):
 class QLoss(object):
    def __init__(self,
                 q_t_selected,
+                 q_logits_t_selected,
                 q_tp1_best,
+                 q_dist_tp1_best,
                 importance_weights,
                 rewards,
                 done_mask,
                 gamma=0.99,
-                 n_step=1):
-        q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best
+                 n_step=1,
+                 num_atoms=1,
+                 v_min=-10.0,
+                 v_max=10.0):

-        # compute RHS of bellman equation
-        q_t_selected_target = rewards + gamma**n_step * q_tp1_best_masked
+        if num_atoms > 1:
+            # Distributional Q-learning, which uses a cross-entropy loss

-        # compute the error (potentially clipped)
-        self.td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
-        self.loss = tf.reduce_mean(
-            importance_weights * _huber_loss(self.td_error))
+            z = tf.range(num_atoms, dtype=tf.float32)
+            z = v_min + z * (v_max - v_min) / float(num_atoms - 1)
+
+            # (batch_size, 1) * (1, num_atoms) = (batch_size, num_atoms)
+            r_tau = tf.expand_dims(
+                rewards, -1) + gamma**n_step * tf.expand_dims(
+                    1.0 - done_mask, -1) * tf.expand_dims(z, 0)
+            r_tau = tf.clip_by_value(r_tau, v_min, v_max)
+            b = (r_tau - v_min) / ((v_max - v_min) / float(num_atoms - 1))
+            lb = tf.floor(b)
+            ub = tf.ceil(b)
+            # Indispensable check that is missing from most implementations:
+            # when b happens to be an integer, lb == ub, so pr_j(s', a*) would
+            # be discarded because (ub - b) == (b - lb) == 0
+            floor_equal_ceil = tf.to_float(tf.less(ub - lb, 0.5))
+
+            l_project = tf.one_hot(
+                tf.cast(lb, dtype=tf.int32),
+                num_atoms)  # (batch_size, num_atoms, num_atoms)
+            u_project = tf.one_hot(
+                tf.cast(ub, dtype=tf.int32),
+                num_atoms)  # (batch_size, num_atoms, num_atoms)
+            ml_delta = q_dist_tp1_best * (ub - b + floor_equal_ceil)
+            mu_delta = q_dist_tp1_best * (b - lb)
+            ml_delta = tf.reduce_sum(
+                l_project * tf.expand_dims(ml_delta, -1), axis=1)
+            mu_delta = tf.reduce_sum(
+                u_project * tf.expand_dims(mu_delta, -1), axis=1)
+            m = ml_delta + mu_delta
+
+            # Rainbow paper claims that using this cross entropy loss for
+            # priority is robust and insensitive to `prioritized_replay_alpha`
+            self.td_error = tf.nn.softmax_cross_entropy_with_logits(
+                labels=m, logits=q_logits_t_selected)
+            self.loss = tf.reduce_mean(self.td_error * importance_weights)
+        else:
+            q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best
+
+            # compute RHS of bellman equation
+            q_t_selected_target = rewards + gamma**n_step * q_tp1_best_masked
+
+            # compute the error (potentially clipped)
+            self.td_error = (
+                q_t_selected - tf.stop_gradient(q_t_selected_target))
+            self.loss = tf.reduce_mean(
+                importance_weights * _huber_loss(self.td_error))


 class DQNPolicyGraph(TFPolicyGraph):
@@ -102,7 +274,8 @@ class DQNPolicyGraph(TFPolicyGraph):

        # Action Q network
        with tf.variable_scope(Q_SCOPE) as scope:
-            q_values = self._build_q_network(self.cur_observations)
+            q_values, q_logits, q_dist = self._build_q_network(
+                self.cur_observations)
            self.q_func_vars = _scope_vars(scope.name)

        # Action outputs
@@ -121,29 +294,43 @@ class DQNPolicyGraph(TFPolicyGraph):

        # q network evaluation
        with tf.variable_scope(Q_SCOPE, reuse=True):
-            q_t = self._build_q_network(self.obs_t)
+            q_t, q_logits_t, q_dist_t = self._build_q_network(self.obs_t)

        # target q network evaluation
        with tf.variable_scope(Q_TARGET_SCOPE) as scope:
-            q_tp1 = self._build_q_network(self.obs_tp1)
+            q_tp1, q_logits_tp1, q_dist_tp1 = self._build_q_network(
+                self.obs_tp1)
            self.target_q_func_vars = _scope_vars(scope.name)

        # q scores for actions which we know were selected in the given state.
-        q_t_selected = tf.reduce_sum(
-            q_t * tf.one_hot(self.act_t, self.num_actions), 1)
+        one_hot_selection = tf.one_hot(self.act_t, self.num_actions)
+        q_t_selected = tf.reduce_sum(q_t * one_hot_selection, 1)
+        q_logits_t_selected = tf.reduce_sum(
+            q_logits_t * tf.expand_dims(one_hot_selection, -1), 1)

        # compute estimate of best possible value starting from state at t + 1
        if config["double_q"]:
            with tf.variable_scope(Q_SCOPE, reuse=True):
-                q_tp1_using_online_net = self._build_q_network(self.obs_tp1)
+                q_tp1_using_online_net, q_logits_tp1_using_online_net, \
+                    q_dist_tp1_using_online_net = self._build_q_network(
+                        self.obs_tp1)
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
-            q_tp1_best = tf.reduce_sum(
-                q_tp1 * tf.one_hot(q_tp1_best_using_online_net,
-                                   self.num_actions), 1)
+            q_tp1_best_one_hot_selection = tf.one_hot(
+                q_tp1_best_using_online_net, self.num_actions)
+            q_tp1_best = tf.reduce_sum(q_tp1 * q_tp1_best_one_hot_selection, 1)
+            q_dist_tp1_best = tf.reduce_sum(
+                q_dist_tp1 * tf.expand_dims(q_tp1_best_one_hot_selection, -1),
+                1)
        else:
-            q_tp1_best = tf.reduce_max(q_tp1, 1)
+            q_tp1_best_one_hot_selection = tf.one_hot(
+                tf.argmax(q_tp1, 1), self.num_actions)
+            q_tp1_best = tf.reduce_sum(q_tp1 * q_tp1_best_one_hot_selection, 1)
+            q_dist_tp1_best = tf.reduce_sum(
+                q_dist_tp1 * tf.expand_dims(q_tp1_best_one_hot_selection, -1),
+                1)

-        self.loss = self._build_q_loss(q_t_selected, q_tp1_best)
+        self.loss = self._build_q_loss(q_t_selected, q_logits_t_selected,
+                                       q_tp1_best, q_dist_tp1_best)

        # update_target_fn will be called periodically to copy Q network to
        # target Q network
@@ -176,22 +363,29 @@ class DQNPolicyGraph(TFPolicyGraph):
        self.sess.run(tf.global_variables_initializer())

    def _build_q_network(self, obs):
-        return QNetwork(
-            ModelCatalog.get_model(obs, 1,
-                                   self.config["model"]), self.num_actions,
-            self.config["dueling"], self.config["hiddens"]).value
+        qnet = QNetwork(
+            ModelCatalog.get_model(obs, 1, self.config["model"]),
+            self.num_actions, self.config["dueling"], self.config["hiddens"],
+            self.config["noisy"], self.config["num_atoms"],
+            self.config["v_min"], self.config["v_max"], self.config["sigma0"])
+        return qnet.value, qnet.logits, qnet.dist

    def _build_q_value_policy(self, q_values):
        return QValuePolicy(q_values, self.cur_observations, self.num_actions,
                            self.stochastic, self.eps).action

-    def _build_q_loss(self, q_t_selected, q_tp1_best):
-        return QLoss(q_t_selected, q_tp1_best, self.importance_weights,
-                     self.rew_t, self.done_mask, self.config["gamma"],
-                     self.config["n_step"])
+    def _build_q_loss(self, q_t_selected, q_logits_t_selected, q_tp1_best,
+                      q_dist_tp1_best):
+        return QLoss(q_t_selected, q_logits_t_selected, q_tp1_best,
+                     q_dist_tp1_best, self.importance_weights, self.rew_t,
+                     self.done_mask, self.config["gamma"],
+                     self.config["n_step"], self.config["num_atoms"],
+                     self.config["v_min"], self.config["v_max"])

    def optimizer(self):
-        return tf.train.AdamOptimizer(learning_rate=self.config["lr"])
+        return tf.train.AdamOptimizer(
+            learning_rate=self.config["lr"],
+            epsilon=self.config["adam_epsilon"])

    def gradients(self, optimizer):
        if self.config["grad_norm_clipping"] is not None:
@@ -14,6 +14,7 @@ from ray.rllib.evaluation.sample_batch import SampleBatch, DEFAULT_POLICY_ID, \
 from ray.rllib.utils.compression import pack_if_needed
 from ray.rllib.utils.filter import RunningStat
 from ray.rllib.utils.timer import TimerStat
+from ray.rllib.utils.schedules import LinearSchedule


 class SyncReplayOptimizer(PolicyOptimizer):
@@ -29,12 +30,20 @@ class SyncReplayOptimizer(PolicyOptimizer):
              prioritized_replay=True,
              prioritized_replay_alpha=0.6,
              prioritized_replay_beta=0.4,
+              schedule_max_timesteps=100000,
+              beta_annealing_fraction=0.2,
+              final_prioritized_replay_beta=0.4,
              prioritized_replay_eps=1e-6,
              train_batch_size=32,
              sample_batch_size=4):

        self.replay_starts = learning_starts
-        self.prioritized_replay_beta = prioritized_replay_beta
+        # Linearly anneal beta, as done in the Rainbow paper
+        self.prioritized_replay_beta = LinearSchedule(
+            schedule_timesteps=int(
+                schedule_max_timesteps * beta_annealing_fraction),
+            initial_p=prioritized_replay_beta,
+            final_p=final_prioritized_replay_beta)
        self.prioritized_replay_eps = prioritized_replay_eps
        self.train_batch_size = train_batch_size

@@ -122,7 +131,8 @@ class SyncReplayOptimizer(PolicyOptimizer):
                    (obses_t, actions, rewards, obses_tp1, dones, weights,
                     batch_indexes) = replay_buffer.sample(
                         self.train_batch_size,
-                         beta=self.prioritized_replay_beta)
+                         beta=self.prioritized_replay_beta.value(
+                             self.num_steps_trained))
                else:
                    (obses_t, actions, rewards, obses_tp1,
                     dones) = replay_buffer.sample(self.train_batch_size)
@@ -0,0 +1,29 @@
+pong-deterministic-rainbow:
+    env: PongDeterministic-v4
+    run: DQN
+    stop:
+        episode_reward_mean: 20
+    config:
+        num_atoms: 51
+        noisy: True
+        gamma: 0.99
+        lr: .0001
+        hiddens: [512]
+        learning_starts: 10000
+        buffer_size: 50000
+        sample_batch_size: 4
+        train_batch_size: 32
+        schedule_max_timesteps: 2000000
+        exploration_final_eps: 0.0
+        exploration_fraction: .000001
+        target_network_update_freq: 500
+        prioritized_replay: True
+        prioritized_replay_alpha: 0.5
+        beta_annealing_fraction: 0.2
+        final_prioritized_replay_beta: 1.0
+        n_step: 3
+        gpu: True
+        model:
+          grayscale: True
+          zero_mean: False
+          dim: 42