From 3ac8fd7ee8f91d52d698fb8d1bbbf2b4f8f23dad Mon Sep 17 00:00:00 2001 From: Jones Wong Date: Thu, 21 Feb 2019 14:35:18 +0800 Subject: [PATCH] Exploration with Parameter Space Noise (#4048) * enable parameter space noise for exploration * enable parameter space noise for exploration * yapf formatted * remove the usage of scipy softmax available in the latest version only * enable subclass that has no parameter_noise in the config * run user specified callbacks and test parameter space noise in multi node setting * formatted by yapf * Update dqn.py * lint --- python/ray/rllib/agents/ddpg/ddpg.py | 3 + .../rllib/agents/ddpg/ddpg_policy_graph.py | 94 +++++++++++++++++-- python/ray/rllib/agents/dqn/dqn.py | 52 ++++++++++ .../ray/rllib/agents/dqn/dqn_policy_graph.py | 85 ++++++++++++++++- python/ray/rllib/evaluation/sampler.py | 4 +- test/jenkins_tests/run_multi_node_tests.sh | 8 ++ 6 files changed, 234 insertions(+), 12 deletions(-) diff --git a/python/ray/rllib/agents/ddpg/ddpg.py b/python/ray/rllib/agents/ddpg/ddpg.py index e2cb92ab0..9420e0f15 100644 --- a/python/ray/rllib/agents/ddpg/ddpg.py +++ b/python/ray/rllib/agents/ddpg/ddpg.py @@ -70,6 +70,9 @@ DEFAULT_CONFIG = with_common_config({ "target_network_update_freq": 0, # Update the target by \tau * policy + (1-\tau) * target_policy "tau": 0.002, + # If True parameter space noise will be used for exploration + # See https://blog.openai.com/better-exploration-with-parameter-noise/ + "parameter_noise": False, # === Replay buffer === # Size of the replay buffer. 
Note that if async_updates is set, then diff --git a/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py b/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py index 24276d6b7..3032bbb3f 100644 --- a/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py +++ b/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py @@ -29,13 +29,20 @@ class PNetwork(object): """Maps an observations (i.e., state) to an action where each entry takes value from (0, 1) due to the sigmoid function.""" - def __init__(self, model, dim_actions, hiddens=[64, 64], - activation="relu"): + def __init__(self, + model, + dim_actions, + hiddens=[64, 64], + activation="relu", + parameter_noise=False): action_out = model.last_layer activation = tf.nn.__dict__[activation] for hidden in hiddens: action_out = layers.fully_connected( - action_out, num_outputs=hidden, activation_fn=activation) + action_out, + num_outputs=hidden, + activation_fn=activation, + normalizer_fn=layers.layer_norm if parameter_noise else None) # Use sigmoid layer to bound values within (0, 1) # shape of action_scores is [batch_size, dim_actions] self.action_scores = layers.fully_connected( @@ -60,7 +67,8 @@ class ActionNetwork(object): act_noise=0.1, is_target=False, target_noise=0.2, - noise_clip=0.5): + noise_clip=0.5, + parameter_noise=False): # shape is [None, dim_action] deterministic_actions = ( @@ -97,8 +105,9 @@ class ActionNetwork(object): eps * (high_action - low_action) * exploration_value, low_action, high_action) - self.actions = tf.cond(stochastic, lambda: stochastic_actions, - lambda: deterministic_actions) + self.actions = tf.cond( + tf.logical_and(stochastic, not parameter_noise), + lambda: stochastic_actions, lambda: deterministic_actions) class QNetwork(object): @@ -210,6 +219,12 @@ class DDPGPolicyGraph(TFPolicyGraph): self.cur_observations, observation_space) self.p_func_vars = _scope_vars(scope.name) + # Noise vars for P network except for layer normalization vars + if self.config["parameter_noise"]: + 
self._build_parameter_noise([ + var for var in self.p_func_vars if "LayerNorm" not in var.name + ]) + # Action outputs with tf.variable_scope(A_SCOPE): self.output_actions = self._build_action_network( @@ -429,6 +444,29 @@ class DDPGPolicyGraph(TFPolicyGraph): sample_batch, other_agent_batches=None, episode=None): + if self.config["parameter_noise"]: + # adjust the sigma of parameter space noise + states, noisy_actions = [ + list(x) for x in sample_batch.columns(["obs", "actions"]) + ] + self.sess.run(self.remove_noise_op) + clean_actions = self.sess.run( + self.output_actions, + feed_dict={ + self.cur_observations: states, + self.stochastic: False, + self.eps: .0 + }) + distance_in_action_space = np.sqrt( + np.mean(np.square(clean_actions - noisy_actions))) + self.pi_distance = distance_in_action_space + if distance_in_action_space < self.config["exploration_sigma"]: + self.parameter_noise_sigma_val *= 1.01 + else: + self.parameter_noise_sigma_val /= 1.01 + self.parameter_noise_sigma.load( + self.parameter_noise_sigma_val, session=self.sess) + return _postprocess_dqn(self, sample_batch) @override(TFPolicyGraph) @@ -465,7 +503,8 @@ class DDPGPolicyGraph(TFPolicyGraph): "is_training": self._get_is_training_placeholder(), }, obs_space, 1, self.config["model"]), self.dim_actions, self.config["actor_hiddens"], - self.config["actor_hidden_activation"]) + self.config["actor_hidden_activation"], + self.config["parameter_noise"]) return policy_net.action_scores, policy_net.model def _build_action_network(self, p_values, stochastic, eps, @@ -491,6 +530,43 @@ class DDPGPolicyGraph(TFPolicyGraph): self.config["use_huber"], self.config["huber_threshold"], self.config["twin_q"]) + def _build_parameter_noise(self, pnet_params): + self.parameter_noise_sigma_val = self.config["exploration_sigma"] + self.parameter_noise_sigma = tf.get_variable( + initializer=tf.constant_initializer( + self.parameter_noise_sigma_val), + name="parameter_noise_sigma", + shape=(), + trainable=False, + 
dtype=tf.float32) + self.parameter_noise = list() + # No need to add any noise on LayerNorm parameters + for var in pnet_params: + noise_var = tf.get_variable( + name=var.name.split(':')[0] + "_noise", + shape=var.shape, + initializer=tf.constant_initializer(.0), + trainable=False) + self.parameter_noise.append(noise_var) + remove_noise_ops = list() + for var, var_noise in zip(pnet_params, self.parameter_noise): + remove_noise_ops.append(tf.assign_add(var, -var_noise)) + self.remove_noise_op = tf.group(*tuple(remove_noise_ops)) + generate_noise_ops = list() + for var_noise in self.parameter_noise: + generate_noise_ops.append( + tf.assign( + var_noise, + tf.random_normal( + shape=var_noise.shape, + stddev=self.parameter_noise_sigma))) + with tf.control_dependencies(generate_noise_ops): + add_noise_ops = list() + for var, var_noise in zip(pnet_params, self.parameter_noise): + add_noise_ops.append(tf.assign_add(var, var_noise)) + self.add_noise_op = tf.group(*tuple(add_noise_ops)) + self.pi_distance = None + def compute_td_error(self, obs_t, act_t, rew_t, obs_tp1, done_mask, importance_weights): td_err = self.sess.run( @@ -508,6 +584,10 @@ class DDPGPolicyGraph(TFPolicyGraph): def reset_noise(self, sess): sess.run(self.reset_noise_op) + def add_parameter_noise(self): + if self.config["parameter_noise"]: + self.sess.run(self.add_noise_op) + # support both hard and soft sync def update_target(self, tau=None): return self.sess.run( diff --git a/python/ray/rllib/agents/dqn/dqn.py b/python/ray/rllib/agents/dqn/dqn.py index d36f72034..31f7f12cc 100644 --- a/python/ray/rllib/agents/dqn/dqn.py +++ b/python/ray/rllib/agents/dqn/dqn.py @@ -5,6 +5,7 @@ from __future__ import print_function import logging import time +from ray import tune from ray.rllib import optimizers from ray.rllib.agents.agent import Agent, with_common_config from ray.rllib.agents.dqn.dqn_policy_graph import DQNPolicyGraph @@ -73,6 +74,9 @@ DEFAULT_CONFIG = with_common_config({ # Softmax temperature. 
Q values are divided by this value prior to softmax. # Softmax approaches argmax as the temperature drops to zero. "softmax_temp": 1.0, + # If True parameter space noise will be used for exploration + # See https://blog.openai.com/better-exploration-with-parameter-noise/ + "parameter_noise": False, # === Replay buffer === # Size of the replay buffer. Note that if async_updates is set, then @@ -139,6 +143,8 @@ class DQNAgent(Agent): @override(Agent) def _init(self): + self._validate_config() + # Update effective batch size to include n-step adjusted_batch_size = max(self.config["sample_batch_size"], self.config.get("n_step", 1)) @@ -160,6 +166,41 @@ class DQNAgent(Agent): if k not in self.config["optimizer"]: self.config["optimizer"][k] = self.config[k] + if self.config.get("parameter_noise", False): + if self.config["callbacks"]["on_episode_start"]: + start_callback = self.config["callbacks"]["on_episode_start"] + else: + start_callback = None + + def on_episode_start(info): + # as a callback function to sample and pose parameter space + # noise on the parameters of network + policies = info["policy"] + for pi in policies.values(): + pi.add_parameter_noise() + if start_callback: + start_callback(info) + + self.config["callbacks"]["on_episode_start"] = tune.function( + on_episode_start) + if self.config["callbacks"]["on_episode_end"]: + end_callback = self.config["callbacks"]["on_episode_end"] + else: + end_callback = None + + def on_episode_end(info): + # as a callback function to monitor the distance + # between noisy policy and original policy + policies = info["policy"] + episode = info["episode"] + episode.custom_metrics["policy_distance"] = policies[ + "default"].pi_distance + if end_callback: + end_callback(info) + + self.config["callbacks"]["on_episode_end"] = tune.function( + on_episode_end) + self.local_evaluator = self.make_local_evaluator( self.env_creator, self._policy_graph) @@ -296,3 +337,14 @@ class DQNAgent(Agent): Agent.__setstate__(self, state) 
self.num_target_updates = state["num_target_updates"] self.last_target_update_ts = state["last_target_update_ts"] + + def _validate_config(self): + if self.config.get("parameter_noise", False): + if self.config["batch_mode"] != "complete_episodes": + raise ValueError( + "Exploration with parameter space noise requires " + "batch_mode to be complete_episodes.") + if self.config.get("noisy", False): + raise ValueError( + "Exploration with parameter space noise and noisy network " + "cannot be used at the same time.") diff --git a/python/ray/rllib/agents/dqn/dqn_policy_graph.py b/python/ray/rllib/agents/dqn/dqn_policy_graph.py index 57f629f7a..561bd8be3 100644 --- a/python/ray/rllib/agents/dqn/dqn_policy_graph.py +++ b/python/ray/rllib/agents/dqn/dqn_policy_graph.py @@ -4,6 +4,7 @@ from __future__ import print_function from gym.spaces import Discrete import numpy as np +from scipy.stats import entropy import tensorflow as tf import tensorflow.contrib.layers as layers @@ -28,7 +29,8 @@ class QNetwork(object): num_atoms=1, v_min=-10.0, v_max=10.0, - sigma0=0.5): + sigma0=0.5, + parameter_noise=False): self.model = model with tf.variable_scope("action_value"): if hiddens: @@ -41,7 +43,9 @@ class QNetwork(object): action_out = layers.fully_connected( action_out, num_outputs=hiddens[i], - activation_fn=tf.nn.relu) + activation_fn=tf.nn.relu, + normalizer_fn=layers.layer_norm + if parameter_noise else None) else: # Avoid postprocessing the outputs. This enables custom models # to be used for parametric action DQN. 
@@ -89,7 +93,9 @@ class QNetwork(object): state_out = layers.fully_connected( state_out, num_outputs=hiddens[i], - activation_fn=tf.nn.relu) + activation_fn=tf.nn.relu, + normalizer_fn=layers.layer_norm + if parameter_noise else None) if use_noisy: state_score = self.noisy_layer( "dueling_output", @@ -310,6 +316,13 @@ class DQNPolicyGraph(TFPolicyGraph): self.q_values = q_values self.q_func_vars = _scope_vars(scope.name) + # Noise vars for Q network except for layer normalization vars + if self.config["parameter_noise"]: + self._build_parameter_noise([ + var for var in self.q_func_vars if "LayerNorm" not in var.name + ]) + self.action_probs = tf.nn.softmax(self.q_values) + # Action outputs self.output_actions, self.action_prob = self._build_q_value_policy( q_values) @@ -448,6 +461,28 @@ class DQNPolicyGraph(TFPolicyGraph): sample_batch, other_agent_batches=None, episode=None): + if self.config["parameter_noise"]: + # adjust the sigma of parameter space noise + states = [list(x) for x in sample_batch.columns(["obs"])][0] + + noisy_action_distribution = self.sess.run( + self.action_probs, feed_dict={self.cur_observations: states}) + self.sess.run(self.remove_noise_op) + clean_action_distribution = self.sess.run( + self.action_probs, feed_dict={self.cur_observations: states}) + distance_in_action_space = np.mean( + entropy(clean_action_distribution.T, + noisy_action_distribution.T)) + self.pi_distance = distance_in_action_space + if (distance_in_action_space < + -np.log(1 - self.cur_epsilon + + self.cur_epsilon / self.num_actions)): + self.parameter_noise_sigma_val *= 1.01 + else: + self.parameter_noise_sigma_val /= 1.01 + self.parameter_noise_sigma.load( + self.parameter_noise_sigma_val, session=self.sess) + return _postprocess_dqn(self, sample_batch) @override(PolicyGraph) @@ -459,6 +494,43 @@ class DQNPolicyGraph(TFPolicyGraph): TFPolicyGraph.set_state(self, state[0]) self.set_epsilon(state[1]) + def _build_parameter_noise(self, pnet_params): + 
self.parameter_noise_sigma_val = 1.0 + self.parameter_noise_sigma = tf.get_variable( + initializer=tf.constant_initializer( + self.parameter_noise_sigma_val), + name="parameter_noise_sigma", + shape=(), + trainable=False, + dtype=tf.float32) + self.parameter_noise = list() + # No need to add any noise on LayerNorm parameters + for var in pnet_params: + noise_var = tf.get_variable( + name=var.name.split(':')[0] + "_noise", + shape=var.shape, + initializer=tf.constant_initializer(.0), + trainable=False) + self.parameter_noise.append(noise_var) + remove_noise_ops = list() + for var, var_noise in zip(pnet_params, self.parameter_noise): + remove_noise_ops.append(tf.assign_add(var, -var_noise)) + self.remove_noise_op = tf.group(*tuple(remove_noise_ops)) + generate_noise_ops = list() + for var_noise in self.parameter_noise: + generate_noise_ops.append( + tf.assign( + var_noise, + tf.random_normal( + shape=var_noise.shape, + stddev=self.parameter_noise_sigma))) + with tf.control_dependencies(generate_noise_ops): + add_noise_ops = list() + for var, var_noise in zip(pnet_params, self.parameter_noise): + add_noise_ops.append(tf.assign_add(var, var_noise)) + self.add_noise_op = tf.group(*tuple(add_noise_ops)) + self.pi_distance = None + def compute_td_error(self, obs_t, act_t, rew_t, obs_tp1, done_mask, importance_weights): td_err = self.sess.run( @@ -473,6 +545,10 @@ class DQNPolicyGraph(TFPolicyGraph): }) return td_err + def add_parameter_noise(self): + if self.config["parameter_noise"]: + self.sess.run(self.add_noise_op) + def update_target(self): return self.sess.run(self.update_target_expr) @@ -487,7 +563,8 @@ class DQNPolicyGraph(TFPolicyGraph): }, space, self.num_actions, self.config["model"]), self.num_actions, self.config["dueling"], self.config["hiddens"], self.config["noisy"], self.config["num_atoms"], - self.config["v_min"], self.config["v_max"], self.config["sigma0"]) + self.config["v_min"], self.config["v_max"], self.config["sigma0"], + 
self.config["parameter_noise"]) return qnet.value, qnet.logits, qnet.dist, qnet.model def _build_q_value_policy(self, q_values): diff --git a/python/ray/rllib/evaluation/sampler.py b/python/ray/rllib/evaluation/sampler.py index 5a40fe8a5..5ead9c7f8 100644 --- a/python/ray/rllib/evaluation/sampler.py +++ b/python/ray/rllib/evaluation/sampler.py @@ -264,7 +264,8 @@ def _env_runner(base_env, if callbacks.get("on_episode_start"): callbacks["on_episode_start"]({ "env": base_env, - "episode": episode + "policy": policies, + "episode": episode, }) return episode @@ -412,6 +413,7 @@ def _process_observations(base_env, policies, batch_builder_pool, if callbacks.get("on_episode_end"): callbacks["on_episode_end"]({ "env": base_env, + "policy": policies, "episode": episode }) del active_episodes[env_id] diff --git a/test/jenkins_tests/run_multi_node_tests.sh b/test/jenkins_tests/run_multi_node_tests.sh index c75d76e1d..89250bd50 100755 --- a/test/jenkins_tests/run_multi_node_tests.sh +++ b/test/jenkins_tests/run_multi_node_tests.sh @@ -259,6 +259,14 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ --stop '{"training_iteration": 2}' \ --config '{"num_workers": 2, "optimizer": {"num_replay_buffer_shards": 1}, "learning_starts": 100, "min_iter_time_s": 1}' +docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ + python /ray/python/ray/rllib/train.py \ + --env Pendulum-v0 \ + --run APEX_DDPG \ + --ray-num-cpus 8 \ + --stop '{"training_iteration": 2}' \ + --config '{"num_workers": 2, "optimizer": {"num_replay_buffer_shards": 1}, "learning_starts": 100, "min_iter_time_s": 1, "batch_mode": "complete_episodes", "parameter_noise": true}' + docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ python /ray/python/ray/rllib/train.py \ --env CartPole-v0 \