mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 01:16:06 +08:00
Exploration with Parameter Space Noise (#4048)
* enable parameter space noise for exploration * enable parameter space noise for exploration * yapf formatted * remove the usage of scipy softmax, which is available in the latest version only * enable subclass that has no parameter_noise in the config * run user specified callbacks and test parameter space noise in multi node setting * formatted by yapf * Update dqn.py * lint
This commit is contained in:
@@ -70,6 +70,9 @@ DEFAULT_CONFIG = with_common_config({
|
||||
"target_network_update_freq": 0,
|
||||
# Update the target by \tau * policy + (1-\tau) * target_policy
|
||||
"tau": 0.002,
|
||||
# If True parameter space noise will be used for exploration
|
||||
# See https://blog.openai.com/better-exploration-with-parameter-noise/
|
||||
"parameter_noise": False,
|
||||
|
||||
# === Replay buffer ===
|
||||
# Size of the replay buffer. Note that if async_updates is set, then
|
||||
|
||||
@@ -29,13 +29,20 @@ class PNetwork(object):
|
||||
"""Maps an observations (i.e., state) to an action where each entry takes
|
||||
value from (0, 1) due to the sigmoid function."""
|
||||
|
||||
def __init__(self, model, dim_actions, hiddens=[64, 64],
|
||||
activation="relu"):
|
||||
def __init__(self,
|
||||
model,
|
||||
dim_actions,
|
||||
hiddens=[64, 64],
|
||||
activation="relu",
|
||||
parameter_noise=False):
|
||||
action_out = model.last_layer
|
||||
activation = tf.nn.__dict__[activation]
|
||||
for hidden in hiddens:
|
||||
action_out = layers.fully_connected(
|
||||
action_out, num_outputs=hidden, activation_fn=activation)
|
||||
action_out,
|
||||
num_outputs=hidden,
|
||||
activation_fn=activation,
|
||||
normalizer_fn=layers.layer_norm if parameter_noise else None)
|
||||
# Use sigmoid layer to bound values within (0, 1)
|
||||
# shape of action_scores is [batch_size, dim_actions]
|
||||
self.action_scores = layers.fully_connected(
|
||||
@@ -60,7 +67,8 @@ class ActionNetwork(object):
|
||||
act_noise=0.1,
|
||||
is_target=False,
|
||||
target_noise=0.2,
|
||||
noise_clip=0.5):
|
||||
noise_clip=0.5,
|
||||
parameter_noise=False):
|
||||
|
||||
# shape is [None, dim_action]
|
||||
deterministic_actions = (
|
||||
@@ -97,8 +105,9 @@ class ActionNetwork(object):
|
||||
eps * (high_action - low_action) * exploration_value,
|
||||
low_action, high_action)
|
||||
|
||||
self.actions = tf.cond(stochastic, lambda: stochastic_actions,
|
||||
lambda: deterministic_actions)
|
||||
self.actions = tf.cond(
|
||||
tf.logical_and(stochastic, not parameter_noise),
|
||||
lambda: stochastic_actions, lambda: deterministic_actions)
|
||||
|
||||
|
||||
class QNetwork(object):
|
||||
@@ -210,6 +219,12 @@ class DDPGPolicyGraph(TFPolicyGraph):
|
||||
self.cur_observations, observation_space)
|
||||
self.p_func_vars = _scope_vars(scope.name)
|
||||
|
||||
# Noise vars for P network except for layer normalization vars
|
||||
if self.config["parameter_noise"]:
|
||||
self._build_parameter_noise([
|
||||
var for var in self.p_func_vars if "LayerNorm" not in var.name
|
||||
])
|
||||
|
||||
# Action outputs
|
||||
with tf.variable_scope(A_SCOPE):
|
||||
self.output_actions = self._build_action_network(
|
||||
@@ -429,6 +444,29 @@ class DDPGPolicyGraph(TFPolicyGraph):
|
||||
sample_batch,
|
||||
other_agent_batches=None,
|
||||
episode=None):
|
||||
if self.config["parameter_noise"]:
|
||||
# adjust the sigma of parameter space noise
|
||||
states, noisy_actions = [
|
||||
list(x) for x in sample_batch.columns(["obs", "actions"])
|
||||
]
|
||||
self.sess.run(self.remove_noise_op)
|
||||
clean_actions = self.sess.run(
|
||||
self.output_actions,
|
||||
feed_dict={
|
||||
self.cur_observations: states,
|
||||
self.stochastic: False,
|
||||
self.eps: .0
|
||||
})
|
||||
distance_in_action_space = np.sqrt(
|
||||
np.mean(np.square(clean_actions - noisy_actions)))
|
||||
self.pi_distance = distance_in_action_space
|
||||
if distance_in_action_space < self.config["exploration_sigma"]:
|
||||
self.parameter_noise_sigma_val *= 1.01
|
||||
else:
|
||||
self.parameter_noise_sigma_val /= 1.01
|
||||
self.parameter_noise_sigma.load(
|
||||
self.parameter_noise_sigma_val, session=self.sess)
|
||||
|
||||
return _postprocess_dqn(self, sample_batch)
|
||||
|
||||
@override(TFPolicyGraph)
|
||||
@@ -465,7 +503,8 @@ class DDPGPolicyGraph(TFPolicyGraph):
|
||||
"is_training": self._get_is_training_placeholder(),
|
||||
}, obs_space, 1, self.config["model"]), self.dim_actions,
|
||||
self.config["actor_hiddens"],
|
||||
self.config["actor_hidden_activation"])
|
||||
self.config["actor_hidden_activation"],
|
||||
self.config["parameter_noise"])
|
||||
return policy_net.action_scores, policy_net.model
|
||||
|
||||
def _build_action_network(self, p_values, stochastic, eps,
|
||||
@@ -491,6 +530,43 @@ class DDPGPolicyGraph(TFPolicyGraph):
|
||||
self.config["use_huber"], self.config["huber_threshold"],
|
||||
self.config["twin_q"])
|
||||
|
||||
def _build_parameter_noise(self, pnet_params):
    """Create the variables and ops used for parameter space noise.

    Args:
        pnet_params: list of policy-network variables to perturb. The
            caller is expected to have filtered out LayerNorm variables
            already, since no noise is applied to them.

    Attributes set on ``self``:
        parameter_noise_sigma_val: Python-side copy of the noise stddev,
            initialised from ``config["exploration_sigma"]``.
        parameter_noise_sigma: non-trainable scalar float32 variable
            holding the same stddev inside the graph.
        parameter_noise: one non-trainable, zero-initialised variable per
            entry of ``pnet_params`` that stores the last sampled noise.
        remove_noise_op: subtracts the stored noise from every variable.
        add_noise_op: samples fresh noise, stores it, then adds it to
            every variable.
        pi_distance: initialised to ``None`` here.
    """
    self.parameter_noise_sigma_val = self.config["exploration_sigma"]
    self.parameter_noise_sigma = tf.get_variable(
        initializer=tf.constant_initializer(
            self.parameter_noise_sigma_val),
        name="parameter_noise_sigma",
        shape=(),
        trainable=False,
        dtype=tf.float32)

    # One "<var>_noise" buffer per perturbed variable, same shape.
    self.parameter_noise = [
        tf.get_variable(
            name=var.name.split(':')[0] + "_noise",
            shape=var.shape,
            initializer=tf.constant_initializer(.0),
            trainable=False) for var in pnet_params
    ]

    # NOTE(review): the stored noise is not zeroed after removal, so
    # running remove_noise_op twice in a row subtracts it twice --
    # confirm callers always pair one add with one remove.
    self.remove_noise_op = tf.group(*[
        tf.assign_add(var, -var_noise)
        for var, var_noise in zip(pnet_params, self.parameter_noise)
    ])

    generate_noise_ops = [
        tf.assign(
            var_noise,
            tf.random_normal(
                shape=var_noise.shape,
                stddev=self.parameter_noise_sigma))
        for var_noise in self.parameter_noise
    ]

    # The additions must observe the freshly sampled noise, hence the
    # control dependency on every generate op.
    with tf.control_dependencies(generate_noise_ops):
        self.add_noise_op = tf.group(*[
            tf.assign_add(var, var_noise)
            for var, var_noise in zip(pnet_params, self.parameter_noise)
        ])

    self.pi_distance = None
|
||||
|
||||
def compute_td_error(self, obs_t, act_t, rew_t, obs_tp1, done_mask,
|
||||
importance_weights):
|
||||
td_err = self.sess.run(
|
||||
@@ -508,6 +584,10 @@ class DDPGPolicyGraph(TFPolicyGraph):
|
||||
def reset_noise(self, sess):
    """Run ``self.reset_noise_op`` in the given session.

    Args:
        sess: session object whose ``run`` method executes the op.
    """
    sess.run(self.reset_noise_op)
|
||||
|
||||
def add_parameter_noise(self):
    """Sample fresh parameter noise and add it to the network.

    Runs ``self.add_noise_op`` in ``self.sess`` when the
    ``parameter_noise`` config flag is truthy; otherwise does nothing.
    The flag is read with ``.get`` so that a config without the key is
    treated as "disabled" instead of raising ``KeyError``.
    """
    if self.config.get("parameter_noise", False):
        self.sess.run(self.add_noise_op)
|
||||
|
||||
# support both hard and soft sync
|
||||
def update_target(self, tau=None):
|
||||
return self.sess.run(
|
||||
|
||||
@@ -5,6 +5,7 @@ from __future__ import print_function
|
||||
import logging
|
||||
import time
|
||||
|
||||
from ray import tune
|
||||
from ray.rllib import optimizers
|
||||
from ray.rllib.agents.agent import Agent, with_common_config
|
||||
from ray.rllib.agents.dqn.dqn_policy_graph import DQNPolicyGraph
|
||||
@@ -73,6 +74,9 @@ DEFAULT_CONFIG = with_common_config({
|
||||
# Softmax temperature. Q values are divided by this value prior to softmax.
|
||||
# Softmax approaches argmax as the temperature drops to zero.
|
||||
"softmax_temp": 1.0,
|
||||
# If True parameter space noise will be used for exploration
|
||||
# See https://blog.openai.com/better-exploration-with-parameter-noise/
|
||||
"parameter_noise": False,
|
||||
|
||||
# === Replay buffer ===
|
||||
# Size of the replay buffer. Note that if async_updates is set, then
|
||||
@@ -139,6 +143,8 @@ class DQNAgent(Agent):
|
||||
|
||||
@override(Agent)
|
||||
def _init(self):
|
||||
self._validate_config()
|
||||
|
||||
# Update effective batch size to include n-step
|
||||
adjusted_batch_size = max(self.config["sample_batch_size"],
|
||||
self.config.get("n_step", 1))
|
||||
@@ -160,6 +166,41 @@ class DQNAgent(Agent):
|
||||
if k not in self.config["optimizer"]:
|
||||
self.config["optimizer"][k] = self.config[k]
|
||||
|
||||
if self.config.get("parameter_noise", False):
|
||||
if self.config["callbacks"]["on_episode_start"]:
|
||||
start_callback = self.config["callbacks"]["on_episode_start"]
|
||||
else:
|
||||
start_callback = None
|
||||
|
||||
def on_episode_start(info):
|
||||
# as a callback function to sample and pose parameter space
|
||||
# noise on the parameters of network
|
||||
policies = info["policy"]
|
||||
for pi in policies.values():
|
||||
pi.add_parameter_noise()
|
||||
if start_callback:
|
||||
start_callback(info)
|
||||
|
||||
self.config["callbacks"]["on_episode_start"] = tune.function(
|
||||
on_episode_start)
|
||||
if self.config["callbacks"]["on_episode_end"]:
|
||||
end_callback = self.config["callbacks"]["on_episode_end"]
|
||||
else:
|
||||
end_callback = None
|
||||
|
||||
def on_episode_end(info):
|
||||
# as a callback function to monitor the distance
|
||||
# between noisy policy and original policy
|
||||
policies = info["policy"]
|
||||
episode = info["episode"]
|
||||
episode.custom_metrics["policy_distance"] = policies[
|
||||
"default"].pi_distance
|
||||
if end_callback:
|
||||
end_callback(info)
|
||||
|
||||
self.config["callbacks"]["on_episode_end"] = tune.function(
|
||||
on_episode_end)
|
||||
|
||||
self.local_evaluator = self.make_local_evaluator(
|
||||
self.env_creator, self._policy_graph)
|
||||
|
||||
@@ -296,3 +337,14 @@ class DQNAgent(Agent):
|
||||
Agent.__setstate__(self, state)
|
||||
self.num_target_updates = state["num_target_updates"]
|
||||
self.last_target_update_ts = state["last_target_update_ts"]
|
||||
|
||||
def _validate_config(self):
|
||||
if self.config.get("parameter_noise", False):
|
||||
if self.config["batch_mode"] != "complete_episodes":
|
||||
raise ValueError(
|
||||
"Exploration with parameter space noise requires "
|
||||
"batch_mode to be complete_episodes.")
|
||||
if self.config.get("noisy", False):
|
||||
raise ValueError(
|
||||
"Exploration with parameter space noise and noisy network "
|
||||
"cannot be used at the same time.")
|
||||
|
||||
@@ -4,6 +4,7 @@ from __future__ import print_function
|
||||
|
||||
from gym.spaces import Discrete
|
||||
import numpy as np
|
||||
from scipy.stats import entropy
|
||||
import tensorflow as tf
|
||||
import tensorflow.contrib.layers as layers
|
||||
|
||||
@@ -28,7 +29,8 @@ class QNetwork(object):
|
||||
num_atoms=1,
|
||||
v_min=-10.0,
|
||||
v_max=10.0,
|
||||
sigma0=0.5):
|
||||
sigma0=0.5,
|
||||
parameter_noise=False):
|
||||
self.model = model
|
||||
with tf.variable_scope("action_value"):
|
||||
if hiddens:
|
||||
@@ -41,7 +43,9 @@ class QNetwork(object):
|
||||
action_out = layers.fully_connected(
|
||||
action_out,
|
||||
num_outputs=hiddens[i],
|
||||
activation_fn=tf.nn.relu)
|
||||
activation_fn=tf.nn.relu,
|
||||
normalizer_fn=layers.layer_norm
|
||||
if parameter_noise else None)
|
||||
else:
|
||||
# Avoid postprocessing the outputs. This enables custom models
|
||||
# to be used for parametric action DQN.
|
||||
@@ -89,7 +93,9 @@ class QNetwork(object):
|
||||
state_out = layers.fully_connected(
|
||||
state_out,
|
||||
num_outputs=hiddens[i],
|
||||
activation_fn=tf.nn.relu)
|
||||
activation_fn=tf.nn.relu,
|
||||
normalizer_fn=layers.layer_norm
|
||||
if parameter_noise else None)
|
||||
if use_noisy:
|
||||
state_score = self.noisy_layer(
|
||||
"dueling_output",
|
||||
@@ -310,6 +316,13 @@ class DQNPolicyGraph(TFPolicyGraph):
|
||||
self.q_values = q_values
|
||||
self.q_func_vars = _scope_vars(scope.name)
|
||||
|
||||
# Noise vars for Q network except for layer normalization vars
|
||||
if self.config["parameter_noise"]:
|
||||
self._build_parameter_noise([
|
||||
var for var in self.q_func_vars if "LayerNorm" not in var.name
|
||||
])
|
||||
self.action_probs = tf.nn.softmax(self.q_values)
|
||||
|
||||
# Action outputs
|
||||
self.output_actions, self.action_prob = self._build_q_value_policy(
|
||||
q_values)
|
||||
@@ -448,6 +461,28 @@ class DQNPolicyGraph(TFPolicyGraph):
|
||||
sample_batch,
|
||||
other_agent_batches=None,
|
||||
episode=None):
|
||||
if self.config["parameter_noise"]:
|
||||
# adjust the sigma of parameter space noise
|
||||
states = [list(x) for x in sample_batch.columns(["obs"])][0]
|
||||
|
||||
noisy_action_distribution = self.sess.run(
|
||||
self.action_probs, feed_dict={self.cur_observations: states})
|
||||
self.sess.run(self.remove_noise_op)
|
||||
clean_action_distribution = self.sess.run(
|
||||
self.action_probs, feed_dict={self.cur_observations: states})
|
||||
distance_in_action_space = np.mean(
|
||||
entropy(clean_action_distribution.T,
|
||||
noisy_action_distribution.T))
|
||||
self.pi_distance = distance_in_action_space
|
||||
if (distance_in_action_space <
|
||||
-np.log(1 - self.cur_epsilon +
|
||||
self.cur_epsilon / self.num_actions)):
|
||||
self.parameter_noise_sigma_val *= 1.01
|
||||
else:
|
||||
self.parameter_noise_sigma_val /= 1.01
|
||||
self.parameter_noise_sigma.load(
|
||||
self.parameter_noise_sigma_val, session=self.sess)
|
||||
|
||||
return _postprocess_dqn(self, sample_batch)
|
||||
|
||||
@override(PolicyGraph)
|
||||
@@ -459,6 +494,43 @@ class DQNPolicyGraph(TFPolicyGraph):
|
||||
TFPolicyGraph.set_state(self, state[0])
|
||||
self.set_epsilon(state[1])
|
||||
|
||||
def _build_parameter_noise(self, pnet_params):
    """Create the variables and ops used for parameter space noise.

    Args:
        pnet_params: list of Q-network variables to perturb. The caller
            is expected to have filtered out LayerNorm variables
            already, since no noise is applied to them.

    Attributes set on ``self``:
        parameter_noise_sigma_val: Python-side copy of the noise stddev,
            starting at 1.0.
        parameter_noise_sigma: non-trainable scalar float32 variable
            holding the same stddev inside the graph.
        parameter_noise: one non-trainable, zero-initialised variable per
            entry of ``pnet_params`` that stores the last sampled noise.
        remove_noise_op: subtracts the stored noise from every variable.
        add_noise_op: samples fresh noise, stores it, then adds it to
            every variable.
        pi_distance: initialised to ``None`` here.
    """
    self.parameter_noise_sigma_val = 1.0
    self.parameter_noise_sigma = tf.get_variable(
        initializer=tf.constant_initializer(
            self.parameter_noise_sigma_val),
        name="parameter_noise_sigma",
        shape=(),
        trainable=False,
        dtype=tf.float32)

    # One "<var>_noise" buffer per perturbed variable, same shape.
    self.parameter_noise = [
        tf.get_variable(
            name=var.name.split(':')[0] + "_noise",
            shape=var.shape,
            initializer=tf.constant_initializer(.0),
            trainable=False) for var in pnet_params
    ]

    # NOTE(review): the stored noise is not zeroed after removal, so
    # running remove_noise_op twice in a row subtracts it twice --
    # confirm callers always pair one add with one remove.
    self.remove_noise_op = tf.group(*[
        tf.assign_add(var, -var_noise)
        for var, var_noise in zip(pnet_params, self.parameter_noise)
    ])

    generate_noise_ops = [
        tf.assign(
            var_noise,
            tf.random_normal(
                shape=var_noise.shape,
                stddev=self.parameter_noise_sigma))
        for var_noise in self.parameter_noise
    ]

    # The additions must observe the freshly sampled noise, hence the
    # control dependency on every generate op.
    with tf.control_dependencies(generate_noise_ops):
        self.add_noise_op = tf.group(*[
            tf.assign_add(var, var_noise)
            for var, var_noise in zip(pnet_params, self.parameter_noise)
        ])

    self.pi_distance = None
|
||||
|
||||
def compute_td_error(self, obs_t, act_t, rew_t, obs_tp1, done_mask,
|
||||
importance_weights):
|
||||
td_err = self.sess.run(
|
||||
@@ -473,6 +545,10 @@ class DQNPolicyGraph(TFPolicyGraph):
|
||||
})
|
||||
return td_err
|
||||
|
||||
def add_parameter_noise(self):
    """Sample fresh parameter noise and add it to the network.

    Runs ``self.add_noise_op`` in ``self.sess`` when the
    ``parameter_noise`` config flag is truthy; otherwise does nothing.
    The flag is read with ``.get`` so that a config without the key is
    treated as "disabled" instead of raising ``KeyError``.
    """
    if self.config.get("parameter_noise", False):
        self.sess.run(self.add_noise_op)
|
||||
|
||||
def update_target(self):
    """Run ``self.update_target_expr`` in ``self.sess``.

    Returns:
        Whatever ``self.sess.run`` returns for that expression.
    """
    return self.sess.run(self.update_target_expr)
|
||||
|
||||
@@ -487,7 +563,8 @@ class DQNPolicyGraph(TFPolicyGraph):
|
||||
}, space, self.num_actions, self.config["model"]),
|
||||
self.num_actions, self.config["dueling"], self.config["hiddens"],
|
||||
self.config["noisy"], self.config["num_atoms"],
|
||||
self.config["v_min"], self.config["v_max"], self.config["sigma0"])
|
||||
self.config["v_min"], self.config["v_max"], self.config["sigma0"],
|
||||
self.config["parameter_noise"])
|
||||
return qnet.value, qnet.logits, qnet.dist, qnet.model
|
||||
|
||||
def _build_q_value_policy(self, q_values):
|
||||
|
||||
@@ -264,7 +264,8 @@ def _env_runner(base_env,
|
||||
if callbacks.get("on_episode_start"):
|
||||
callbacks["on_episode_start"]({
|
||||
"env": base_env,
|
||||
"episode": episode
|
||||
"policy": policies,
|
||||
"episode": episode,
|
||||
})
|
||||
return episode
|
||||
|
||||
@@ -412,6 +413,7 @@ def _process_observations(base_env, policies, batch_builder_pool,
|
||||
if callbacks.get("on_episode_end"):
|
||||
callbacks["on_episode_end"]({
|
||||
"env": base_env,
|
||||
"policy": policies,
|
||||
"episode": episode
|
||||
})
|
||||
del active_episodes[env_id]
|
||||
|
||||
Reference in New Issue
Block a user