mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 03:18:59 +08:00
[rllib] dqn/ddpg policy customization (#2445)
* dqn policy update - more customization * docs for custom DQN graph * Update rllib-training.rst * Update rllib-models.rst * Update rllib.rst * Update rllib-training.rst * Update rllib-concepts.rst * yapf codestyle
This commit is contained in:
committed by
Eric Liang
parent
68660453e4
commit
05490b8cb9
@@ -133,31 +133,14 @@ class DDPGPolicyGraph(TFPolicyGraph):
|
||||
|
||||
self.config = config
|
||||
self.cur_epsilon = 1.0
|
||||
dim_actions = action_space.shape[0]
|
||||
low_action = action_space.low
|
||||
high_action = action_space.high
|
||||
self.dim_actions = action_space.shape[0]
|
||||
self.low_action = action_space.low
|
||||
self.high_action = action_space.high
|
||||
self.actor_optimizer = tf.train.AdamOptimizer(
|
||||
learning_rate=config["actor_lr"])
|
||||
self.critic_optimizer = tf.train.AdamOptimizer(
|
||||
learning_rate=config["critic_lr"])
|
||||
|
||||
def _build_q_network(obs, actions):
|
||||
return QNetwork(
|
||||
ModelCatalog.get_model(obs, 1, config["model"]), actions,
|
||||
config["critic_hiddens"],
|
||||
config["critic_hidden_activation"]).value
|
||||
|
||||
def _build_p_network(obs):
|
||||
return PNetwork(
|
||||
ModelCatalog.get_model(obs, 1, config["model"]), dim_actions,
|
||||
config["actor_hiddens"],
|
||||
config["actor_hidden_activation"]).action_scores
|
||||
|
||||
def _build_action_network(p_values, stochastic, eps):
|
||||
return ActionNetwork(p_values, low_action, high_action, stochastic,
|
||||
eps, config["exploration_theta"],
|
||||
config["exploration_sigma"]).actions
|
||||
|
||||
# Action inputs
|
||||
self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
|
||||
self.eps = tf.placeholder(tf.float32, (), name="eps")
|
||||
@@ -166,18 +149,18 @@ class DDPGPolicyGraph(TFPolicyGraph):
|
||||
|
||||
# Actor: P (policy) network
|
||||
with tf.variable_scope(P_SCOPE) as scope:
|
||||
p_values = _build_p_network(self.cur_observations)
|
||||
p_values = self._build_p_network(self.cur_observations)
|
||||
self.p_func_vars = _scope_vars(scope.name)
|
||||
|
||||
# Action outputs
|
||||
with tf.variable_scope(A_SCOPE):
|
||||
self.output_actions = _build_action_network(
|
||||
self.output_actions = self._build_action_network(
|
||||
p_values, self.stochastic, self.eps)
|
||||
|
||||
with tf.variable_scope(A_SCOPE, reuse=True):
|
||||
exploration_sample = tf.get_variable(name="ornstein_uhlenbeck")
|
||||
self.reset_noise_op = tf.assign(exploration_sample,
|
||||
dim_actions * [.0])
|
||||
self.dim_actions * [.0])
|
||||
|
||||
# Replay inputs
|
||||
self.obs_t = tf.placeholder(
|
||||
@@ -195,39 +178,37 @@ class DDPGPolicyGraph(TFPolicyGraph):
|
||||
|
||||
# p network evaluation
|
||||
with tf.variable_scope(P_SCOPE, reuse=True) as scope:
|
||||
self.p_t = _build_p_network(self.obs_t)
|
||||
self.p_t = self._build_p_network(self.obs_t)
|
||||
|
||||
# target p network evaluation
|
||||
with tf.variable_scope(P_TARGET_SCOPE) as scope:
|
||||
p_tp1 = _build_p_network(self.obs_tp1)
|
||||
p_tp1 = self._build_p_network(self.obs_tp1)
|
||||
target_p_func_vars = _scope_vars(scope.name)
|
||||
|
||||
# Action outputs
|
||||
with tf.variable_scope(A_SCOPE, reuse=True):
|
||||
deterministic_flag = tf.constant(value=False, dtype=tf.bool)
|
||||
zero_eps = tf.constant(value=.0, dtype=tf.float32)
|
||||
output_actions = _build_action_network(
|
||||
output_actions = self._build_action_network(
|
||||
self.p_t, deterministic_flag, zero_eps)
|
||||
|
||||
output_actions_estimated = _build_action_network(
|
||||
output_actions_estimated = self._build_action_network(
|
||||
p_tp1, deterministic_flag, zero_eps)
|
||||
|
||||
# q network evaluation
|
||||
with tf.variable_scope(Q_SCOPE) as scope:
|
||||
q_t = _build_q_network(self.obs_t, self.act_t)
|
||||
q_t = self._build_q_network(self.obs_t, self.act_t)
|
||||
self.q_func_vars = _scope_vars(scope.name)
|
||||
with tf.variable_scope(Q_SCOPE, reuse=True):
|
||||
q_tp0 = _build_q_network(self.obs_t, output_actions)
|
||||
q_tp0 = self._build_q_network(self.obs_t, output_actions)
|
||||
|
||||
# target q network evaluation
|
||||
with tf.variable_scope(Q_TARGET_SCOPE) as scope:
|
||||
q_tp1 = _build_q_network(self.obs_tp1, output_actions_estimated)
|
||||
q_tp1 = self._build_q_network(self.obs_tp1,
|
||||
output_actions_estimated)
|
||||
target_q_func_vars = _scope_vars(scope.name)
|
||||
|
||||
self.loss = ActorCriticLoss(
|
||||
q_t, q_tp1, q_tp0, self.importance_weights, self.rew_t,
|
||||
self.done_mask, config["gamma"], config["n_step"],
|
||||
config["use_huber"], config["huber_threshold"])
|
||||
self.loss = self._build_actor_critic_loss(q_t, q_tp1, q_tp0)
|
||||
|
||||
if config["l2_reg"] is not None:
|
||||
for var in self.p_func_vars:
|
||||
@@ -286,6 +267,29 @@ class DDPGPolicyGraph(TFPolicyGraph):
|
||||
# Hard initial update
|
||||
self.update_target(tau=1.0)
|
||||
|
||||
def _build_q_network(self, obs, actions):
    """Construct the critic for `obs`/`actions` and return its `value`.

    The observation is first passed through the model obtained from
    `ModelCatalog.get_model` using `self.config["model"]`; the result,
    the actions and the critic layer settings are handed to `QNetwork`.

    Args:
        obs: observation input forwarded to `ModelCatalog.get_model`.
        actions: action input forwarded to `QNetwork`.

    Returns:
        The `value` attribute of the constructed `QNetwork`.
    """
    cfg = self.config
    # The literal 1 is get_model's second argument (presumably the
    # number of outputs -- confirm against ModelCatalog).
    base_model = ModelCatalog.get_model(obs, 1, cfg["model"])
    critic = QNetwork(base_model, actions, cfg["critic_hiddens"],
                      cfg["critic_hidden_activation"])
    return critic.value
|
||||
|
||||
def _build_p_network(self, obs):
    """Construct the actor for `obs` and return its `action_scores`.

    Args:
        obs: observation input forwarded to `ModelCatalog.get_model`.

    Returns:
        The `action_scores` attribute of a `PNetwork` built from the
        catalog model, `self.dim_actions` and the actor layer settings
        in `self.config`.
    """
    cfg = self.config
    base_model = ModelCatalog.get_model(obs, 1, cfg["model"])
    actor = PNetwork(base_model, self.dim_actions, cfg["actor_hiddens"],
                     cfg["actor_hidden_activation"])
    return actor.action_scores
|
||||
|
||||
def _build_action_network(self, p_values, stochastic, eps):
    """Wrap policy outputs in an `ActionNetwork` and return its `actions`.

    Args:
        p_values: output of the policy (P) network.
        stochastic: flag forwarded unchanged to `ActionNetwork`.
        eps: value forwarded unchanged to `ActionNetwork`.

    Returns:
        The `actions` attribute of the constructed `ActionNetwork`, which
        also receives the action bounds stored on this object and the
        `exploration_theta` / `exploration_sigma` config entries.
    """
    low, high = self.low_action, self.high_action
    theta = self.config["exploration_theta"]
    sigma = self.config["exploration_sigma"]
    action_net = ActionNetwork(p_values, low, high, stochastic, eps, theta,
                               sigma)
    return action_net.actions
|
||||
|
||||
def _build_actor_critic_loss(self, q_t, q_tp1, q_tp0):
    """Create the `ActorCriticLoss` object for the three Q estimates.

    Args:
        q_t: Q output built from `obs_t` and `act_t`.
        q_tp1: target Q output built from `obs_tp1`.
        q_tp0: Q output built from `obs_t` and the policy's own actions.

    Returns:
        The `ActorCriticLoss` instance itself (not one of its attributes),
        constructed with the replay tensors stored on this object and the
        `gamma`, `n_step`, `use_huber` and `huber_threshold` config entries.
    """
    q_estimates = (q_t, q_tp1, q_tp0)
    replay_inputs = (self.importance_weights, self.rew_t, self.done_mask)
    cfg = self.config
    loss_settings = (cfg["gamma"], cfg["n_step"], cfg["use_huber"],
                     cfg["huber_threshold"])
    # Positional order matches the ActorCriticLoss constructor call that
    # previously lived inline in __init__.
    return ActorCriticLoss(*(q_estimates + replay_inputs + loss_settings))
|
||||
|
||||
def gradients(self, optimizer):
|
||||
if self.config["grad_norm_clipping"] is not None:
|
||||
actor_grads_and_vars = _minimize_and_clip(
|
||||
|
||||
@@ -71,7 +71,6 @@ class QLoss(object):
|
||||
done_mask,
|
||||
gamma=0.99,
|
||||
n_step=1):
|
||||
|
||||
q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best
|
||||
|
||||
# compute RHS of bellman equation
|
||||
@@ -93,12 +92,7 @@ class DQNPolicyGraph(TFPolicyGraph):
|
||||
|
||||
self.config = config
|
||||
self.cur_epsilon = 1.0
|
||||
num_actions = action_space.n
|
||||
|
||||
def _build_q_network(obs):
|
||||
return QNetwork(
|
||||
ModelCatalog.get_model(obs, 1, config["model"]), num_actions,
|
||||
config["dueling"], config["hiddens"]).value
|
||||
self.num_actions = action_space.n
|
||||
|
||||
# Action inputs
|
||||
self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
|
||||
@@ -108,13 +102,11 @@ class DQNPolicyGraph(TFPolicyGraph):
|
||||
|
||||
# Action Q network
|
||||
with tf.variable_scope(Q_SCOPE) as scope:
|
||||
q_values = _build_q_network(self.cur_observations)
|
||||
q_values = self._build_q_network(self.cur_observations)
|
||||
self.q_func_vars = _scope_vars(scope.name)
|
||||
|
||||
# Action outputs
|
||||
self.output_actions = QValuePolicy(q_values, self.cur_observations,
|
||||
num_actions, self.stochastic,
|
||||
self.eps).action
|
||||
self.output_actions = self._build_q_value_policy(q_values)
|
||||
|
||||
# Replay inputs
|
||||
self.obs_t = tf.placeholder(
|
||||
@@ -129,31 +121,29 @@ class DQNPolicyGraph(TFPolicyGraph):
|
||||
|
||||
# q network evaluation
|
||||
with tf.variable_scope(Q_SCOPE, reuse=True):
|
||||
q_t = _build_q_network(self.obs_t)
|
||||
q_t = self._build_q_network(self.obs_t)
|
||||
|
||||
# target q network evaluation
|
||||
with tf.variable_scope(Q_TARGET_SCOPE) as scope:
|
||||
q_tp1 = _build_q_network(self.obs_tp1)
|
||||
q_tp1 = self._build_q_network(self.obs_tp1)
|
||||
self.target_q_func_vars = _scope_vars(scope.name)
|
||||
|
||||
# q scores for actions which we know were selected in the given state.
|
||||
q_t_selected = tf.reduce_sum(q_t * tf.one_hot(self.act_t, num_actions),
|
||||
1)
|
||||
q_t_selected = tf.reduce_sum(
|
||||
q_t * tf.one_hot(self.act_t, self.num_actions), 1)
|
||||
|
||||
# compute estimate of best possible value starting from state at t + 1
|
||||
if config["double_q"]:
|
||||
with tf.variable_scope(Q_SCOPE, reuse=True):
|
||||
q_tp1_using_online_net = _build_q_network(self.obs_tp1)
|
||||
q_tp1_using_online_net = self._build_q_network(self.obs_tp1)
|
||||
q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
|
||||
q_tp1_best = tf.reduce_sum(
|
||||
q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions),
|
||||
1)
|
||||
q_tp1 * tf.one_hot(q_tp1_best_using_online_net,
|
||||
self.num_actions), 1)
|
||||
else:
|
||||
q_tp1_best = tf.reduce_max(q_tp1, 1)
|
||||
|
||||
self.loss = QLoss(q_t_selected, q_tp1_best, self.importance_weights,
|
||||
self.rew_t, self.done_mask, config["gamma"],
|
||||
config["n_step"])
|
||||
self.loss = self._build_q_loss(q_t_selected, q_tp1_best)
|
||||
|
||||
# update_target_fn will be called periodically to copy Q network to
|
||||
# target Q network
|
||||
@@ -185,6 +175,21 @@ class DQNPolicyGraph(TFPolicyGraph):
|
||||
loss_inputs=self.loss_inputs)
|
||||
self.sess.run(tf.global_variables_initializer())
|
||||
|
||||
def _build_q_network(self, obs):
    """Construct the Q network for `obs` and return its `value`.

    Args:
        obs: observation input forwarded to `ModelCatalog.get_model`.

    Returns:
        The `value` attribute of a `QNetwork` built from the catalog
        model, `self.num_actions` and the `dueling` / `hiddens` config
        entries.
    """
    cfg = self.config
    base_model = ModelCatalog.get_model(obs, 1, cfg["model"])
    q_net = QNetwork(base_model, self.num_actions, cfg["dueling"],
                     cfg["hiddens"])
    return q_net.value
|
||||
|
||||
def _build_q_value_policy(self, q_values):
    """Build a `QValuePolicy` over `q_values` and return its `action`.

    Args:
        q_values: output of the action Q network.

    Returns:
        The `action` attribute of the constructed `QValuePolicy`, which
        also receives the current observations, the number of actions and
        the `stochastic` / `eps` inputs stored on this object.
    """
    policy = QValuePolicy(
        q_values,
        self.cur_observations,
        self.num_actions,
        self.stochastic,
        self.eps,
    )
    return policy.action
|
||||
|
||||
def _build_q_loss(self, q_t_selected, q_tp1_best):
    """Create the `QLoss` object for the selected and best next Q values.

    Args:
        q_t_selected: Q values of the actions taken at time t.
        q_tp1_best: estimate of the best Q value at time t + 1.

    Returns:
        The `QLoss` instance itself, constructed with the replay tensors
        stored on this object and the `gamma` / `n_step` config entries.
    """
    replay_inputs = (self.importance_weights, self.rew_t, self.done_mask)
    discount = self.config["gamma"]
    steps = self.config["n_step"]
    return QLoss(q_t_selected, q_tp1_best, *replay_inputs, discount, steps)
|
||||
|
||||
def optimizer(self):
    """Return an Adam optimizer using the `lr` entry of `self.config`."""
    adam_cls = tf.train.AdamOptimizer
    step_size = self.config["lr"]
    return adam_cls(learning_rate=step_size)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user