[rllib] dqn/ddpg policy customization (#2445)

* dqn policy update - more customization

* docs for custom DQN graph

* Update rllib-training.rst

* Update rllib-models.rst

* Update rllib.rst

* Update rllib-training.rst

* Update rllib-concepts.rst

* yapf codestyle
This commit is contained in:
Sergey Kolesnikov
2018-07-23 00:47:14 +03:00
committed by Eric Liang
parent 68660453e4
commit 05490b8cb9
5 changed files with 116 additions and 56 deletions
@@ -133,31 +133,14 @@ class DDPGPolicyGraph(TFPolicyGraph):
self.config = config
self.cur_epsilon = 1.0
dim_actions = action_space.shape[0]
low_action = action_space.low
high_action = action_space.high
self.dim_actions = action_space.shape[0]
self.low_action = action_space.low
self.high_action = action_space.high
self.actor_optimizer = tf.train.AdamOptimizer(
learning_rate=config["actor_lr"])
self.critic_optimizer = tf.train.AdamOptimizer(
learning_rate=config["critic_lr"])
def _build_q_network(obs, actions):
return QNetwork(
ModelCatalog.get_model(obs, 1, config["model"]), actions,
config["critic_hiddens"],
config["critic_hidden_activation"]).value
def _build_p_network(obs):
return PNetwork(
ModelCatalog.get_model(obs, 1, config["model"]), dim_actions,
config["actor_hiddens"],
config["actor_hidden_activation"]).action_scores
def _build_action_network(p_values, stochastic, eps):
return ActionNetwork(p_values, low_action, high_action, stochastic,
eps, config["exploration_theta"],
config["exploration_sigma"]).actions
# Action inputs
self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
self.eps = tf.placeholder(tf.float32, (), name="eps")
@@ -166,18 +149,18 @@ class DDPGPolicyGraph(TFPolicyGraph):
# Actor: P (policy) network
with tf.variable_scope(P_SCOPE) as scope:
p_values = _build_p_network(self.cur_observations)
p_values = self._build_p_network(self.cur_observations)
self.p_func_vars = _scope_vars(scope.name)
# Action outputs
with tf.variable_scope(A_SCOPE):
self.output_actions = _build_action_network(
self.output_actions = self._build_action_network(
p_values, self.stochastic, self.eps)
with tf.variable_scope(A_SCOPE, reuse=True):
exploration_sample = tf.get_variable(name="ornstein_uhlenbeck")
self.reset_noise_op = tf.assign(exploration_sample,
dim_actions * [.0])
self.dim_actions * [.0])
# Replay inputs
self.obs_t = tf.placeholder(
@@ -195,39 +178,37 @@ class DDPGPolicyGraph(TFPolicyGraph):
# p network evaluation
with tf.variable_scope(P_SCOPE, reuse=True) as scope:
self.p_t = _build_p_network(self.obs_t)
self.p_t = self._build_p_network(self.obs_t)
# target p network evaluation
with tf.variable_scope(P_TARGET_SCOPE) as scope:
p_tp1 = _build_p_network(self.obs_tp1)
p_tp1 = self._build_p_network(self.obs_tp1)
target_p_func_vars = _scope_vars(scope.name)
# Action outputs
with tf.variable_scope(A_SCOPE, reuse=True):
deterministic_flag = tf.constant(value=False, dtype=tf.bool)
zero_eps = tf.constant(value=.0, dtype=tf.float32)
output_actions = _build_action_network(
output_actions = self._build_action_network(
self.p_t, deterministic_flag, zero_eps)
output_actions_estimated = _build_action_network(
output_actions_estimated = self._build_action_network(
p_tp1, deterministic_flag, zero_eps)
# q network evaluation
with tf.variable_scope(Q_SCOPE) as scope:
q_t = _build_q_network(self.obs_t, self.act_t)
q_t = self._build_q_network(self.obs_t, self.act_t)
self.q_func_vars = _scope_vars(scope.name)
with tf.variable_scope(Q_SCOPE, reuse=True):
q_tp0 = _build_q_network(self.obs_t, output_actions)
q_tp0 = self._build_q_network(self.obs_t, output_actions)
# target q network evaluation
with tf.variable_scope(Q_TARGET_SCOPE) as scope:
q_tp1 = _build_q_network(self.obs_tp1, output_actions_estimated)
q_tp1 = self._build_q_network(self.obs_tp1,
output_actions_estimated)
target_q_func_vars = _scope_vars(scope.name)
self.loss = ActorCriticLoss(
q_t, q_tp1, q_tp0, self.importance_weights, self.rew_t,
self.done_mask, config["gamma"], config["n_step"],
config["use_huber"], config["huber_threshold"])
self.loss = self._build_actor_critic_loss(q_t, q_tp1, q_tp0)
if config["l2_reg"] is not None:
for var in self.p_func_vars:
@@ -286,6 +267,29 @@ class DDPGPolicyGraph(TFPolicyGraph):
# Hard initial update
self.update_target(tau=1.0)
def _build_q_network(self, obs, actions):
    """Construct the critic (Q) head for ``obs`` and ``actions``.

    The observations are first passed to ``ModelCatalog.get_model``
    (configured by ``config["model"]``); the resulting model and
    ``actions`` are then handed to ``QNetwork`` together with the
    ``critic_hiddens`` and ``critic_hidden_activation`` settings.

    Kept as a separate method so that a subclass can override it to
    customize the network.

    Returns:
        The ``value`` attribute of the constructed ``QNetwork``.
    """
    cfg = self.config
    base_model = ModelCatalog.get_model(obs, 1, cfg["model"])
    critic = QNetwork(base_model, actions, cfg["critic_hiddens"],
                      cfg["critic_hidden_activation"])
    return critic.value
def _build_p_network(self, obs):
    """Construct the actor (policy) head for ``obs``.

    The observations are first passed to ``ModelCatalog.get_model``
    (configured by ``config["model"]``); the resulting model is then
    handed to ``PNetwork`` together with ``self.dim_actions`` and the
    ``actor_hiddens`` and ``actor_hidden_activation`` settings.

    Kept as a separate method so that a subclass can override it to
    customize the network.

    Returns:
        The ``action_scores`` attribute of the constructed ``PNetwork``.
    """
    cfg = self.config
    base_model = ModelCatalog.get_model(obs, 1, cfg["model"])
    actor = PNetwork(base_model, self.dim_actions, cfg["actor_hiddens"],
                     cfg["actor_hidden_activation"])
    return actor.action_scores
def _build_action_network(self, p_values, stochastic, eps):
    """Turn policy outputs into actions via ``ActionNetwork``.

    ``p_values``, ``stochastic`` and ``eps`` are forwarded unchanged,
    along with the action bounds stored on this object
    (``self.low_action`` / ``self.high_action``) and the
    ``exploration_theta`` / ``exploration_sigma`` config entries.

    Kept as a separate method so that a subclass can override it.

    Returns:
        The ``actions`` attribute of the constructed ``ActionNetwork``.
    """
    cfg = self.config
    action_net = ActionNetwork(
        p_values,
        self.low_action,
        self.high_action,
        stochastic,
        eps,
        cfg["exploration_theta"],
        cfg["exploration_sigma"],
    )
    return action_net.actions
def _build_actor_critic_loss(self, q_t, q_tp1, q_tp0):
    """Create the ``ActorCriticLoss`` object for the given Q outputs.

    Besides the three arguments, this reads ``self.importance_weights``,
    ``self.rew_t`` and ``self.done_mask``, so those attributes must
    already be set when it is called. The remaining loss settings
    (``gamma``, ``n_step``, ``use_huber``, ``huber_threshold``) come
    from ``self.config``.

    Kept as a separate method so that a subclass can override it.

    Returns:
        The constructed ``ActorCriticLoss`` instance.
    """
    cfg = self.config
    loss = ActorCriticLoss(
        q_t,
        q_tp1,
        q_tp0,
        self.importance_weights,
        self.rew_t,
        self.done_mask,
        cfg["gamma"],
        cfg["n_step"],
        cfg["use_huber"],
        cfg["huber_threshold"],
    )
    return loss
def gradients(self, optimizer):
if self.config["grad_norm_clipping"] is not None:
actor_grads_and_vars = _minimize_and_clip(
+26 -21
View File
@@ -71,7 +71,6 @@ class QLoss(object):
done_mask,
gamma=0.99,
n_step=1):
q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best
# compute RHS of bellman equation
@@ -93,12 +92,7 @@ class DQNPolicyGraph(TFPolicyGraph):
self.config = config
self.cur_epsilon = 1.0
num_actions = action_space.n
def _build_q_network(obs):
return QNetwork(
ModelCatalog.get_model(obs, 1, config["model"]), num_actions,
config["dueling"], config["hiddens"]).value
self.num_actions = action_space.n
# Action inputs
self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
@@ -108,13 +102,11 @@ class DQNPolicyGraph(TFPolicyGraph):
# Action Q network
with tf.variable_scope(Q_SCOPE) as scope:
q_values = _build_q_network(self.cur_observations)
q_values = self._build_q_network(self.cur_observations)
self.q_func_vars = _scope_vars(scope.name)
# Action outputs
self.output_actions = QValuePolicy(q_values, self.cur_observations,
num_actions, self.stochastic,
self.eps).action
self.output_actions = self._build_q_value_policy(q_values)
# Replay inputs
self.obs_t = tf.placeholder(
@@ -129,31 +121,29 @@ class DQNPolicyGraph(TFPolicyGraph):
# q network evaluation
with tf.variable_scope(Q_SCOPE, reuse=True):
q_t = _build_q_network(self.obs_t)
q_t = self._build_q_network(self.obs_t)
# target q network evaluation
with tf.variable_scope(Q_TARGET_SCOPE) as scope:
q_tp1 = _build_q_network(self.obs_tp1)
q_tp1 = self._build_q_network(self.obs_tp1)
self.target_q_func_vars = _scope_vars(scope.name)
# q scores for actions which we know were selected in the given state.
q_t_selected = tf.reduce_sum(q_t * tf.one_hot(self.act_t, num_actions),
1)
q_t_selected = tf.reduce_sum(
q_t * tf.one_hot(self.act_t, self.num_actions), 1)
# compute estimate of best possible value starting from state at t + 1
if config["double_q"]:
with tf.variable_scope(Q_SCOPE, reuse=True):
q_tp1_using_online_net = _build_q_network(self.obs_tp1)
q_tp1_using_online_net = self._build_q_network(self.obs_tp1)
q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
q_tp1_best = tf.reduce_sum(
q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions),
1)
q_tp1 * tf.one_hot(q_tp1_best_using_online_net,
self.num_actions), 1)
else:
q_tp1_best = tf.reduce_max(q_tp1, 1)
self.loss = QLoss(q_t_selected, q_tp1_best, self.importance_weights,
self.rew_t, self.done_mask, config["gamma"],
config["n_step"])
self.loss = self._build_q_loss(q_t_selected, q_tp1_best)
# update_target_fn will be called periodically to copy Q network to
# target Q network
@@ -185,6 +175,21 @@ class DQNPolicyGraph(TFPolicyGraph):
loss_inputs=self.loss_inputs)
self.sess.run(tf.global_variables_initializer())
def _build_q_network(self, obs):
    """Construct the Q head for ``obs``.

    The observations are first passed to ``ModelCatalog.get_model``
    (configured by ``config["model"]``); the resulting model is then
    handed to ``QNetwork`` together with ``self.num_actions`` and the
    ``dueling`` and ``hiddens`` config entries.

    Kept as a separate method so that a subclass can override it to
    customize the network.

    Returns:
        The ``value`` attribute of the constructed ``QNetwork``.
    """
    cfg = self.config
    base_model = ModelCatalog.get_model(obs, 1, cfg["model"])
    q_net = QNetwork(base_model, self.num_actions, cfg["dueling"],
                     cfg["hiddens"])
    return q_net.value
def _build_q_value_policy(self, q_values):
    """Build the action output from ``q_values`` via ``QValuePolicy``.

    Reads ``self.cur_observations``, ``self.num_actions``,
    ``self.stochastic`` and ``self.eps``, so those attributes must
    already be set when this is called.

    Kept as a separate method so that a subclass can override it.

    Returns:
        The ``action`` attribute of the constructed ``QValuePolicy``.
    """
    policy = QValuePolicy(
        q_values,
        self.cur_observations,
        self.num_actions,
        self.stochastic,
        self.eps,
    )
    return policy.action
def _build_q_loss(self, q_t_selected, q_tp1_best):
    """Create the ``QLoss`` object for the given Q estimates.

    Besides the two arguments, this reads ``self.importance_weights``,
    ``self.rew_t`` and ``self.done_mask``, so those attributes must
    already be set when it is called. ``gamma`` and ``n_step`` come
    from ``self.config``.

    Kept as a separate method so that a subclass can override it.

    Returns:
        The constructed ``QLoss`` instance.
    """
    cfg = self.config
    loss = QLoss(
        q_t_selected,
        q_tp1_best,
        self.importance_weights,
        self.rew_t,
        self.done_mask,
        cfg["gamma"],
        cfg["n_step"],
    )
    return loss
def optimizer(self):
    """Return a new Adam optimizer using the ``lr`` config entry.

    Returns:
        A ``tf.train.AdamOptimizer`` whose ``learning_rate`` is
        ``self.config["lr"]``.
    """
    learning_rate = self.config["lr"]
    return tf.train.AdamOptimizer(learning_rate=learning_rate)