From 05490b8cb9b321ed12e4e3e7df75545957a8766f Mon Sep 17 00:00:00 2001 From: Sergey Kolesnikov Date: Mon, 23 Jul 2018 00:47:14 +0300 Subject: [PATCH] [rllib] dqn/ddpg policy customization (#2445) * dqn policy update - more customization * docs for custom DQN graph * Update rllib-training.rst * Update rllib-models.rst * Update rllib.rst * Update rllib-training.rst * Update rllib-concepts.rst * yapf codestyle --- doc/source/rllib-concepts.rst | 2 +- doc/source/rllib-models.rst | 50 +++++++++++++ doc/source/rllib.rst | 1 + .../rllib/agents/ddpg/ddpg_policy_graph.py | 72 ++++++++++--------- .../ray/rllib/agents/dqn/dqn_policy_graph.py | 47 ++++++------ 5 files changed, 116 insertions(+), 56 deletions(-) diff --git a/doc/source/rllib-concepts.rst b/doc/source/rllib-concepts.rst index dc8f50688..f752279cb 100644 --- a/doc/source/rllib-concepts.rst +++ b/doc/source/rllib-concepts.rst @@ -15,7 +15,7 @@ Most interaction with deep learning frameworks is isolated to the `PolicyGraph i Policy Evaluation ----------------- -Given an environment and policy graph, policy evaluation produces `batches `__ of experiences. This is your classic "environment interaction loop". Efficient policy evaluation can be burdensome to get right, especially when leveraging vectorization, RNNs, or when operating in a multi-agent environment. RLlib provides a `PolicyEvaluator `__ class that manages all of this, and this class is used in most RLlib algorithm. +Given an environment and policy graph, policy evaluation produces `batches `__ of experiences. This is your classic "environment interaction loop". Efficient policy evaluation can be burdensome to get right, especially when leveraging vectorization, RNNs, or when operating in a multi-agent environment. RLlib provides a `PolicyEvaluator `__ class that manages all of this, and this class is used in most RLlib algorithms. You can also use policy evaluation standalone to produce batches of experiences. This can be done by calling ``ev.sample()`` on an evaluator instance, or ``ev.sample.remote()`` in parallel on evaluator instances created as Ray actors (see ``PolicyEvalutor.as_remote()``). diff --git a/doc/source/rllib-models.rst b/doc/source/rllib-models.rst index 938824ec5..098cfb6ec 100644 --- a/doc/source/rllib-models.rst +++ b/doc/source/rllib-models.rst @@ -75,3 +75,53 @@ Similarly, custom preprocessors should subclass the RLlib `preprocessor class `__ * `Custom Models `__ * `Custom Preprocessors `__ +* `Customizing Policy Graphs `__ RLlib Concepts -------------- diff --git a/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py b/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py index ceae0d0f0..a6f26885f 100644 --- a/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py +++ b/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py @@ -133,31 +133,14 @@ class DDPGPolicyGraph(TFPolicyGraph): self.config = config self.cur_epsilon = 1.0 - dim_actions = action_space.shape[0] - low_action = action_space.low - high_action = action_space.high + self.dim_actions = action_space.shape[0] + self.low_action = action_space.low + self.high_action = action_space.high self.actor_optimizer = tf.train.AdamOptimizer( learning_rate=config["actor_lr"]) self.critic_optimizer = tf.train.AdamOptimizer( learning_rate=config["critic_lr"]) - def _build_q_network(obs, actions): - return QNetwork( - ModelCatalog.get_model(obs, 1, config["model"]), actions, - config["critic_hiddens"], - config["critic_hidden_activation"]).value - - def _build_p_network(obs): - return PNetwork( - ModelCatalog.get_model(obs, 1, config["model"]), dim_actions, - config["actor_hiddens"], - config["actor_hidden_activation"]).action_scores - - def _build_action_network(p_values, stochastic, eps): - return ActionNetwork(p_values, low_action, high_action, stochastic, - eps, config["exploration_theta"], - config["exploration_sigma"]).actions - # Action inputs self.stochastic = tf.placeholder(tf.bool, (), name="stochastic") self.eps = tf.placeholder(tf.float32, (), name="eps") @@ -166,18 +149,18 @@ class DDPGPolicyGraph(TFPolicyGraph): # Actor: P (policy) network with tf.variable_scope(P_SCOPE) as scope: - p_values = _build_p_network(self.cur_observations) + p_values = self._build_p_network(self.cur_observations) self.p_func_vars = _scope_vars(scope.name) # Action outputs with tf.variable_scope(A_SCOPE): - self.output_actions = _build_action_network( + self.output_actions = self._build_action_network( p_values, self.stochastic, self.eps) with tf.variable_scope(A_SCOPE, reuse=True): exploration_sample = tf.get_variable(name="ornstein_uhlenbeck") self.reset_noise_op = tf.assign(exploration_sample, - dim_actions * [.0]) + self.dim_actions * [.0]) # Replay inputs self.obs_t = tf.placeholder( @@ -195,39 +178,37 @@ class DDPGPolicyGraph(TFPolicyGraph): # p network evaluation with tf.variable_scope(P_SCOPE, reuse=True) as scope: - self.p_t = _build_p_network(self.obs_t) + self.p_t = self._build_p_network(self.obs_t) # target p network evaluation with tf.variable_scope(P_TARGET_SCOPE) as scope: - p_tp1 = _build_p_network(self.obs_tp1) + p_tp1 = self._build_p_network(self.obs_tp1) target_p_func_vars = _scope_vars(scope.name) # Action outputs with tf.variable_scope(A_SCOPE, reuse=True): deterministic_flag = tf.constant(value=False, dtype=tf.bool) zero_eps = tf.constant(value=.0, dtype=tf.float32) - output_actions = _build_action_network( + output_actions = self._build_action_network( self.p_t, deterministic_flag, zero_eps) - output_actions_estimated = _build_action_network( + output_actions_estimated = self._build_action_network( p_tp1, deterministic_flag, zero_eps) # q network evaluation with tf.variable_scope(Q_SCOPE) as scope: - q_t = _build_q_network(self.obs_t, self.act_t) + q_t = self._build_q_network(self.obs_t, self.act_t) self.q_func_vars = _scope_vars(scope.name) with tf.variable_scope(Q_SCOPE, reuse=True): - q_tp0 = _build_q_network(self.obs_t, output_actions) + q_tp0 = self._build_q_network(self.obs_t, output_actions) # target q network evalution with tf.variable_scope(Q_TARGET_SCOPE) as scope: - q_tp1 = _build_q_network(self.obs_tp1, output_actions_estimated) + q_tp1 = self._build_q_network(self.obs_tp1, + output_actions_estimated) target_q_func_vars = _scope_vars(scope.name) - self.loss = ActorCriticLoss( - q_t, q_tp1, q_tp0, self.importance_weights, self.rew_t, - self.done_mask, config["gamma"], config["n_step"], - config["use_huber"], config["huber_threshold"]) + self.loss = self._build_actor_critic_loss(q_t, q_tp1, q_tp0) if config["l2_reg"] is not None: for var in self.p_func_vars: @@ -286,6 +267,29 @@ class DDPGPolicyGraph(TFPolicyGraph): # Hard initial update self.update_target(tau=1.0) + def _build_q_network(self, obs, actions): + return QNetwork( + ModelCatalog.get_model(obs, 1, self.config["model"]), actions, + self.config["critic_hiddens"], + self.config["critic_hidden_activation"]).value + + def _build_p_network(self, obs): + return PNetwork( + ModelCatalog.get_model(obs, 1, self.config["model"]), + self.dim_actions, self.config["actor_hiddens"], + self.config["actor_hidden_activation"]).action_scores + + def _build_action_network(self, p_values, stochastic, eps): + return ActionNetwork(p_values, self.low_action, self.high_action, + stochastic, eps, self.config["exploration_theta"], + self.config["exploration_sigma"]).actions + + def _build_actor_critic_loss(self, q_t, q_tp1, q_tp0): + return ActorCriticLoss( + q_t, q_tp1, q_tp0, self.importance_weights, self.rew_t, + self.done_mask, self.config["gamma"], self.config["n_step"], + self.config["use_huber"], self.config["huber_threshold"]) + def gradients(self, optimizer): if self.config["grad_norm_clipping"] is not None: actor_grads_and_vars = _minimize_and_clip( diff --git a/python/ray/rllib/agents/dqn/dqn_policy_graph.py b/python/ray/rllib/agents/dqn/dqn_policy_graph.py index f553ad325..6dccdede1 100644 --- a/python/ray/rllib/agents/dqn/dqn_policy_graph.py +++ b/python/ray/rllib/agents/dqn/dqn_policy_graph.py @@ -71,7 +71,6 @@ class QLoss(object): done_mask, gamma=0.99, n_step=1): - q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best # compute RHS of bellman equation @@ -93,12 +92,7 @@ class DQNPolicyGraph(TFPolicyGraph): self.config = config self.cur_epsilon = 1.0 - num_actions = action_space.n - - def _build_q_network(obs): - return QNetwork( - ModelCatalog.get_model(obs, 1, config["model"]), num_actions, - config["dueling"], config["hiddens"]).value + self.num_actions = action_space.n # Action inputs self.stochastic = tf.placeholder(tf.bool, (), name="stochastic") @@ -108,13 +102,11 @@ class DQNPolicyGraph(TFPolicyGraph): # Action Q network with tf.variable_scope(Q_SCOPE) as scope: - q_values = _build_q_network(self.cur_observations) + q_values = self._build_q_network(self.cur_observations) self.q_func_vars = _scope_vars(scope.name) # Action outputs - self.output_actions = QValuePolicy(q_values, self.cur_observations, - num_actions, self.stochastic, - self.eps).action + self.output_actions = self._build_q_value_policy(q_values) # Replay inputs self.obs_t = tf.placeholder( @@ -129,31 +121,29 @@ class DQNPolicyGraph(TFPolicyGraph): # q network evaluation with tf.variable_scope(Q_SCOPE, reuse=True): - q_t = _build_q_network(self.obs_t) + q_t = self._build_q_network(self.obs_t) # target q network evalution with tf.variable_scope(Q_TARGET_SCOPE) as scope: - q_tp1 = _build_q_network(self.obs_tp1) + q_tp1 = self._build_q_network(self.obs_tp1) self.target_q_func_vars = _scope_vars(scope.name) # q scores for actions which we know were selected in the given state. - q_t_selected = tf.reduce_sum(q_t * tf.one_hot(self.act_t, num_actions), - 1) + q_t_selected = tf.reduce_sum( + q_t * tf.one_hot(self.act_t, self.num_actions), 1) # compute estimate of best possible value starting from state at t + 1 if config["double_q"]: with tf.variable_scope(Q_SCOPE, reuse=True): - q_tp1_using_online_net = _build_q_network(self.obs_tp1) + q_tp1_using_online_net = self._build_q_network(self.obs_tp1) q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1) q_tp1_best = tf.reduce_sum( - q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), - 1) + q_tp1 * tf.one_hot(q_tp1_best_using_online_net, + self.num_actions), 1) else: q_tp1_best = tf.reduce_max(q_tp1, 1) - self.loss = QLoss(q_t_selected, q_tp1_best, self.importance_weights, - self.rew_t, self.done_mask, config["gamma"], - config["n_step"]) + self.loss = self._build_q_loss(q_t_selected, q_tp1_best) # update_target_fn will be called periodically to copy Q network to # target Q network @@ -185,6 +175,21 @@ class DQNPolicyGraph(TFPolicyGraph): loss_inputs=self.loss_inputs) self.sess.run(tf.global_variables_initializer()) + def _build_q_network(self, obs): + return QNetwork( + ModelCatalog.get_model(obs, 1, + self.config["model"]), self.num_actions, + self.config["dueling"], self.config["hiddens"]).value + + def _build_q_value_policy(self, q_values): + return QValuePolicy(q_values, self.cur_observations, self.num_actions, + self.stochastic, self.eps).action + + def _build_q_loss(self, q_t_selected, q_tp1_best): + return QLoss(q_t_selected, q_tp1_best, self.importance_weights, + self.rew_t, self.done_mask, self.config["gamma"], + self.config["n_step"]) + def optimizer(self): return tf.train.AdamOptimizer(learning_rate=self.config["lr"])