From 16f7ca45e40364e76e5887a751baba38b64c9c3c Mon Sep 17 00:00:00 2001 From: Michael Luo Date: Fri, 18 Jan 2019 13:40:26 -0800 Subject: [PATCH] Appo (#3779) * Deleted old fork, updated new ray and moved PPO-impala to APPO in ppo folder * Deleted unneccesary vtrace.py file * Update pong-impala.yaml * Cleaned PPO Code * Update pong-impala.yaml * Update pong-impala.yaml * wip * new ifle * refactor * add vtrace off option * revert * support any space * docs * fix comment * remove kl * Update cartpole-appo-vtrace.yaml --- doc/source/rllib-algorithms.rst | 18 + doc/source/rllib-env.rst | 2 +- doc/source/rllib.rst | 2 + python/ray/rllib/agents/agent.py | 12 +- python/ray/rllib/agents/impala/impala.py | 12 +- .../agents/impala/vtrace_policy_graph.py | 2 +- python/ray/rllib/agents/ppo/__init__.py | 3 +- python/ray/rllib/agents/ppo/appo.py | 65 +++ .../ray/rllib/agents/ppo/appo_policy_graph.py | 423 ++++++++++++++++++ python/ray/rllib/agents/registry.py | 6 + .../ray/rllib/test/test_supported_spaces.py | 1 + .../ray/rllib/tuned_examples/pong-appo.yaml | 22 + .../cartpole-appo-vtrace.yaml | 13 + .../regression_tests/cartpole-appo.yaml | 13 + 14 files changed, 584 insertions(+), 10 deletions(-) create mode 100644 python/ray/rllib/agents/ppo/appo.py create mode 100644 python/ray/rllib/agents/ppo/appo_policy_graph.py create mode 100644 python/ray/rllib/tuned_examples/pong-appo.yaml create mode 100644 python/ray/rllib/tuned_examples/regression_tests/cartpole-appo-vtrace.yaml create mode 100644 python/ray/rllib/tuned_examples/regression_tests/cartpole-appo.yaml diff --git a/doc/source/rllib-algorithms.rst b/doc/source/rllib-algorithms.rst index e11fdabb6..6ee5aef50 100644 --- a/doc/source/rllib-algorithms.rst +++ b/doc/source/rllib-algorithms.rst @@ -88,6 +88,24 @@ SpaceInvaders 843 ~300 :start-after: __sphinx_doc_begin__ :end-before: __sphinx_doc_end__ +Asynchronous Proximal Policy Optimization (APPO) +------------------------------------------------ + +`[paper] `__ +`[implementation] `__ +We include an asynchronous variant of Proximal Policy Optimization (PPO) based on the IMPALA architecture. This is similar to IMPALA but using a surrogate policy loss with clipping. Compared to synchronous PPO, APPO is more efficient in wall-clock time due to its use of asynchronous sampling. Using a clipped loss also allows for multiple SGD passes, and therefore the potential for better sample efficiency compared to IMPALA. V-trace can also be enabled to correct for off-policy samples. + +This implementation is currently *experimental*. Consider also using `PPO `__ or `IMPALA `__. + +Tuned examples: `PongNoFrameskip-v4 `__ + +**APPO-specific configs** (see also `common configs `__): + +.. literalinclude:: ../../python/ray/rllib/agents/ppo/appo.py + :language: python + :start-after: __sphinx_doc_begin__ + :end-before: __sphinx_doc_end__ + Gradient-based ~~~~~~~~~~~~~~ diff --git a/doc/source/rllib-env.rst b/doc/source/rllib-env.rst index b1a2b2185..dcdf9f399 100644 --- a/doc/source/rllib-env.rst +++ b/doc/source/rllib-env.rst @@ -11,7 +11,7 @@ RLlib works with several different types of environments, including `OpenAI Gym Algorithm Discrete Actions Continuous Actions Multi-Agent Recurrent Policies ============= ======================= ================== =========== ================== A2C, A3C **Yes** `+parametric`_ **Yes** **Yes** **Yes** -PPO **Yes** `+parametric`_ **Yes** **Yes** **Yes** +PPO, APPO **Yes** `+parametric`_ **Yes** **Yes** **Yes** PG **Yes** `+parametric`_ **Yes** **Yes** **Yes** IMPALA **Yes** `+parametric`_ No **Yes** **Yes** DQN, Rainbow **Yes** `+parametric`_ No **Yes** No diff --git a/doc/source/rllib.rst b/doc/source/rllib.rst index b771f14ef..4304321dd 100644 --- a/doc/source/rllib.rst +++ b/doc/source/rllib.rst @@ -50,6 +50,8 @@ Algorithms - `Importance Weighted Actor-Learner Architecture (IMPALA) `__ + - `Asynchronous Proximal Policy Optimization (APPO) `__ + * Gradient-based - `Advantage Actor-Critic (A2C, A3C) `__ diff --git a/python/ray/rllib/agents/agent.py b/python/ray/rllib/agents/agent.py index 282c7736d..4ba9ffda2 100644 --- a/python/ray/rllib/agents/agent.py +++ b/python/ray/rllib/agents/agent.py @@ -185,7 +185,13 @@ COMMON_CONFIG = { def with_common_config(extra_config): """Returns the given config dict merged with common agent confs.""" - config = copy.deepcopy(COMMON_CONFIG) + return with_base_config(COMMON_CONFIG, extra_config) + + +def with_base_config(base_config, extra_config): + """Returns the given config dict merged with a base agent conf.""" + + config = copy.deepcopy(base_config) config.update(extra_config) return config @@ -491,8 +497,8 @@ class Agent(Trainable): @classmethod def resource_help(cls, config): return ("\n\nYou can adjust the resource requests of RLlib agents by " - "setting `num_workers` and other configs. See the " - "DEFAULT_CONFIG defined by each agent for more info.\n\n" + "setting `num_workers`, `num_gpus`, and other configs. See " + "the DEFAULT_CONFIG defined by each agent for more info.\n\n" "The config of this agent is: {}".format(config)) @staticmethod diff --git a/python/ray/rllib/agents/impala/impala.py b/python/ray/rllib/agents/impala/impala.py index bab04f482..9221e3764 100644 --- a/python/ray/rllib/agents/impala/impala.py +++ b/python/ray/rllib/agents/impala/impala.py @@ -100,10 +100,7 @@ class ImpalaAgent(Agent): for k in OPTIMIZER_SHARED_CONFIGS: if k not in self.config["optimizer"]: self.config["optimizer"][k] = self.config[k] - if self.config["vtrace"]: - policy_cls = self._policy_graph - else: - policy_cls = A3CPolicyGraph + policy_cls = self._get_policy_graph() self.local_evaluator = self.make_local_evaluator( self.env_creator, policy_cls) self.remote_evaluators = self.make_remote_evaluators( @@ -124,3 +121,10 @@ class ImpalaAgent(Agent): result.update(timesteps_this_iter=self.optimizer.num_steps_sampled - prev_steps) return result + + def _get_policy_graph(self): + if self.config["vtrace"]: + policy_cls = self._policy_graph + else: + policy_cls = A3CPolicyGraph + return policy_cls diff --git a/python/ray/rllib/agents/impala/vtrace_policy_graph.py b/python/ray/rllib/agents/impala/vtrace_policy_graph.py index d18dade5c..127c3f9c5 100644 --- a/python/ray/rllib/agents/impala/vtrace_policy_graph.py +++ b/python/ray/rllib/agents/impala/vtrace_policy_graph.py @@ -1,6 +1,6 @@ """Adapted from A3CPolicyGraph to add V-trace. -Keep in sync with changes to A3CPolicyGraph.""" +Keep in sync with changes to A3CPolicyGraph and VtraceSurrogatePolicyGraph.""" from __future__ import absolute_import from __future__ import division diff --git a/python/ray/rllib/agents/ppo/__init__.py b/python/ray/rllib/agents/ppo/__init__.py index e4d0c7cf0..8e25f9b8f 100644 --- a/python/ray/rllib/agents/ppo/__init__.py +++ b/python/ray/rllib/agents/ppo/__init__.py @@ -1,3 +1,4 @@ from ray.rllib.agents.ppo.ppo import (PPOAgent, DEFAULT_CONFIG) +from ray.rllib.agents.ppo.appo import APPOAgent -__all__ = ["PPOAgent", "DEFAULT_CONFIG"] +__all__ = ["APPOAgent", "PPOAgent", "DEFAULT_CONFIG"] diff --git a/python/ray/rllib/agents/ppo/appo.py b/python/ray/rllib/agents/ppo/appo.py new file mode 100644 index 000000000..5f7534ee7 --- /dev/null +++ b/python/ray/rllib/agents/ppo/appo.py @@ -0,0 +1,65 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from ray.rllib.agents.ppo.appo_policy_graph import AsyncPPOPolicyGraph +from ray.rllib.agents.agent import with_base_config +from ray.rllib.agents import impala +from ray.rllib.utils.annotations import override + +# yapf: disable +# __sphinx_doc_begin__ +DEFAULT_CONFIG = with_base_config(impala.DEFAULT_CONFIG, { + # Whether to use V-trace weighted advantages. If false, PPO GAE advantages + # will be used instead. + "vtrace": False, + + # == These two options only apply if vtrace: False == + # If true, use the Generalized Advantage Estimator (GAE) + # with a value function, see https://arxiv.org/pdf/1506.02438.pdf. + "use_gae": True, + # GAE(lambda) parameter + "lambda": 1.0, + + # == PPO surrogate loss options == + "clip_param": 0.4, + "kl_coeff": 0.2, + "kl_target": 0.01, + + # == IMPALA optimizer params (see documentation in impala.py) == + "sample_batch_size": 50, + "train_batch_size": 500, + "min_iter_time_s": 10, + "num_workers": 2, + "num_gpus": 1, + "num_data_loader_buffers": 1, + "minibatch_buffer_size": 1, + "num_sgd_iter": 1, + "replay_proportion": 0.0, + "replay_buffer_num_slots": 100, + "max_sample_requests_in_flight_per_worker": 2, + "broadcast_interval": 1, + "grad_clip": 40.0, + "opt_type": "adam", + "lr": 0.0005, + "lr_schedule": None, + "decay": 0.99, + "momentum": 0.0, + "epsilon": 0.1, + "vf_loss_coeff": 0.5, + "entropy_coeff": -0.01, +}) +# __sphinx_doc_end__ +# yapf: enable + + +class APPOAgent(impala.ImpalaAgent): + """PPO surrogate loss with IMPALA-architecture.""" + + _agent_name = "APPO" + _default_config = DEFAULT_CONFIG + _policy_graph = AsyncPPOPolicyGraph + + @override(impala.ImpalaAgent) + def _get_policy_graph(self): + return AsyncPPOPolicyGraph diff --git a/python/ray/rllib/agents/ppo/appo_policy_graph.py b/python/ray/rllib/agents/ppo/appo_policy_graph.py new file mode 100644 index 000000000..f5533f137 --- /dev/null +++ b/python/ray/rllib/agents/ppo/appo_policy_graph.py @@ -0,0 +1,423 @@ +"""Adapted from VTracePolicyGraph to use the PPO surrogate loss. + +Keep in sync with changes to VTracePolicyGraph.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf +import logging +import gym + +import ray +from ray.rllib.agents.impala import vtrace +from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph, \ + LearningRateSchedule +from ray.rllib.models.catalog import ModelCatalog +from ray.rllib.utils.error import UnsupportedSpaceException +from ray.rllib.utils.explained_variance import explained_variance +from ray.rllib.models.action_dist import Categorical +from ray.rllib.evaluation.postprocessing import compute_advantages + +logger = logging.getLogger(__name__) + + +class PPOSurrogateLoss(object): + """Loss used when V-trace is disabled. + + Arguments: + prev_actions_logp: A float32 tensor of shape [T, B]. + actions_logp: A float32 tensor of shape [T, B]. + actions_kl: A float32 tensor of shape [T, B]. + actions_entropy: A float32 tensor of shape [T, B]. + values: A float32 tensor of shape [T, B]. + valid_mask: A bool tensor of valid RNN input elements (#2992). + advantages: A float32 tensor of shape [T, B]. + value_targets: A float32 tensor of shape [T, B]. + """ + + def __init__(self, + prev_actions_logp, + actions_logp, + action_kl, + actions_entropy, + values, + valid_mask, + advantages, + value_targets, + vf_loss_coeff=0.5, + entropy_coeff=-0.01, + clip_param=0.3): + + logp_ratio = tf.exp(actions_logp - prev_actions_logp) + + surrogate_loss = tf.minimum( + advantages * logp_ratio, + advantages * tf.clip_by_value(logp_ratio, 1 - clip_param, + 1 + clip_param)) + + self.mean_kl = tf.reduce_mean(action_kl) + self.pi_loss = -tf.reduce_sum(surrogate_loss) + + # The baseline loss + delta = tf.boolean_mask(values - value_targets, valid_mask) + self.value_targets = value_targets + self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta)) + + # The entropy loss + self.entropy = tf.reduce_sum( + tf.boolean_mask(actions_entropy, valid_mask)) + + # The summed weighted loss + self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff + + self.entropy * entropy_coeff) + + +class VTraceSurrogateLoss(object): + def __init__(self, + actions, + prev_actions_logp, + actions_logp, + action_kl, + actions_entropy, + dones, + behaviour_logits, + target_logits, + discount, + rewards, + values, + bootstrap_value, + valid_mask, + vf_loss_coeff=0.5, + entropy_coeff=-0.01, + clip_rho_threshold=1.0, + clip_pg_rho_threshold=1.0, + clip_param=0.3): + """PPO surrogate loss with vtrace importance weighting. + + VTraceLoss takes tensors of shape [T, B, ...], where `B` is the + batch_size. The reason we need to know `B` is for V-trace to properly + handle episode cut boundaries. + + Arguments: + actions: An int32 tensor of shape [T, B, NUM_ACTIONS]. + prev_actions_logp: A float32 tensor of shape [T, B]. + actions_logp: A float32 tensor of shape [T, B]. + actions_kl: A float32 tensor of shape [T, B]. + actions_entropy: A float32 tensor of shape [T, B]. + dones: A bool tensor of shape [T, B]. + behaviour_logits: A float32 tensor of shape [T, B, NUM_ACTIONS]. + target_logits: A float32 tensor of shape [T, B, NUM_ACTIONS]. + discount: A float32 scalar. + rewards: A float32 tensor of shape [T, B]. + values: A float32 tensor of shape [T, B]. + bootstrap_value: A float32 tensor of shape [B]. + valid_mask: A bool tensor of valid RNN input elements (#2992). + """ + + # Compute vtrace on the CPU for better perf. + with tf.device("/cpu:0"): + self.vtrace_returns = vtrace.from_logits( + behaviour_policy_logits=behaviour_logits, + target_policy_logits=target_logits, + actions=tf.cast(actions, tf.int32), + discounts=tf.to_float(~dones) * discount, + rewards=rewards, + values=values, + bootstrap_value=bootstrap_value, + clip_rho_threshold=tf.cast(clip_rho_threshold, tf.float32), + clip_pg_rho_threshold=tf.cast(clip_pg_rho_threshold, + tf.float32)) + + logp_ratio = tf.exp(actions_logp - prev_actions_logp) + + advantages = self.vtrace_returns.pg_advantages + surrogate_loss = tf.minimum( + advantages * logp_ratio, + advantages * tf.clip_by_value(logp_ratio, 1 - clip_param, + 1 + clip_param)) + + self.mean_kl = tf.reduce_mean(action_kl) + self.pi_loss = -tf.reduce_sum(surrogate_loss) + + # The baseline loss + delta = tf.boolean_mask(values - self.vtrace_returns.vs, valid_mask) + self.value_targets = self.vtrace_returns.vs + self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta)) + + # The entropy loss + self.entropy = tf.reduce_sum( + tf.boolean_mask(actions_entropy, valid_mask)) + + # The summed weighted loss + self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff + + self.entropy * entropy_coeff) + + +class AsyncPPOPolicyGraph(LearningRateSchedule, TFPolicyGraph): + def __init__(self, + observation_space, + action_space, + config, + existing_inputs=None): + config = dict(ray.rllib.agents.impala.impala.DEFAULT_CONFIG, **config) + assert config["batch_mode"] == "truncate_episodes", \ + "Must use `truncate_episodes` batch mode with V-trace." + self.config = config + self.sess = tf.get_default_session() + + # Policy network model + dist_class, logit_dim = ModelCatalog.get_action_dist( + action_space, self.config["model"]) + + # Create input placeholders + if existing_inputs: + if self.config["vtrace"]: + actions, dones, behaviour_logits, rewards, observations, \ + prev_actions, prev_rewards = existing_inputs[:7] + existing_state_in = existing_inputs[7:-1] + existing_seq_lens = existing_inputs[-1] + else: + actions, dones, behaviour_logits, rewards, observations, \ + prev_actions, prev_rewards, adv_ph, value_targets = \ + existing_inputs[:9] + existing_state_in = existing_inputs[9:-1] + existing_seq_lens = existing_inputs[-1] + else: + actions = ModelCatalog.get_action_placeholder(action_space) + if (not isinstance(action_space, gym.spaces.Discrete) + and self.config["vtrace"]): + raise UnsupportedSpaceException( + "Action space {} is not supported with vtrace.".format( + action_space)) + dones = tf.placeholder(tf.bool, [None], name="dones") + rewards = tf.placeholder(tf.float32, [None], name="rewards") + behaviour_logits = tf.placeholder( + tf.float32, [None, logit_dim], name="behaviour_logits") + observations = tf.placeholder( + tf.float32, [None] + list(observation_space.shape)) + existing_state_in = None + existing_seq_lens = None + if not self.config["vtrace"]: + adv_ph = tf.placeholder( + tf.float32, name="advantages", shape=(None, )) + value_targets = tf.placeholder( + tf.float32, name="value_targets", shape=(None, )) + self.observations = observations + + # Setup the policy + prev_actions = ModelCatalog.get_action_placeholder(action_space) + prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward") + self.model = ModelCatalog.get_model( + { + "obs": observations, + "prev_actions": prev_actions, + "prev_rewards": prev_rewards, + }, + observation_space, + logit_dim, + self.config["model"], + state_in=existing_state_in, + seq_lens=existing_seq_lens) + + action_dist = dist_class(self.model.outputs) + prev_action_dist = dist_class(behaviour_logits) + + values = self.model.value_function() + self.value_function = values + self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, + tf.get_variable_scope().name) + + def to_batches(tensor): + if self.config["model"]["use_lstm"]: + B = tf.shape(self.model.seq_lens)[0] + T = tf.shape(tensor)[0] // B + else: + # Important: chop the tensor into batches at known episode cut + # boundaries. TODO(ekl) this is kind of a hack + T = self.config["sample_batch_size"] + B = tf.shape(tensor)[0] // T + rs = tf.reshape(tensor, + tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0)) + # swap B and T axes + return tf.transpose( + rs, + [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0])))) + + if self.model.state_in: + max_seq_len = tf.reduce_max(self.model.seq_lens) - 1 + mask = tf.sequence_mask(self.model.seq_lens, max_seq_len) + mask = tf.reshape(mask, [-1]) + else: + mask = tf.ones_like(rewards) + + # Inputs are reshaped from [B * T] => [T - 1, B] for V-trace calc. + if self.config["vtrace"]: + logger.info("Using V-Trace surrogate loss (vtrace=True)") + self.loss = VTraceSurrogateLoss( + actions=to_batches(actions)[:-1], + prev_actions_logp=to_batches( + prev_action_dist.logp(actions))[:-1], + actions_logp=to_batches(action_dist.logp(actions))[:-1], + action_kl=prev_action_dist.kl(action_dist), + actions_entropy=to_batches(action_dist.entropy())[:-1], + dones=to_batches(dones)[:-1], + behaviour_logits=to_batches(behaviour_logits)[:-1], + target_logits=to_batches(self.model.outputs)[:-1], + discount=config["gamma"], + rewards=to_batches(rewards)[:-1], + values=to_batches(values)[:-1], + bootstrap_value=to_batches(values)[-1], + valid_mask=to_batches(mask)[:-1], + vf_loss_coeff=self.config["vf_loss_coeff"], + entropy_coeff=self.config["entropy_coeff"], + clip_rho_threshold=self.config["vtrace_clip_rho_threshold"], + clip_pg_rho_threshold=self.config[ + "vtrace_clip_pg_rho_threshold"], + clip_param=self.config["clip_param"]) + else: + logger.info("Using PPO surrogate loss (vtrace=False)") + self.loss = PPOSurrogateLoss( + prev_actions_logp=to_batches(prev_action_dist.logp(actions)), + actions_logp=to_batches(action_dist.logp(actions)), + action_kl=prev_action_dist.kl(action_dist), + actions_entropy=to_batches(action_dist.entropy()), + values=to_batches(values), + valid_mask=to_batches(mask), + advantages=to_batches(adv_ph), + value_targets=to_batches(value_targets), + vf_loss_coeff=self.config["vf_loss_coeff"], + entropy_coeff=self.config["entropy_coeff"], + clip_param=self.config["clip_param"]) + + # KL divergence between worker and learner logits for debugging + model_dist = Categorical(self.model.outputs) + behaviour_dist = Categorical(behaviour_logits) + self.KLs = model_dist.kl(behaviour_dist) + self.mean_KL = tf.reduce_mean(self.KLs) + self.max_KL = tf.reduce_max(self.KLs) + self.median_KL = tf.contrib.distributions.percentile(self.KLs, 50.0) + # Initialize TFPolicyGraph + loss_in = [ + ("actions", actions), + ("dones", dones), + ("behaviour_logits", behaviour_logits), + ("rewards", rewards), + ("obs", observations), + ("prev_actions", prev_actions), + ("prev_rewards", prev_rewards), + ] + if not self.config["vtrace"]: + loss_in.append(("advantages", adv_ph)) + loss_in.append(("value_targets", value_targets)) + LearningRateSchedule.__init__(self, self.config["lr"], + self.config["lr_schedule"]) + TFPolicyGraph.__init__( + self, + observation_space, + action_space, + self.sess, + obs_input=observations, + action_sampler=action_dist.sample(), + loss=self.model.loss() + self.loss.total_loss, + loss_inputs=loss_in, + state_inputs=self.model.state_in, + state_outputs=self.model.state_out, + prev_action_input=prev_actions, + prev_reward_input=prev_rewards, + seq_lens=self.model.seq_lens, + max_seq_len=self.config["model"]["max_seq_len"], + batch_divisibility_req=self.config["sample_batch_size"]) + + self.sess.run(tf.global_variables_initializer()) + + if self.config["vtrace"]: + values_batched = to_batches(values)[:-1] + else: + values_batched = to_batches(values) + self.stats_fetches = { + "stats": { + "model_loss": self.model.loss(), + "cur_lr": tf.cast(self.cur_lr, tf.float64), + "policy_loss": self.loss.pi_loss, + "entropy": self.loss.entropy, + "grad_gnorm": tf.global_norm(self._grads), + "var_gnorm": tf.global_norm(self.var_list), + "vf_loss": self.loss.vf_loss, + "vf_explained_var": explained_variance( + tf.reshape(self.loss.value_targets, [-1]), + tf.reshape(values_batched, [-1])), + "mean_KL": self.mean_KL, + "max_KL": self.max_KL, + "median_KL": self.median_KL, + }, + } + self.stats_fetches["kl"] = self.loss.mean_kl + + def optimizer(self): + if self.config["opt_type"] == "adam": + return tf.train.AdamOptimizer(self.cur_lr) + else: + return tf.train.RMSPropOptimizer(self.cur_lr, self.config["decay"], + self.config["momentum"], + self.config["epsilon"]) + + def gradients(self, optimizer): + grads = tf.gradients(self.loss.total_loss, self.var_list) + self.grads, _ = tf.clip_by_global_norm(grads, self.config["grad_clip"]) + clipped_grads = list(zip(self.grads, self.var_list)) + return clipped_grads + + def extra_compute_action_fetches(self): + out = {"behaviour_logits": self.model.outputs} + if not self.config["vtrace"]: + out["vf_preds"] = self.value_function + return out + + def extra_compute_grad_fetches(self): + return self.stats_fetches + + def value(self, ob, *args): + feed_dict = {self.observations: [ob], self.model.seq_lens: [1]} + assert len(args) == len(self.model.state_in), \ + (args, self.model.state_in) + for k, v in zip(self.model.state_in, args): + feed_dict[k] = v + vf = self.sess.run(self.value_function, feed_dict) + return vf[0] + + def postprocess_trajectory(self, + sample_batch, + other_agent_batches=None, + episode=None): + if not self.config["vtrace"]: + completed = sample_batch["dones"][-1] + if completed: + last_r = 0.0 + else: + next_state = [] + for i in range(len(self.model.state_in)): + next_state.append( + [sample_batch["state_out_{}".format(i)][-1]]) + last_r = self.value(sample_batch["new_obs"][-1], *next_state) + batch = compute_advantages( + sample_batch, + last_r, + self.config["gamma"], + self.config["lambda"], + use_gae=self.config["use_gae"]) + else: + batch = sample_batch + del batch.data["new_obs"] # not used, so save some bandwidth + return batch + + def get_initial_state(self): + return self.model.state_init + + def copy(self, existing_inputs): + return AsyncPPOPolicyGraph( + self.observation_space, + self.action_space, + self.config, + existing_inputs=existing_inputs) diff --git a/python/ray/rllib/agents/registry.py b/python/ray/rllib/agents/registry.py index 776e3a526..1ef7cb124 100644 --- a/python/ray/rllib/agents/registry.py +++ b/python/ray/rllib/agents/registry.py @@ -9,6 +9,11 @@ import traceback from ray.rllib.contrib.registry import CONTRIBUTED_ALGORITHMS +def _import_appo(): + from ray.rllib.agents import ppo + return ppo.APPOAgent + + def _import_qmix(): from ray.rllib.agents import qmix return qmix.QMixAgent @@ -93,6 +98,7 @@ ALGORITHMS = { "IMPALA": _import_impala, "QMIX": _import_qmix, "APEX_QMIX": _import_apex_qmix, + "APPO": _import_appo, "MARWIL": _import_marwil, } diff --git a/python/ray/rllib/test/test_supported_spaces.py b/python/ray/rllib/test/test_supported_spaces.py index 03cab83f2..fd2c7d1eb 100644 --- a/python/ray/rllib/test/test_supported_spaces.py +++ b/python/ray/rllib/test/test_supported_spaces.py @@ -112,6 +112,7 @@ class ModelSupportedSpaces(unittest.TestCase): def testAll(self): stats = {} check_support("IMPALA", {"num_gpus": 0}, stats) + check_support("APPO", {"num_gpus": 0, "vtrace": False}, stats) check_support( "DDPG", { "noise_scale": 100.0, diff --git a/python/ray/rllib/tuned_examples/pong-appo.yaml b/python/ray/rllib/tuned_examples/pong-appo.yaml new file mode 100644 index 000000000..94e48e44d --- /dev/null +++ b/python/ray/rllib/tuned_examples/pong-appo.yaml @@ -0,0 +1,22 @@ +pong-appo: + env: PongNoFrameskip-v4 + run: APPO + stop: + episode_reward_mean: 18.0 + timesteps_total: 5000000 + config: + sample_batch_size: 50 + train_batch_size: 750 + num_workers: 47 + broadcast_interval: 1 + max_sample_requests_in_flight_per_worker: 1 + num_data_loader_buffers: 1 + num_envs_per_worker: 5 + minibatch_buffer_size: 4 + num_sgd_iter: 2 + vf_loss_coeff: 1.0 + clip_param: 0.3 + num_gpus: 1 + grad_clip: 10 + model: + dim: 42 diff --git a/python/ray/rllib/tuned_examples/regression_tests/cartpole-appo-vtrace.yaml b/python/ray/rllib/tuned_examples/regression_tests/cartpole-appo-vtrace.yaml new file mode 100644 index 000000000..be472db66 --- /dev/null +++ b/python/ray/rllib/tuned_examples/regression_tests/cartpole-appo-vtrace.yaml @@ -0,0 +1,13 @@ +cartpole-appo-vt: + env: CartPole-v0 + run: APPO + stop: + episode_reward_mean: 100 + timesteps_total: 100000 + config: + sample_batch_size: 10 + train_batch_size: 10 + num_envs_per_worker: 5 + num_workers: 1 + num_gpus: 0 + vtrace: true diff --git a/python/ray/rllib/tuned_examples/regression_tests/cartpole-appo.yaml b/python/ray/rllib/tuned_examples/regression_tests/cartpole-appo.yaml new file mode 100644 index 000000000..b817bb6c3 --- /dev/null +++ b/python/ray/rllib/tuned_examples/regression_tests/cartpole-appo.yaml @@ -0,0 +1,13 @@ +cartpole-appo: + env: CartPole-v0 + run: APPO + stop: + episode_reward_mean: 100 + timesteps_total: 100000 + config: + sample_batch_size: 10 + train_batch_size: 10 + num_envs_per_worker: 5 + num_workers: 1 + num_gpus: 0 + vtrace: false