From b520f6141ecdd54496b0c26106f3df4442a5f91e Mon Sep 17 00:00:00 2001 From: gehring Date: Fri, 23 Aug 2019 02:21:11 -0400 Subject: [PATCH] [rllib] Adds eager support with a generic `TFEagerPolicy` class (#5436) --- ci/jenkins_tests/run_rllib_tests.sh | 6 +- doc/source/rllib-components.svg | 2 +- doc/source/rllib-concepts.rst | 103 ++--- doc/source/rllib-env.rst | 66 +-- doc/source/rllib-training.rst | 8 +- doc/source/rllib.rst | 31 +- python/ray/tune/util.py | 2 +- rllib/agents/a3c/a3c_tf_policy.py | 63 +-- rllib/agents/a3c/a3c_torch_policy.py | 18 +- rllib/agents/ddpg/ddpg_model.py | 34 +- rllib/agents/ddpg/ddpg_policy.py | 167 ++++---- rllib/agents/ddpg/noop_model.py | 11 +- rllib/agents/dqn/distributional_q_model.py | 46 ++- rllib/agents/dqn/dqn_policy.py | 92 +++-- rllib/agents/dqn/simple_q_model.py | 9 - rllib/agents/dqn/simple_q_policy.py | 69 ++-- rllib/agents/impala/vtrace_policy.py | 85 ++-- rllib/agents/pg/pg_policy.py | 12 +- rllib/agents/pg/torch_pg_policy.py | 14 +- rllib/agents/ppo/appo_policy.py | 100 ++--- rllib/agents/ppo/ppo.py | 9 +- rllib/agents/ppo/ppo_policy.py | 95 +++-- rllib/agents/sac/sac.py | 2 +- rllib/agents/sac/sac_model.py | 6 - rllib/agents/sac/sac_policy.py | 129 ++---- rllib/agents/trainer.py | 12 +- rllib/agents/trainer_template.py | 3 + rllib/evaluation/rollout_worker.py | 18 +- rllib/examples/centralized_critic.py | 40 +- rllib/examples/custom_tf_policy.py | 10 +- rllib/examples/custom_torch_policy.py | 12 +- rllib/examples/eager_execution.py | 11 +- .../rock_paper_scissors_multiagent.py | 13 +- rllib/models/catalog.py | 78 +++- rllib/models/modelv2.py | 29 ++ rllib/models/tf/fcnet_v2.py | 6 +- rllib/models/tf/modelv1_compat.py | 74 ++-- rllib/models/tf/visionnet_v1.py | 2 +- rllib/models/tf/visionnet_v2.py | 110 +++++ rllib/optimizers/multi_gpu_optimizer.py | 4 +- rllib/optimizers/sync_samples_optimizer.py | 41 +- rllib/policy/dynamic_tf_policy.py | 95 +++-- rllib/policy/eager_tf_policy.py | 375 ++++++++++++++++++ rllib/policy/policy.py | 11 +- rllib/policy/sample_batch.py | 3 +- rllib/policy/tf_policy_template.py | 38 +- rllib/policy/torch_policy.py | 26 +- rllib/policy/torch_policy_template.py | 10 +- rllib/tests/test_eager_support.py | 92 +++++ rllib/tests/test_supported_spaces.py | 1 + rllib/train.py | 6 + rllib/utils/tf_ops.py | 62 +++ 52 files changed, 1557 insertions(+), 804 deletions(-) create mode 100644 rllib/models/tf/visionnet_v2.py create mode 100644 rllib/policy/eager_tf_policy.py create mode 100644 rllib/tests/test_eager_support.py diff --git a/ci/jenkins_tests/run_rllib_tests.sh b/ci/jenkins_tests/run_rllib_tests.sh index 9b83b2c9c..107040351 100755 --- a/ci/jenkins_tests/run_rllib_tests.sh +++ b/ci/jenkins_tests/run_rllib_tests.sh @@ -1,3 +1,6 @@ +docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ + /ray/ci/suppress_output python /ray/rllib/tests/test_eager_support.py + docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/ci/suppress_output /ray/rllib/train.py \ --env PongDeterministic-v0 \ @@ -386,9 +389,6 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/ci/suppress_output python /ray/rllib/examples/multiagent_cartpole.py --num-iters=2 -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ - /ray/ci/suppress_output python /ray/rllib/examples/multiagent_cartpole.py --num-iters=2 --simple - docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/ci/suppress_output python /ray/rllib/examples/multiagent_two_trainers.py --num-iters=2 diff --git a/doc/source/rllib-components.svg b/doc/source/rllib-components.svg index b9f7bbb11..0d51bc599 100644 --- a/doc/source/rllib-components.svg +++ b/doc/source/rllib-components.svg @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/doc/source/rllib-concepts.rst b/doc/source/rllib-concepts.rst index b58ceba8a..7dbac3c8a 100644 --- a/doc/source/rllib-concepts.rst +++ b/doc/source/rllib-concepts.rst @@ -120,12 +120,14 @@ To start, you first have to define a loss function. In RLlib, loss functions are import tensorflow as tf from ray.rllib.policy.sample_batch import SampleBatch - def policy_gradient_loss(policy, batch_tensors): - actions = batch_tensors[SampleBatch.ACTIONS] - rewards = batch_tensors[SampleBatch.REWARDS] - return -tf.reduce_mean(policy.action_dist.logp(actions) * rewards) + def policy_gradient_loss(policy, model, dist_class, train_batch): + actions = train_batch[SampleBatch.ACTIONS] + rewards = train_batch[SampleBatch.REWARDS] + logits, _ = model.from_batch(train_batch) + action_dist = dist_class(logits, model) + return -tf.reduce_mean(action_dist.logp(actions) * rewards) -In the above snippet, ``actions`` is a Tensor placeholder of shape ``[batch_size, action_dim...]``, and ``rewards`` is a placeholder of shape ``[batch_size]``. The ``policy.action_dist`` object is an `ActionDistribution `__ that represents the output of the neural network policy model. Passing this loss function to ``build_tf_policy`` is enough to produce a very basic TF policy: +In the above snippet, ``actions`` is a Tensor placeholder of shape ``[batch_size, action_dim...]``, and ``rewards`` is a placeholder of shape ``[batch_size]``. The ``action_dist`` object is an `ActionDistribution `__ that is parameterized by the output of the neural network policy model. Passing this loss function to ``build_tf_policy`` is enough to produce a very basic TF policy: .. code-block:: python @@ -181,10 +183,12 @@ Let's modify our policy loss to include rewards summed over time. To enable this return compute_advantages( sample_batch, 0.0, policy.config["gamma"], use_gae=False) - def policy_gradient_loss(policy, batch_tensors): - actions = batch_tensors[SampleBatch.ACTIONS] - advantages = batch_tensors[Postprocessing.ADVANTAGES] - return -tf.reduce_mean(policy.action_dist.logp(actions) * advantages) + def policy_gradient_loss(policy, model, dist_class, train_batch): + logits, _ = model.from_batch(train_batch) + action_dist = dist_class(logits, model) + return -tf.reduce_mean( + action_dist.logp(train_batch[SampleBatch.ACTIONS]) * + train_batch[Postprocessing.ADVANTAGES]) MyTFPolicy = build_tf_policy( name="MyTFPolicy", @@ -193,7 +197,7 @@ Let's modify our policy loss to include rewards summed over time. To enable this The ``postprocess_advantages()`` function above uses calls RLlib's ``compute_advantages`` function to compute advantages for each timestep. If you re-run the trainer with this improved policy, you'll find that it quickly achieves the max reward of 200. -You might be wondering how RLlib makes the advantages placeholder automatically available as ``batch_tensors[Postprocessing.ADVANTAGES]``. When building your policy, RLlib will create a "dummy" trajectory batch where all observations, actions, rewards, etc. are zeros. It then calls your ``postprocess_fn``, and generates TF placeholders based on the numpy shapes of the postprocessed batch. RLlib tracks which placeholders that ``loss_fn`` and ``stats_fn`` access, and then feeds the corresponding sample data into those placeholders during loss optimization. You can also access these placeholders via ``policy.get_placeholder()`` after loss initialization. +You might be wondering how RLlib makes the advantages placeholder automatically available as ``train_batch[Postprocessing.ADVANTAGES]``. When building your policy, RLlib will create a "dummy" trajectory batch where all observations, actions, rewards, etc. are zeros. It then calls your ``postprocess_fn``, and generates TF placeholders based on the numpy shapes of the postprocessed batch. RLlib tracks which placeholders that ``loss_fn`` and ``stats_fn`` access, and then feeds the corresponding sample data into those placeholders during loss optimization. You can also access these placeholders via ``policy.get_placeholder()`` after loss initialization. **Example 1: Proximal Policy Optimization** @@ -290,9 +294,9 @@ The ``update_kl`` method on the policy is defined in `PPOTFPolicy `__. However, note that eager and non-eager tensors cannot be mixed within the ``py_function``. Here's an example of embedding eager execution within a policy loss function: +Policies built with ``build_tf_policy`` (most of the reference algorithms are) can be run in eager mode by setting the ``"eager": True`` config option or using ``rllib train --eager``. This will tell RLlib to execute the model forward pass, action distribution, loss, and stats functions in eager mode. -.. code-block:: python +Eager mode makes debugging much easier, since you can now use normal Python functions such as ``print()`` to inspect intermediate tensor values. However, it is slower than graph mode. - def eager_loss(policy, batch_tensors): - """Example of using embedded eager execution in a custom loss. - - Here `compute_penalty` prints the actions and rewards for debugging, and - also computes a (dummy) penalty term to add to the loss. - """ - - def compute_penalty(actions, rewards): - penalty = tf.reduce_mean(tf.cast(actions, tf.float32)) - if random.random() > 0.9: - print("The eagerly computed penalty is", penalty, actions, rewards) - return penalty - - actions = batch_tensors[SampleBatch.ACTIONS] - rewards = batch_tensors[SampleBatch.REWARDS] - penalty = tf.py_function( - compute_penalty, [actions, rewards], Tout=tf.float32) - - return penalty - tf.reduce_mean(policy.action_dist.logp(actions) * rewards) - -You can find a runnable file for the above eager execution example `here `__. +You can also selectively leverage eager operations within graph mode execution with `tf.py_function `__. Here's an example of using eager ops embedded `within a loss function `__. Building Policies in PyTorch ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -452,13 +434,11 @@ Defining a policy in PyTorch is quite similar to that for TensorFlow (and the pr from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.policy.torch_policy_template import build_torch_policy - def policy_gradient_loss(policy, batch_tensors): - logits, _, values, _ = policy.model({ - SampleBatch.CUR_OBS: batch_tensors[SampleBatch.CUR_OBS] - }, []) - action_dist = policy.dist_class(logits) - log_probs = action_dist.logp(batch_tensors[SampleBatch.ACTIONS]) - return -batch_tensors[SampleBatch.REWARDS].dot(log_probs) + def policy_gradient_loss(policy, model, dist_class, train_batch): + logits, _ = model.from_batch(train_batch) + action_dist = dist_class(logits) + log_probs = action_dist.logp(train_batch[SampleBatch.ACTIONS]) + return -train_batch[SampleBatch.REWARDS].dot(log_probs) # MyTorchPolicy = build_torch_policy( @@ -480,17 +460,16 @@ Now, building on the TF examples above, let's look at how the `A3C torch policy optimizer_fn=torch_optimizer, mixins=[ValueNetworkMixin]) -``loss_fn``: Similar to the TF example, the actor critic loss is defined over ``batch_tensors``. We imperatively execute the forward pass by calling ``policy.model()`` on the observations followed by ``policy.dist_class()`` on the output logits. The output Tensors are saved as attributes of the policy object (e.g., ``policy.entropy = dist.entropy.mean()``), and we return the scalar loss: +``loss_fn``: Similar to the TF example, the actor critic loss is defined over ``batch``. We imperatively execute the forward pass by calling ``model()`` on the observations followed by ``dist_class()`` on the output logits. The output Tensors are saved as attributes of the policy object (e.g., ``policy.entropy = dist.entropy.mean()``), and we return the scalar loss: .. code-block:: python - def actor_critic_loss(policy, batch_tensors): - logits, _, values, _ = policy.model({ - SampleBatch.CUR_OBS: batch_tensors[SampleBatch.CUR_OBS] - }, []) - dist = policy.dist_class(logits) - log_probs = dist.logp(batch_tensors[SampleBatch.ACTIONS]) - policy.entropy = dist.entropy().mean() + def actor_critic_loss(policy, model, dist_class, train_batch): + logits, _ = model.from_batch(train_batch) + values = model.value_function() + action_dist = dist_class(logits) + log_probs = action_dist.logp(train_batch[SampleBatch.ACTIONS]) + policy.entropy = action_dist.entropy().mean() ... return overall_err @@ -498,19 +477,19 @@ Now, building on the TF examples above, let's look at how the `A3C torch policy .. code-block:: python - def loss_and_entropy_stats(policy, batch_tensors): + def loss_and_entropy_stats(policy, train_batch): return { "policy_entropy": policy.entropy.item(), "policy_loss": policy.pi_err.item(), "vf_loss": policy.value_err.item(), } -``extra_action_out_fn``: We save value function predictions given model outputs. This makes the value function predictions of the model available in the trajectory as ``batch_tensors[SampleBatch.VF_PREDS]``: +``extra_action_out_fn``: We save value function predictions given model outputs. This makes the value function predictions of the model available in the trajectory as ``batch[SampleBatch.VF_PREDS]``: .. code-block:: python - def model_value_predictions(policy, input_dict, state_batches, model_out): - return {SampleBatch.VF_PREDS: model_out[2].cpu().numpy()} + def model_value_predictions(policy, input_dict, state_batches, model): + return {SampleBatch.VF_PREDS: model.value_function().cpu().numpy()} ``postprocess_fn`` and ``mixins``: Similar to the PPO example, we need access to the value function during postprocessing (i.e., ``add_advantages`` below calls ``policy._value()``. The value function is exposed through a mixin class that defines the method: @@ -537,7 +516,7 @@ Now, building on the TF examples above, let's look at how the `A3C torch policy You can find the full policy definition in `a3c_torch_policy.py `__. -In summary, the main differences between the PyTorch and TensorFlow policy builder functions is that the TF loss and stats functions are built symbolically when the policy is initialized, whereas for PyTorch these functions are called imperatively each time they are used. +In summary, the main differences between the PyTorch and TensorFlow policy builder functions is that the TF loss and stats functions are built symbolically when the policy is initialized, whereas for PyTorch (or TensorFlow Eager) these functions are called imperatively each time they are used. Extending Existing Policies ~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/rllib-env.rst b/doc/source/rllib-env.rst index e310e9e1b..d5f5f6eba 100644 --- a/doc/source/rllib-env.rst +++ b/doc/source/rllib-env.rst @@ -231,39 +231,6 @@ The `rock_paper_scissors_multiagent.py top_level ---> - mid_level_0 -------------------------------> mid_level_0 ----------------> mid_level_1 -> - low_level_0 -> low_level_0 -> low_level_0 -> low_level_1 -> low_level_1 -> low_level_2 -> - -This can be implemented as a multi-agent environment with three types of agents. Each higher-level action creates a new lower-level agent instance with a new id (e.g., ``low_level_0``, ``low_level_1``, ``low_level_2`` in the above example). These lower-level agents pop in existence at the start of higher-level steps, and terminate when their higher-level action ends. Their experiences are aggregated by policy, so from RLlib's perspective it's just optimizing three different types of policies. The configuration might look something like this: - -.. code-block:: python - - "multiagent": { - "policies": { - "top_level": (custom_policy or None, ...), - "mid_level": (custom_policy or None, ...), - "low_level": (custom_policy or None, ...), - }, - "policy_mapping_fn": - lambda agent_id: - "low_level" if agent_id.startswith("low_level_") else - "mid_level" if agent_id.startswith("mid_level_") else "top_level" - "policies_to_train": ["top_level"], - }, - - -In this setup, the appropriate rewards for training lower-level agents must be provided by the multi-agent env implementation. The environment class is also responsible for routing between the agents, e.g., conveying `goals `__ from higher-level agents to lower-level agents as part of the lower-level agent observation. - -See this file for a runnable example: `hierarchical_training.py `__. - Variable-Sharing Between Policies ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -323,6 +290,39 @@ It is common to have groups of agents in multi-agent RL. RLlib treats agent grou For environments with multiple groups, or mixtures of agent groups and individual agents, you can use grouping in conjunction with the policy mapping API described in prior sections. +Hierarchical Environments +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Hierarchical training can sometimes be implemented as a special case of multi-agent RL. For example, consider a three-level hierarchy of policies, where a top-level policy issues high level actions that are executed at finer timescales by a mid-level and low-level policy. The following timeline shows one step of the top-level policy, which corresponds to two mid-level actions and five low-level actions: + +.. code-block:: text + + top_level ---------------------------------------------------------------> top_level ---> + mid_level_0 -------------------------------> mid_level_0 ----------------> mid_level_1 -> + low_level_0 -> low_level_0 -> low_level_0 -> low_level_1 -> low_level_1 -> low_level_2 -> + +This can be implemented as a multi-agent environment with three types of agents. Each higher-level action creates a new lower-level agent instance with a new id (e.g., ``low_level_0``, ``low_level_1``, ``low_level_2`` in the above example). These lower-level agents pop in existence at the start of higher-level steps, and terminate when their higher-level action ends. Their experiences are aggregated by policy, so from RLlib's perspective it's just optimizing three different types of policies. The configuration might look something like this: + +.. code-block:: python + + "multiagent": { + "policies": { + "top_level": (custom_policy or None, ...), + "mid_level": (custom_policy or None, ...), + "low_level": (custom_policy or None, ...), + }, + "policy_mapping_fn": + lambda agent_id: + "low_level" if agent_id.startswith("low_level_") else + "mid_level" if agent_id.startswith("mid_level_") else "top_level" + "policies_to_train": ["top_level"], + }, + + +In this setup, the appropriate rewards for training lower-level agents must be provided by the multi-agent env implementation. The environment class is also responsible for routing between the agents, e.g., conveying `goals `__ from higher-level agents to lower-level agents as part of the lower-level agent observation. + +See this file for a runnable example: `hierarchical_training.py `__. + Interfacing with External Agents -------------------------------- diff --git a/doc/source/rllib-training.rst b/doc/source/rllib-training.rst index 9052220a8..0d0a359fc 100644 --- a/doc/source/rllib-training.rst +++ b/doc/source/rllib-training.rst @@ -14,7 +14,7 @@ You can train a simple DQN trainer with the following command: .. code-block:: bash - rllib train --run DQN --env CartPole-v0 + rllib train --run DQN --env CartPole-v0 # add --eager for eager execution By default, the results will be logged to a subdirectory of ``~/ray_results``. This subdirectory will contain a file ``params.json`` which contains the @@ -122,6 +122,7 @@ Here is an example of the basic usage (for a more complete example, see `custom_ config = ppo.DEFAULT_CONFIG.copy() config["num_gpus"] = 0 config["num_workers"] = 1 + config["eager"] = False trainer = ppo.PPOTrainer(config=config, env="CartPole-v0") # Can optionally call trainer.restore(path) to load a checkpoint. @@ -156,6 +157,7 @@ All RLlib trainers are compatible with the `Tune API `__. This "num_gpus": 0, "num_workers": 1, "lr": tune.grid_search([0.01, 0.001, 0.0001]), + "eager": False, }, ) @@ -370,7 +372,9 @@ The ``"monitor": true`` config can be used to save Gym episode videos to the res TensorFlow Eager ~~~~~~~~~~~~~~~~ -While RLlib uses TF graph mode for all computations, you can still leverage TF eager to inspect the intermediate state of computations using `tf.py_function `__. Here's an example of using eager mode in `a custom RLlib model and loss `__. +Policies built with ``build_tf_policy`` can be also run in eager mode by setting the ``"eager": True`` config option or using ``rllib train --eager``. This will tell RLlib to execute the model forward pass, action distribution, loss, and stats functions in eager mode. + +Eager mode makes debugging much easier, since you can now use normal Python functions such as ``print()`` to inspect intermediate tensor values. However, it is slower than graph mode. Episode Traces ~~~~~~~~~~~~~~ diff --git a/doc/source/rllib.rst b/doc/source/rllib.rst index 373cf50f7..5c30caeb1 100644 --- a/doc/source/rllib.rst +++ b/doc/source/rllib.rst @@ -1,7 +1,7 @@ RLlib: Scalable Reinforcement Learning ====================================== -RLlib is an open-source library for reinforcement learning that offers both high scalability and a unified API for a variety of applications. +RLlib is an open-source library for reinforcement learning that offers both high scalability and a unified API for a variety of applications. RLlib natively supports TensorFlow, TensorFlow Eager, and PyTorch, but most of its internals are framework agnostic. .. image:: rllib-stack.svg @@ -25,13 +25,13 @@ Then, you can try out training in the following equivalent ways: .. code-block:: bash - rllib train --run=PPO --env=CartPole-v0 + rllib train --run=PPO --env=CartPole-v0 # add --eager for eager execution .. code-block:: python from ray import tune from ray.rllib.agents.ppo import PPOTrainer - tune.run(PPOTrainer, config={"env": "CartPole-v0"}) + tune.run(PPOTrainer, config={"env": "CartPole-v0"}) # "eager": True for eager execution Next, we'll cover three key concepts in RLlib: Policies, Samples, and Trainers. @@ -46,10 +46,11 @@ Policies can be implemented using `any framework MyTFPolicy = build_tf_policy( @@ -85,25 +86,13 @@ Policies each define a ``learn_on_batch()`` method that improves the policy give - Simple `Q-function loss `__ - Importance-weighted `APPO surrogate loss `__ -RLlib `Trainer classes `__ coordinate the distributed workflow of running rollouts and optimizing policies. They do this by leveraging `policy optimizers `__ that implement the desired computation pattern (i.e., synchronous or asynchronous sampling, distributed replay, etc): +RLlib `Trainer classes `__ coordinate the distributed workflow of running rollouts and optimizing policies. They do this by leveraging `policy optimizers `__ that implement the desired computation pattern. The following figure shows *synchronous sampling*, the simplest of `these patterns `__: .. figure:: a2c-arch.svg Synchronous Sampling (e.g., A2C, PG, PPO) -.. figure:: dqn-arch.svg - - Synchronous Replay (e.g., DQN, DDPG, TD3) - -.. figure:: impala-arch.svg - - Asynchronous Sampling (e.g., IMPALA, APPO) - -.. figure:: apex-arch.svg - - Asynchronous Replay (e.g., Ape-X) - -RLlib uses `Ray actors `__ to scale these architectures from a single core to many thousands of cores in a cluster. You can `configure the parallelism `__ used for training by changing the ``num_workers`` parameter. +RLlib uses `Ray actors `__ to scale training from a single core to many thousands of cores in a cluster. You can `configure the parallelism `__ used for training by changing the ``num_workers`` parameter. Customization ~~~~~~~~~~~~~ diff --git a/python/ray/tune/util.py b/python/ray/tune/util.py index 19e5ce9c6..d33a74d58 100644 --- a/python/ray/tune/util.py +++ b/python/ray/tune/util.py @@ -138,7 +138,7 @@ class warn_if_slow(object): def __exit__(self, type, value, traceback): now = time.time() - if now - self.start > 0.1 and now - START_OF_TIME > 60.0: + if now - self.start > 0.5 and now - START_OF_TIME > 60.0: logger.warning("The `{}` operation took {} seconds to complete, ". format(self.name, now - self.start) + "which may be a performance bottleneck.") diff --git a/rllib/agents/a3c/a3c_tf_policy.py b/rllib/agents/a3c/a3c_tf_policy.py index 64974b0c7..1fb1ed2ba 100644 --- a/rllib/agents/a3c/a3c_tf_policy.py +++ b/rllib/agents/a3c/a3c_tf_policy.py @@ -11,6 +11,7 @@ from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing from ray.rllib.policy.tf_policy_template import build_tf_policy from ray.rllib.policy.tf_policy import LearningRateSchedule +from ray.rllib.utils.tf_ops import make_tf_callable from ray.rllib.utils import try_import_tf tf = try_import_tf() @@ -37,12 +38,15 @@ class A3CLoss(object): self.entropy * entropy_coeff) -def actor_critic_loss(policy, batch_tensors): - policy.loss = A3CLoss( - policy.action_dist, batch_tensors[SampleBatch.ACTIONS], - batch_tensors[Postprocessing.ADVANTAGES], - batch_tensors[Postprocessing.VALUE_TARGETS], policy.vf, - policy.config["vf_loss_coeff"], policy.config["entropy_coeff"]) +def actor_critic_loss(policy, model, dist_class, train_batch): + model_out, _ = model.from_batch(train_batch) + action_dist = dist_class(model_out, model) + policy.loss = A3CLoss(action_dist, train_batch[SampleBatch.ACTIONS], + train_batch[Postprocessing.ADVANTAGES], + train_batch[Postprocessing.VALUE_TARGETS], + model.value_function(), + policy.config["vf_loss_coeff"], + policy.config["entropy_coeff"]) return policy.loss.total_loss @@ -55,7 +59,7 @@ def postprocess_advantages(policy, last_r = 0.0 else: next_state = [] - for i in range(len(policy.state_in)): + for i in range(policy.num_state_tensors()): next_state.append([sample_batch["state_out_{}".format(i)][-1]]) last_r = policy._value(sample_batch[SampleBatch.NEXT_OBS][-1], sample_batch[SampleBatch.ACTIONS][-1], @@ -66,58 +70,57 @@ def postprocess_advantages(policy, def add_value_function_fetch(policy): - return {SampleBatch.VF_PREDS: policy.vf} + return {SampleBatch.VF_PREDS: policy.model.value_function()} class ValueNetworkMixin(object): def __init__(self): - self.vf = self.model.value_function() + @make_tf_callable(self.get_session()) + def value(ob, prev_action, prev_reward, *state): + model_out, _ = self.model({ + SampleBatch.CUR_OBS: tf.convert_to_tensor([ob]), + SampleBatch.PREV_ACTIONS: tf.convert_to_tensor([prev_action]), + SampleBatch.PREV_REWARDS: tf.convert_to_tensor([prev_reward]), + "is_training": tf.convert_to_tensor(False), + }, [tf.convert_to_tensor([s]) for s in state], + tf.convert_to_tensor([1])) + return self.model.value_function()[0] - def _value(self, ob, prev_action, prev_reward, *args): - feed_dict = { - self.get_placeholder(SampleBatch.CUR_OBS): [ob], - self.get_placeholder(SampleBatch.PREV_ACTIONS): [prev_action], - self.get_placeholder(SampleBatch.PREV_REWARDS): [prev_reward], - self.seq_lens: [1] - } - assert len(args) == len(self.state_in), \ - (args, self.state_in) - for k, v in zip(self.state_in, args): - feed_dict[k] = v - vf = self.get_session().run(self.vf, feed_dict) - return vf[0] + self._value = value -def stats(policy, batch_tensors): +def stats(policy, train_batch): return { "cur_lr": tf.cast(policy.cur_lr, tf.float64), "policy_loss": policy.loss.pi_loss, "policy_entropy": policy.loss.entropy, - "var_gnorm": tf.global_norm([x for x in policy.var_list]), + "var_gnorm": tf.global_norm( + [x for x in policy.model.trainable_variables()]), "vf_loss": policy.loss.vf_loss, } -def grad_stats(policy, batch_tensors, grads): +def grad_stats(policy, train_batch, grads): return { "grad_gnorm": tf.global_norm(grads), "vf_explained_var": explained_variance( - policy.get_placeholder(Postprocessing.VALUE_TARGETS), policy.vf), + train_batch[Postprocessing.VALUE_TARGETS], + policy.model.value_function()), } def clip_gradients(policy, optimizer, loss): - grads = tf.gradients(loss, policy.var_list) + grads_and_vars = optimizer.compute_gradients( + loss, policy.model.trainable_variables()) + grads = [g for (g, v) in grads_and_vars] grads, _ = tf.clip_by_global_norm(grads, policy.config["grad_clip"]) - clipped_grads = list(zip(grads, policy.var_list)) + clipped_grads = list(zip(grads, policy.model.trainable_variables())) return clipped_grads def setup_mixins(policy, obs_space, action_space, config): ValueNetworkMixin.__init__(policy) LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"]) - policy.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, - tf.get_variable_scope().name) A3CTFPolicy = build_tf_policy( diff --git a/rllib/agents/a3c/a3c_torch_policy.py b/rllib/agents/a3c/a3c_torch_policy.py index 014c6a44e..b7394516c 100644 --- a/rllib/agents/a3c/a3c_torch_policy.py +++ b/rllib/agents/a3c/a3c_torch_policy.py @@ -13,18 +13,16 @@ from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.policy.torch_policy_template import build_torch_policy -def actor_critic_loss(policy, batch_tensors): - logits, _ = policy.model({ - SampleBatch.CUR_OBS: batch_tensors[SampleBatch.CUR_OBS] - }) # TODO(ekl) seq lens shouldn't be None - values = policy.model.value_function() - dist = policy.dist_class(logits, policy.model) - log_probs = dist.logp(batch_tensors[SampleBatch.ACTIONS]) +def actor_critic_loss(policy, model, dist_class, train_batch): + logits, _ = model.from_batch(train_batch) + values = model.value_function() + dist = dist_class(logits, model) + log_probs = dist.logp(train_batch[SampleBatch.ACTIONS]) policy.entropy = dist.entropy().mean() - policy.pi_err = -batch_tensors[Postprocessing.ADVANTAGES].dot( + policy.pi_err = -train_batch[Postprocessing.ADVANTAGES].dot( log_probs.reshape(-1)) policy.value_err = F.mse_loss( - values.reshape(-1), batch_tensors[Postprocessing.VALUE_TARGETS]) + values.reshape(-1), train_batch[Postprocessing.VALUE_TARGETS]) overall_err = sum([ policy.pi_err, policy.config["vf_loss_coeff"] * policy.value_err, @@ -33,7 +31,7 @@ def actor_critic_loss(policy, batch_tensors): return overall_err -def loss_and_entropy_stats(policy, batch_tensors): +def loss_and_entropy_stats(policy, train_batch): return { "policy_entropy": policy.entropy.item(), "policy_loss": policy.pi_err.item(), diff --git a/rllib/agents/ddpg/ddpg_model.py b/rllib/agents/ddpg/ddpg_model.py index 4c1ecba0e..8fb7c4a9a 100644 --- a/rllib/agents/ddpg/ddpg_model.py +++ b/rllib/agents/ddpg/ddpg_model.py @@ -62,6 +62,7 @@ class DDPGModel(TFModelV2): shape=(self.action_dim, ), name="actions") def build_action_net(action_out): + assert action_out.dtype == tf.float32 activation = getattr(tf.nn, actor_hidden_activation) i = 0 for hidden in actor_hiddens: @@ -86,11 +87,29 @@ class DDPGModel(TFModelV2): name="action_out") action_scope = name + "/action_net" + # Save the scope object, since in eager we will execute this + # path repeatedly and there is no guarantee it will always be run + # in the same original scope. + with tf.variable_scope(action_scope) as action_scope_handle: + pass # TODO(ekl) use keras layers instead of variable scopes - def build_action_net_scope(model_out): - with tf.variable_scope(action_scope, reuse=tf.AUTO_REUSE): - return build_action_net(model_out) + if tf.executing_eagerly(): + # Have to use a variable store to reuse variables in eager mode + import tensorflow.contrib as tfc + store = tfc.eager.EagerVariableStore() + + def build_action_net_scope(model_out): + with store.as_default(): + with tf.variable_scope( + action_scope_handle, reuse=tf.AUTO_REUSE): + return build_action_net(model_out) + else: + + def build_action_net_scope(model_out): + with tf.variable_scope( + action_scope_handle, reuse=tf.AUTO_REUSE): + return build_action_net(model_out) pi_out = tf.keras.layers.Lambda(build_action_net_scope)(self.model_out) self.action_net = tf.keras.Model(self.model_out, pi_out) @@ -98,7 +117,8 @@ class DDPGModel(TFModelV2): # Noise vars for P network except for layer normalization vars if parameter_noise: - with tf.variable_scope(action_scope, reuse=tf.AUTO_REUSE): + assert not tf.executing_eagerly(), "eager p noise not implemented" + with tf.variable_scope(action_scope_handle, reuse=tf.AUTO_REUSE): self._build_parameter_noise([ var for var in self.action_net.variables if "LayerNorm" not in var.name @@ -126,12 +146,6 @@ class DDPGModel(TFModelV2): else: self.twin_q_net = None - def forward(self, input_dict, state, seq_lens): - """This generates the model_out tensor input. - - You must implement this as documented in modelv2.py.""" - raise NotImplementedError - def get_policy_output(self, model_out): """Return the (unscaled) output of the policy network. diff --git a/rllib/agents/ddpg/ddpg_policy.py b/rllib/agents/ddpg/ddpg_policy.py index a9fd52be9..a7617adb7 100644 --- a/rllib/agents/ddpg/ddpg_policy.py +++ b/rllib/agents/ddpg/ddpg_policy.py @@ -19,7 +19,8 @@ from ray.rllib.utils.error import UnsupportedSpaceException from ray.rllib.policy.policy import Policy from ray.rllib.policy.tf_policy import TFPolicy from ray.rllib.utils import try_import_tf -from ray.rllib.utils.tf_ops import huber_loss, minimize_and_clip +from ray.rllib.utils.tf_ops import huber_loss, minimize_and_clip, \ + make_tf_callable tf = try_import_tf() logger = logging.getLogger(__name__) @@ -93,14 +94,6 @@ def postprocess_trajectory(policy, return _postprocess_dqn(policy, sample_batch) -def exploration_setting_inputs(policy): - return { - policy.stochastic: True, - policy.noise_scale: policy.cur_noise_scale, - policy.pure_exploration_phase: policy.cur_pure_exploration_phase, - } - - def build_action_output(policy, model, input_dict, obs_space, action_space, config): model_out, _ = model({ @@ -193,24 +186,24 @@ def build_action_output(policy, model, input_dict, obs_space, action_space, return actions, None -def actor_critic_loss(policy, batch_tensors): - model_out_t, _ = policy.model({ - "obs": batch_tensors[SampleBatch.CUR_OBS], +def actor_critic_loss(policy, model, _, train_batch): + model_out_t, _ = model({ + "obs": train_batch[SampleBatch.CUR_OBS], "is_training": policy._get_is_training_placeholder(), }, [], None) - model_out_tp1, _ = policy.model({ - "obs": batch_tensors[SampleBatch.NEXT_OBS], + model_out_tp1, _ = model({ + "obs": train_batch[SampleBatch.NEXT_OBS], "is_training": policy._get_is_training_placeholder(), }, [], None) target_model_out_tp1, _ = policy.target_model({ - "obs": batch_tensors[SampleBatch.NEXT_OBS], + "obs": train_batch[SampleBatch.NEXT_OBS], "is_training": policy._get_is_training_placeholder(), }, [], None) - policy_t = policy.model.get_policy_output(model_out_t) - policy_tp1 = policy.model.get_policy_output(model_out_tp1) + policy_t = model.get_policy_output(model_out_t) + policy_tp1 = model.get_policy_output(model_out_tp1) if policy.config["smooth_target_policy"]: target_noise_clip = policy.config["target_noise_clip"] @@ -226,14 +219,13 @@ def actor_critic_loss(policy, batch_tensors): policy_tp1_smoothed = policy_tp1 # q network evaluation - q_t = policy.model.get_q_values(model_out_t, - batch_tensors[SampleBatch.ACTIONS]) + q_t = model.get_q_values(model_out_t, train_batch[SampleBatch.ACTIONS]) if policy.config["twin_q"]: - twin_q_t = policy.model.get_twin_q_values( - model_out_t, batch_tensors[SampleBatch.ACTIONS]) + twin_q_t = model.get_twin_q_values(model_out_t, + train_batch[SampleBatch.ACTIONS]) # Q-values for current policy (no noise) in given current state - q_t_det_policy = policy.model.get_q_values(model_out_t, policy_t) + q_t_det_policy = model.get_q_values(model_out_t, policy_t) # target q network evaluation q_tp1 = policy.target_model.get_q_values(target_model_out_tp1, @@ -248,12 +240,12 @@ def actor_critic_loss(policy, batch_tensors): q_tp1 = tf.minimum(q_tp1, twin_q_tp1) q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1) - q_tp1_best_masked = (1.0 - tf.cast(batch_tensors[SampleBatch.DONES], - tf.float32)) * q_tp1_best + q_tp1_best_masked = ( + 1.0 - tf.cast(train_batch[SampleBatch.DONES], tf.float32)) * q_tp1_best # compute RHS of bellman equation q_t_selected_target = tf.stop_gradient( - batch_tensors[SampleBatch.REWARDS] + + train_batch[SampleBatch.REWARDS] + policy.config["gamma"]**policy.config["n_step"] * q_tp1_best_masked) # compute the error (potentially clipped) @@ -273,15 +265,17 @@ def actor_critic_loss(policy, batch_tensors): else: errors = 0.5 * tf.square(td_error) - critic_loss = policy.model.custom_loss( - tf.reduce_mean(batch_tensors[PRIO_WEIGHTS] * errors), batch_tensors) + critic_loss = model.custom_loss( + tf.reduce_mean( + tf.cast(train_batch[PRIO_WEIGHTS], tf.float32) * errors), + train_batch) actor_loss = -tf.reduce_mean(q_t_det_policy) if policy.config["l2_reg"] is not None: - for var in policy.model.policy_variables(): + for var in model.policy_variables(): if "bias" not in var.name: actor_loss += policy.config["l2_reg"] * tf.nn.l2_loss(var) - for var in policy.model.q_variables(): + for var in model.q_variables(): if "bias" not in var.name: critic_loss += policy.config["l2_reg"] * tf.nn.l2_loss(var) @@ -299,19 +293,19 @@ def actor_critic_loss(policy, batch_tensors): def gradients(policy, optimizer, loss): if policy.config["grad_norm_clipping"] is not None: actor_grads_and_vars = minimize_and_clip( - policy._actor_optimizer, + optimizer, policy.actor_loss, var_list=policy.model.policy_variables(), clip_val=policy.config["grad_norm_clipping"]) critic_grads_and_vars = minimize_and_clip( - policy._critic_optimizer, + optimizer, policy.critic_loss, var_list=policy.model.q_variables(), clip_val=policy.config["grad_norm_clipping"]) else: - actor_grads_and_vars = policy._actor_optimizer.compute_gradients( + actor_grads_and_vars = optimizer.compute_gradients( policy.actor_loss, var_list=policy.model.policy_variables()) - critic_grads_and_vars = policy._critic_optimizer.compute_gradients( + critic_grads_and_vars = optimizer.compute_gradients( policy.critic_loss, var_list=policy.model.q_variables()) # save these for later use in build_apply_op policy._actor_grads_and_vars = [(g, v) for (g, v) in actor_grads_and_vars @@ -345,7 +339,7 @@ def apply_gradients(policy, optimizer, grads_and_vars): return tf.group(actor_op, critic_op) -def stats(policy, batch_tensors): +def stats(policy, train_batch): return { "td_error": tf.reduce_mean(policy.td_error), "actor_loss": tf.reduce_mean(policy.actor_loss), @@ -360,16 +354,32 @@ class ExplorationStateMixin(object): def __init__(self, obs_space, action_space, config): self.cur_noise_scale = 1.0 self.cur_pure_exploration_phase = False - self.stochastic = tf.placeholder(tf.bool, (), name="stochastic") - self.noise_scale = tf.placeholder(tf.float32, (), name="noise_scale") - self.pure_exploration_phase = tf.placeholder( - tf.bool, (), name="pure_exploration_phase") + self.stochastic = tf.get_variable( + initializer=tf.constant_initializer(True), + name="stochastic", + shape=(), + trainable=False, + dtype=tf.bool) + self.noise_scale = tf.get_variable( + initializer=tf.constant_initializer(self.cur_noise_scale), + name="noise_scale", + shape=(), + trainable=False, + dtype=tf.float32) + self.pure_exploration_phase = tf.get_variable( + initializer=tf.constant_initializer( + self.cur_pure_exploration_phase), + name="pure_exploration_phase", + shape=(), + trainable=False, + dtype=tf.bool) def add_parameter_noise(self): if self.config["parameter_noise"]: self.get_session().run(self.model.add_noise_op) def adjust_param_noise_sigma(self, sample_batch): + assert not tf.executing_eagerly(), "eager not supported with p noise" # adjust the sigma of parameter space noise states, noisy_actions = [ list(x) for x in sample_batch.columns( @@ -396,9 +406,12 @@ class ExplorationStateMixin(object): # is a carry-over from DQN, which uses epsilon-greedy exploration # rather than adding action noise to the output of a policy network. self.cur_noise_scale = epsilon + self.noise_scale.load(self.cur_noise_scale, self.get_session()) def set_pure_exploration_phase(self, pure_exploration_phase): self.cur_pure_exploration_phase = pure_exploration_phase + self.pure_exploration_phase.load(self.cur_pure_exploration_phase, + self.get_session()) @override(Policy) def get_state(self): @@ -416,30 +429,27 @@ class ExplorationStateMixin(object): class TargetNetworkMixin(object): def __init__(self, config): - # update_target_fn will be called periodically to copy Q network to - # target Q network - self.tau_value = config.get("tau") - self.tau = tf.placeholder(tf.float32, (), name="tau") - update_target_expr = [] - model_vars = self.model.trainable_variables() - target_model_vars = self.target_model.trainable_variables() - assert len(model_vars) == len(target_model_vars), \ - (model_vars, target_model_vars) - for var, var_target in zip(model_vars, target_model_vars): - update_target_expr.append( - var_target.assign(self.tau * var + - (1.0 - self.tau) * var_target)) - logger.debug("Update target op {}".format(var_target)) - self.update_target_expr = tf.group(*update_target_expr) + @make_tf_callable(self.get_session()) + def update_target_fn(tau): + tau = tf.convert_to_tensor(tau, dtype=tf.float32) + update_target_expr = [] + model_vars = self.model.trainable_variables() + target_model_vars = self.target_model.trainable_variables() + assert len(model_vars) == len(target_model_vars), \ + (model_vars, target_model_vars) + for var, var_target in zip(model_vars, target_model_vars): + update_target_expr.append( + var_target.assign(tau * var + (1.0 - tau) * var_target)) + logger.debug("Update target op {}".format(var_target)) + return tf.group(*update_target_expr) # Hard initial update + self._do_update = update_target_fn self.update_target(tau=1.0) # support both hard and soft sync def update_target(self, tau=None): - tau = tau or self.tau_value - return self.get_session().run( - self.update_target_expr, feed_dict={self.tau: tau}) + self._do_update(np.float32(tau or self.config.get("tau"))) class ActorCriticOptimizerMixin(object): @@ -455,26 +465,27 @@ class ActorCriticOptimizerMixin(object): class ComputeTDErrorMixin(object): - def compute_td_error(self, obs_t, act_t, rew_t, obs_tp1, done_mask, - importance_weights): - if not self.loss_initialized(): - return np.zeros_like(rew_t) + def __init__(self): + @make_tf_callable(self.get_session(), dynamic_shape=True) + def compute_td_error(obs_t, act_t, rew_t, obs_tp1, done_mask, + importance_weights): + if not self.loss_initialized(): + return tf.zeros_like(rew_t) - td_err = self.get_session().run( - self.td_error, - feed_dict={ - self.get_placeholder(SampleBatch.CUR_OBS): [ - np.array(ob) for ob in obs_t - ], - self.get_placeholder(SampleBatch.ACTIONS): act_t, - self.get_placeholder(SampleBatch.REWARDS): rew_t, - self.get_placeholder(SampleBatch.NEXT_OBS): [ - np.array(ob) for ob in obs_tp1 - ], - self.get_placeholder(SampleBatch.DONES): done_mask, - self.get_placeholder(PRIO_WEIGHTS): importance_weights - }) - return td_err + # Do forward pass on loss to update td error attribute + actor_critic_loss( + self, self.model, None, { + SampleBatch.CUR_OBS: tf.convert_to_tensor(obs_t), + SampleBatch.ACTIONS: tf.convert_to_tensor(act_t), + SampleBatch.REWARDS: tf.convert_to_tensor(rew_t), + SampleBatch.NEXT_OBS: tf.convert_to_tensor(obs_tp1), + SampleBatch.DONES: tf.convert_to_tensor(done_mask), + PRIO_WEIGHTS: tf.convert_to_tensor(importance_weights), + }) + + return self.td_error + + self.compute_td_error = compute_td_error def setup_early_mixins(policy, obs_space, action_space, config): @@ -482,6 +493,10 @@ def setup_early_mixins(policy, obs_space, action_space, config): ActorCriticOptimizerMixin.__init__(policy, config) +def setup_mid_mixins(policy, obs_space, action_space, config): + ComputeTDErrorMixin.__init__(policy) + + def setup_late_mixins(policy, obs_space, action_space, config): TargetNetworkMixin.__init__(policy, config) @@ -491,7 +506,6 @@ DDPGTFPolicy = build_tf_policy( get_default_config=lambda: ray.rllib.agents.ddpg.ddpg.DEFAULT_CONFIG, make_model=build_ddpg_model, postprocess_fn=postprocess_trajectory, - extra_action_feed_fn=exploration_setting_inputs, action_sampler_fn=build_action_output, loss_fn=actor_critic_loss, stats_fn=stats, @@ -503,5 +517,6 @@ DDPGTFPolicy = build_tf_policy( ComputeTDErrorMixin ], before_init=setup_early_mixins, + before_loss_init=setup_mid_mixins, after_init=setup_late_mixins, obs_include_prev_action_reward=False) diff --git a/rllib/agents/ddpg/noop_model.py b/rllib/agents/ddpg/noop_model.py index d81dcf2b3..c953089c6 100644 --- a/rllib/agents/ddpg/noop_model.py +++ b/rllib/agents/ddpg/noop_model.py @@ -2,19 +2,18 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from ray.rllib.models import Model +from ray.rllib.models.tf.tf_modelv2 import TFModelV2 from ray.rllib.utils.annotations import override from ray.rllib.utils import try_import_tf tf = try_import_tf() -class NoopModel(Model): +class NoopModel(TFModelV2): """Trivial model that just returns the obs flattened. This is the model used if use_state_preprocessor=False.""" - @override(Model) - def _build_layers_v2(self, input_dict, num_outputs, options): - out = tf.reshape(input_dict["obs"], [-1, num_outputs]) - return out, out + @override(TFModelV2) + def forward(self, input_dict, state, seq_lens): + return tf.cast(input_dict["obs_flat"], tf.float32), state diff --git a/rllib/agents/dqn/distributional_q_model.py b/rllib/agents/dqn/distributional_q_model.py index 70724dd30..d76014e88 100644 --- a/rllib/agents/dqn/distributional_q_model.py +++ b/rllib/agents/dqn/distributional_q_model.py @@ -151,15 +151,41 @@ class DistributionalQModel(TFModelV2): state_out, units=num_atoms, activation=None) return state_score - def build_action_value_in_scope(model_out): - with tf.variable_scope( - name + "/action_value", reuse=tf.AUTO_REUSE): - return build_action_value(model_out) + if tf.executing_eagerly(): + # Have to use a variable store to reuse variables in eager mode + import tensorflow.contrib as tfc + store = tfc.eager.EagerVariableStore() - def build_state_score_in_scope(model_out): - with tf.variable_scope(name + "/state_value", reuse=tf.AUTO_REUSE): - return build_state_score(model_out) + # Save the scope objects, since in eager we will execute this + # path repeatedly and there is no guarantee it will always be run + # in the same original scope. + with tf.variable_scope(name + "/action_value") as action_scope: + pass + with tf.variable_scope(name + "/state_value") as state_scope: + pass + def build_action_value_in_scope(model_out): + with store.as_default(): + with tf.variable_scope(action_scope, reuse=tf.AUTO_REUSE): + return build_action_value(model_out) + + def build_state_score_in_scope(model_out): + with store.as_default(): + with tf.variable_scope(state_scope, reuse=tf.AUTO_REUSE): + return build_state_score(model_out) + else: + + def build_action_value_in_scope(model_out): + with tf.variable_scope( + name + "/action_value", reuse=tf.AUTO_REUSE): + return build_action_value(model_out) + + def build_state_score_in_scope(model_out): + with tf.variable_scope( + name + "/state_value", reuse=tf.AUTO_REUSE): + return build_state_score(model_out) + + # TODO(ekl) we shouldn't need to use lambda layers here q_out = tf.keras.layers.Lambda(build_action_value_in_scope)( self.model_out) self.q_value_head = tf.keras.Model(self.model_out, q_out) @@ -171,12 +197,6 @@ class DistributionalQModel(TFModelV2): self.state_value_head = tf.keras.Model(self.model_out, state_out) self.register_variables(self.state_value_head.variables) - def forward(self, input_dict, state, seq_lens): - """This generates the model_out tensor input. - - You must implement this as documented in modelv2.py.""" - raise NotImplementedError - def get_q_value_distributions(self, model_out): """Returns distributional values for Q(s, a) given a state embedding. diff --git a/rllib/agents/dqn/dqn_policy.py b/rllib/agents/dqn/dqn_policy.py index 168e45348..7600419d6 100644 --- a/rllib/agents/dqn/dqn_policy.py +++ b/rllib/agents/dqn/dqn_policy.py @@ -19,6 +19,7 @@ from ray.rllib.policy.tf_policy_template import build_tf_policy from ray.rllib.utils.tf_ops import huber_loss, reduce_mean_ignore_inf, \ minimize_and_clip from ray.rllib.utils import try_import_tf +from ray.rllib.utils.tf_ops import make_tf_callable tf = try_import_tf() @@ -96,7 +97,8 @@ class QLoss(object): self.td_error = ( q_t_selected - tf.stop_gradient(q_t_selected_target)) self.loss = tf.reduce_mean( - importance_weights * huber_loss(self.td_error)) + tf.cast(importance_weights, tf.float32) * huber_loss( + self.td_error)) self.stats = { "mean_q": tf.reduce_mean(q_t_selected), "min_q": tf.reduce_min(q_t_selected), @@ -106,7 +108,7 @@ class QLoss(object): class QValuePolicy(object): - def __init__(self, q_values, observations, num_actions, stochastic, eps, + def __init__(self, q_values, observations, num_actions, cur_epsilon, softmax, softmax_temp, model_config): if softmax: action_dist = Categorical(q_values / softmax_temp) @@ -126,35 +128,35 @@ class QValuePolicy(object): tf.multinomial(random_valid_action_logits, 1), axis=1) chose_random = tf.random_uniform( - tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps - stochastic_actions = tf.where(chose_random, random_actions, - deterministic_actions) - self.action = tf.cond(stochastic, lambda: stochastic_actions, - lambda: deterministic_actions) + tf.stack([batch_size]), minval=0, maxval=1, + dtype=tf.float32) < cur_epsilon + self.action = tf.where(chose_random, random_actions, + deterministic_actions) self.action_prob = None class ComputeTDErrorMixin(object): - def compute_td_error(self, obs_t, act_t, rew_t, obs_tp1, done_mask, - importance_weights): - if not self.loss_initialized(): - return np.zeros_like(rew_t) + def __init__(self): + @make_tf_callable(self.get_session(), dynamic_shape=True) + def compute_td_error(obs_t, act_t, rew_t, obs_tp1, done_mask, + importance_weights): + if not self.loss_initialized(): + return tf.zeros_like(rew_t) - td_err = self.get_session().run( - self.q_loss.td_error, - feed_dict={ - self.get_placeholder(SampleBatch.CUR_OBS): [ - np.array(ob) for ob in obs_t - ], - self.get_placeholder(SampleBatch.ACTIONS): act_t, - self.get_placeholder(SampleBatch.REWARDS): rew_t, - self.get_placeholder(SampleBatch.NEXT_OBS): [ - np.array(ob) for ob in obs_tp1 - ], - self.get_placeholder(SampleBatch.DONES): done_mask, - self.get_placeholder(PRIO_WEIGHTS): importance_weights, - }) - return td_err + # Do forward pass on loss to update td error attribute + build_q_losses( + self, self.model, None, { + SampleBatch.CUR_OBS: tf.convert_to_tensor(obs_t), + SampleBatch.ACTIONS: tf.convert_to_tensor(act_t), + SampleBatch.REWARDS: tf.convert_to_tensor(rew_t), + SampleBatch.NEXT_OBS: tf.convert_to_tensor(obs_tp1), + SampleBatch.DONES: tf.convert_to_tensor(done_mask), + PRIO_WEIGHTS: tf.convert_to_tensor(importance_weights), + }) + + return self.q_loss.td_error + + self.compute_td_error = compute_td_error def postprocess_trajectory(policy, @@ -174,8 +176,8 @@ def postprocess_trajectory(policy, entropy(clean_action_distribution.T, noisy_action_distribution.T)) policy.pi_distance = distance_in_action_space if (distance_in_action_space < - -np.log(1 - policy.cur_epsilon + - policy.cur_epsilon / policy.num_actions)): + -np.log(1 - policy.cur_epsilon_value + + policy.cur_epsilon_value / policy.num_actions)): policy.parameter_noise_sigma_val *= 1.01 else: policy.parameter_noise_sigma_val /= 1.01 @@ -254,9 +256,8 @@ def build_q_networks(policy, q_model, input_dict, obs_space, action_space, # Action outputs qvp = QValuePolicy(q_values, input_dict[SampleBatch.CUR_OBS], - action_space.n, policy.stochastic, policy.eps, - config["soft_q"], config["softmax_temp"], - config["model"]) + action_space.n, policy.cur_epsilon, config["soft_q"], + config["softmax_temp"], config["model"]) policy.output_actions, policy.action_prob = qvp.action, qvp.action_prob actions = policy.output_actions @@ -302,22 +303,22 @@ def _build_parameter_noise(policy, pnet_params): policy.pi_distance = None -def build_q_losses(policy, batch_tensors): +def build_q_losses(policy, model, _, train_batch): config = policy.config # q network evaluation q_t, q_logits_t, q_dist_t = _compute_q_values( - policy, policy.q_model, batch_tensors[SampleBatch.CUR_OBS], + policy, policy.q_model, train_batch[SampleBatch.CUR_OBS], policy.observation_space, policy.action_space) # target q network evalution q_tp1, q_logits_tp1, q_dist_tp1 = _compute_q_values( - policy, policy.target_q_model, batch_tensors[SampleBatch.NEXT_OBS], + policy, policy.target_q_model, train_batch[SampleBatch.NEXT_OBS], policy.observation_space, policy.action_space) policy.target_q_func_vars = policy.target_q_model.variables() # q scores for actions which we know were selected in the given state. one_hot_selection = tf.one_hot( - tf.cast(batch_tensors[SampleBatch.ACTIONS], tf.int32), + tf.cast(train_batch[SampleBatch.ACTIONS], tf.int32), policy.action_space.n) q_t_selected = tf.reduce_sum(q_t * one_hot_selection, 1) q_logits_t_selected = tf.reduce_sum( @@ -328,7 +329,7 @@ def build_q_losses(policy, batch_tensors): q_tp1_using_online_net, q_logits_tp1_using_online_net, \ q_dist_tp1_using_online_net = _compute_q_values( policy, policy.q_model, - batch_tensors[SampleBatch.NEXT_OBS], + train_batch[SampleBatch.NEXT_OBS], policy.observation_space, policy.action_space) q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1) q_tp1_best_one_hot_selection = tf.one_hot(q_tp1_best_using_online_net, @@ -345,8 +346,8 @@ def build_q_losses(policy, batch_tensors): policy.q_loss = QLoss( q_t_selected, q_logits_t_selected, q_tp1_best, q_dist_tp1_best, - batch_tensors[PRIO_WEIGHTS], batch_tensors[SampleBatch.REWARDS], - tf.cast(batch_tensors[SampleBatch.DONES], + train_batch[PRIO_WEIGHTS], train_batch[SampleBatch.REWARDS], + tf.cast(train_batch[SampleBatch.DONES], tf.float32), config["gamma"], config["n_step"], config["num_atoms"], config["v_min"], config["v_max"]) @@ -372,14 +373,7 @@ def clip_gradients(policy, optimizer, loss): return grads_and_vars -def exploration_setting_inputs(policy): - return { - policy.stochastic: True, - policy.eps: policy.cur_epsilon, - } - - -def build_q_stats(policy, batch_tensors): +def build_q_stats(policy, batch): return dict({ "cur_lr": tf.cast(policy.cur_lr, tf.float64), }, **policy.q_loss.stats) @@ -390,6 +384,10 @@ def setup_early_mixins(policy, obs_space, action_space, config): ExplorationStateMixin.__init__(policy, obs_space, action_space, config) +def setup_mid_mixins(policy, obs_space, action_space, config): + ComputeTDErrorMixin.__init__(policy) + + def setup_late_mixins(policy, obs_space, action_space, config): TargetNetworkMixin.__init__(policy, obs_space, action_space, config) @@ -494,10 +492,10 @@ DQNTFPolicy = build_tf_policy( postprocess_fn=postprocess_trajectory, optimizer_fn=adam_optimizer, gradients_fn=clip_gradients, - extra_action_feed_fn=exploration_setting_inputs, extra_action_fetches_fn=lambda policy: {"q_values": policy.q_values}, extra_learn_fetches_fn=lambda policy: {"td_error": policy.q_loss.td_error}, before_init=setup_early_mixins, + before_loss_init=setup_mid_mixins, after_init=setup_late_mixins, obs_include_prev_action_reward=False, mixins=[ diff --git a/rllib/agents/dqn/simple_q_model.py b/rllib/agents/dqn/simple_q_model.py index 2cf68c3cf..257097799 100644 --- a/rllib/agents/dqn/simple_q_model.py +++ b/rllib/agents/dqn/simple_q_model.py @@ -2,9 +2,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from ray.rllib.models.modelv2 import ModelV2 from ray.rllib.models.tf.tf_modelv2 import TFModelV2 -from ray.rllib.utils.annotations import override from ray.rllib.utils import try_import_tf tf = try_import_tf() @@ -60,13 +58,6 @@ class SimpleQModel(TFModelV2): self.q_value_head = tf.keras.Model(self.model_out, q_out) self.register_variables(self.q_value_head.variables) - @override(ModelV2) - def forward(self, input_dict, state, seq_lens): - """This generates the model_out tensor input. - - You must implement this as documented in modelv2.py.""" - raise NotImplementedError - def get_q_values(self, model_out): """Returns Q(s, a) given a feature tensor for the state. diff --git a/rllib/agents/dqn/simple_q_policy.py b/rllib/agents/dqn/simple_q_policy.py index 44fd18853..297e3b54f 100644 --- a/rllib/agents/dqn/simple_q_policy.py +++ b/rllib/agents/dqn/simple_q_policy.py @@ -16,7 +16,7 @@ from ray.rllib.utils.error import UnsupportedSpaceException from ray.rllib.policy.tf_policy import TFPolicy from ray.rllib.policy.tf_policy_template import build_tf_policy from ray.rllib.utils import try_import_tf -from ray.rllib.utils.tf_ops import huber_loss +from ray.rllib.utils.tf_ops import huber_loss, make_tf_callable tf = try_import_tf() logger = logging.getLogger(__name__) @@ -27,20 +27,27 @@ Q_TARGET_SCOPE = "target_q_func" class ExplorationStateMixin(object): def __init__(self, obs_space, action_space, config): - self.cur_epsilon = 1.0 - self.stochastic = tf.placeholder(tf.bool, (), name="stochastic") - self.eps = tf.placeholder(tf.float32, (), name="eps") + # Python value, should always be same as the TF variable + self.cur_epsilon_value = 1.0 + self.cur_epsilon = tf.get_variable( + initializer=tf.constant_initializer(self.cur_epsilon_value), + name="eps", + shape=(), + trainable=False, + dtype=tf.float32) def add_parameter_noise(self): if self.config["parameter_noise"]: self.sess.run(self.add_noise_op) def set_epsilon(self, epsilon): - self.cur_epsilon = epsilon + self.cur_epsilon_value = epsilon + self.cur_epsilon.load( + self.cur_epsilon_value, session=self.get_session()) @override(Policy) def get_state(self): - return [TFPolicy.get_state(self), self.cur_epsilon] + return [TFPolicy.get_state(self), self.cur_epsilon_value] @override(Policy) def set_state(self, state): @@ -50,18 +57,20 @@ class ExplorationStateMixin(object): class TargetNetworkMixin(object): def __init__(self, obs_space, action_space, config): - # update_target_fn will be called periodically to copy Q network to - # target Q network - update_target_expr = [] - assert len(self.q_func_vars) == len(self.target_q_func_vars), \ - (self.q_func_vars, self.target_q_func_vars) - for var, var_target in zip(self.q_func_vars, self.target_q_func_vars): - update_target_expr.append(var_target.assign(var)) - logger.debug("Update target op {}".format(var_target)) - self.update_target_expr = tf.group(*update_target_expr) + @make_tf_callable(self.get_session()) + def do_update(): + # update_target_fn will be called periodically to copy Q network to + # target Q network + update_target_expr = [] + assert len(self.q_func_vars) == len(self.target_q_func_vars), \ + (self.q_func_vars, self.target_q_func_vars) + for var, var_target in zip(self.q_func_vars, + self.target_q_func_vars): + update_target_expr.append(var_target.assign(var)) + logger.debug("Update target op {}".format(var_target)) + return tf.group(*update_target_expr) - def update_target(self): - return self.get_session().run(self.update_target_expr) + self.update_target = do_update def build_q_models(policy, obs_space, action_space, config): @@ -123,43 +132,41 @@ def build_action_sampler(policy, q_model, input_dict, obs_space, action_space, chose_random = tf.random_uniform( tf.stack([batch_size]), minval=0, maxval=1, - dtype=tf.float32) < policy.eps + dtype=tf.float32) < policy.cur_epsilon stochastic_actions = tf.where(chose_random, random_actions, deterministic_actions) - action = tf.cond(policy.stochastic, lambda: stochastic_actions, - lambda: deterministic_actions) action_logp = None - return action, action_logp + return stochastic_actions, action_logp -def build_q_losses(policy, batch_tensors): +def build_q_losses(policy, model, dist_class, train_batch): # q network evaluation q_t = _compute_q_values(policy, policy.q_model, - batch_tensors[SampleBatch.CUR_OBS], + train_batch[SampleBatch.CUR_OBS], policy.observation_space, policy.action_space) # target q network evalution q_tp1 = _compute_q_values(policy, policy.target_q_model, - batch_tensors[SampleBatch.NEXT_OBS], + train_batch[SampleBatch.NEXT_OBS], policy.observation_space, policy.action_space) policy.target_q_func_vars = policy.target_q_model.variables() # q scores for actions which we know were selected in the given state. one_hot_selection = tf.one_hot( - tf.cast(batch_tensors[SampleBatch.ACTIONS], tf.int32), + tf.cast(train_batch[SampleBatch.ACTIONS], tf.int32), policy.action_space.n) q_t_selected = tf.reduce_sum(q_t * one_hot_selection, 1) # compute estimate of best possible value starting from state at t + 1 - dones = tf.cast(batch_tensors[SampleBatch.DONES], tf.float32) + dones = tf.cast(train_batch[SampleBatch.DONES], tf.float32) q_tp1_best_one_hot_selection = tf.one_hot( tf.argmax(q_tp1, 1), policy.action_space.n) q_tp1_best = tf.reduce_sum(q_tp1 * q_tp1_best_one_hot_selection, 1) q_tp1_best_masked = (1.0 - dones) * q_tp1_best # compute RHS of bellman equation - q_t_selected_target = (batch_tensors[SampleBatch.REWARDS] + + q_t_selected_target = (train_batch[SampleBatch.REWARDS] + policy.config["gamma"] * q_tp1_best_masked) # compute the error (potentially clipped) @@ -181,13 +188,6 @@ def _compute_q_values(policy, model, obs, obs_space, action_space): return model.get_q_values(model_out) -def exploration_setting_inputs(policy): - return { - policy.stochastic: True, - policy.eps: policy.cur_epsilon, - } - - def setup_early_mixins(policy, obs_space, action_space, config): ExplorationStateMixin.__init__(policy, obs_space, action_space, config) @@ -202,7 +202,6 @@ SimpleQPolicy = build_tf_policy( make_model=build_q_models, action_sampler_fn=build_action_sampler, loss_fn=build_q_losses, - extra_action_feed_fn=exploration_setting_inputs, extra_action_fetches_fn=lambda policy: {"q_values": policy.q_values}, extra_learn_fetches_fn=lambda policy: {"td_error": policy.td_error}, before_init=setup_early_mixins, diff --git a/rllib/agents/impala/vtrace_policy.py b/rllib/agents/impala/vtrace_policy.py index d13bbea00..7e51a9959 100644 --- a/rllib/agents/impala/vtrace_policy.py +++ b/rllib/agents/impala/vtrace_policy.py @@ -115,11 +115,12 @@ class VTraceLoss(object): self.entropy * entropy_coeff) -def _make_time_major(policy, tensor, drop_last=False): +def _make_time_major(policy, seq_lens, tensor, drop_last=False): """Swaps batch and trajectory axis. Arguments: policy: Policy reference + seq_lens: Sequence lengths if recurrent or None tensor: A tensor or list of tensors to reshape. drop_last: A bool indicating whether to drop the last trajectory item. @@ -129,10 +130,12 @@ def _make_time_major(policy, tensor, drop_last=False): swapped axes. """ if isinstance(tensor, list): - return [_make_time_major(policy, t, drop_last) for t in tensor] + return [ + _make_time_major(policy, seq_lens, t, drop_last) for t in tensor + ] - if policy.state_in: - B = tf.shape(policy.seq_lens)[0] + if policy.is_recurrent(): + B = tf.shape(seq_lens)[0] T = tf.shape(tensor)[0] // B else: # Important: chop the tensor into batches at known episode cut @@ -150,7 +153,10 @@ def _make_time_major(policy, tensor, drop_last=False): return res -def build_vtrace_loss(policy, batch_tensors): +def build_vtrace_loss(policy, model, dist_class, train_batch): + model_out, _ = model.from_batch(train_batch) + action_dist = dist_class(model_out, model) + if isinstance(policy.action_space, gym.spaces.Discrete): is_multidiscrete = False output_hidden_shape = [policy.action_space.n] @@ -163,22 +169,22 @@ def build_vtrace_loss(policy, batch_tensors): output_hidden_shape = 1 def make_time_major(*args, **kw): - return _make_time_major(policy, *args, **kw) + return _make_time_major(policy, train_batch.get("seq_lens"), *args, + **kw) - actions = batch_tensors[SampleBatch.ACTIONS] - dones = batch_tensors[SampleBatch.DONES] - rewards = batch_tensors[SampleBatch.REWARDS] - behaviour_action_logp = batch_tensors[ACTION_LOGP] - behaviour_logits = batch_tensors[BEHAVIOUR_LOGITS] + actions = train_batch[SampleBatch.ACTIONS] + dones = train_batch[SampleBatch.DONES] + rewards = train_batch[SampleBatch.REWARDS] + behaviour_action_logp = train_batch[ACTION_LOGP] + behaviour_logits = train_batch[BEHAVIOUR_LOGITS] unpacked_behaviour_logits = tf.split( behaviour_logits, output_hidden_shape, axis=1) - unpacked_outputs = tf.split(policy.model_out, output_hidden_shape, axis=1) - action_dist = policy.action_dist - values = policy.value_function + unpacked_outputs = tf.split(model_out, output_hidden_shape, axis=1) + values = model.value_function() - if policy.state_in: - max_seq_len = tf.reduce_max(policy.seq_lens) - 1 - mask = tf.sequence_mask(policy.seq_lens, max_seq_len) + if policy.is_recurrent(): + max_seq_len = tf.reduce_max(train_batch["seq_lens"]) - 1 + mask = tf.sequence_mask(train_batch["seq_lens"], max_seq_len) mask = tf.reshape(mask, [-1]) else: mask = tf.ones_like(rewards) @@ -204,8 +210,8 @@ def build_vtrace_loss(policy, batch_tensors): rewards=make_time_major(rewards, drop_last=True), values=make_time_major(values, drop_last=True), bootstrap_value=make_time_major(values)[-1], - dist_class=Categorical if is_multidiscrete else policy.dist_class, - model=policy.model, + dist_class=Categorical if is_multidiscrete else dist_class, + model=model, valid_mask=make_time_major(mask, drop_last=True), config=policy.config, vf_loss_coeff=policy.config["vf_loss_coeff"], @@ -216,16 +222,19 @@ def build_vtrace_loss(policy, batch_tensors): return policy.loss.total_loss -def stats(policy, batch_tensors): +def stats(policy, train_batch): values_batched = _make_time_major( - policy, policy.value_function, drop_last=policy.config["vtrace"]) + policy, + train_batch.get("seq_lens"), + policy.model.value_function(), + drop_last=policy.config["vtrace"]) return { "cur_lr": tf.cast(policy.cur_lr, tf.float64), "policy_loss": policy.loss.pi_loss, "entropy": policy.loss.entropy, "entropy_coeff": tf.cast(policy.entropy_coeff, tf.float64), - "var_gnorm": tf.global_norm(policy.var_list), + "var_gnorm": tf.global_norm(policy.model.trainable_variables()), "vf_loss": policy.loss.vf_loss, "vf_explained_var": explained_variance( tf.reshape(policy.loss.value_targets, [-1]), @@ -233,7 +242,7 @@ def stats(policy, batch_tensors): } -def grad_stats(policy, batch_tensors, grads): +def grad_stats(policy, train_batch, grads): return { "grad_gnorm": tf.global_norm(grads), } @@ -249,7 +258,7 @@ def postprocess_trajectory(policy, def add_behaviour_logits(policy): - return {BEHAVIOUR_LOGITS: policy.model_out} + return {BEHAVIOUR_LOGITS: policy.model.last_output()} def validate_config(policy, obs_space, action_space, config): @@ -267,36 +276,18 @@ def choose_optimizer(policy, config): def clip_gradients(policy, optimizer, loss): - grads = tf.gradients(loss, policy.var_list) + grads_and_vars = optimizer.compute_gradients( + loss, policy.model.trainable_variables()) + grads = [g for (g, v) in grads_and_vars] policy.grads, _ = tf.clip_by_global_norm(grads, policy.config["grad_clip"]) - clipped_grads = list(zip(policy.grads, policy.var_list)) + clipped_grads = list(zip(policy.grads, policy.model.trainable_variables())) return clipped_grads -class ValueNetworkMixin(object): - def __init__(self): - self.value_function = self.model.value_function() - self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, - tf.get_variable_scope().name) - - def value(self, ob, *args): - feed_dict = { - self.get_placeholder(SampleBatch.CUR_OBS): [ob], - self.seq_lens: [1] - } - assert len(args) == len(self.state_in), \ - (args, self.state_in) - for k, v in zip(self.state_in, args): - feed_dict[k] = v - vf = self.get_session().run(self.value_function, feed_dict) - return vf[0] - - def setup_mixins(policy, obs_space, action_space, config): LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"]) EntropyCoeffSchedule.__init__(policy, config["entropy_coeff"], config["entropy_coeff_schedule"]) - ValueNetworkMixin.__init__(policy) VTraceTFPolicy = build_tf_policy( @@ -311,5 +302,5 @@ VTraceTFPolicy = build_tf_policy( extra_action_fetches_fn=add_behaviour_logits, before_init=validate_config, before_loss_init=setup_mixins, - mixins=[LearningRateSchedule, EntropyCoeffSchedule, ValueNetworkMixin], + mixins=[LearningRateSchedule, EntropyCoeffSchedule], get_batch_divisibility_req=lambda p: p.config["sample_batch_size"]) diff --git a/rllib/agents/pg/pg_policy.py b/rllib/agents/pg/pg_policy.py index 7cca61392..1b8f6a4b6 100644 --- a/rllib/agents/pg/pg_policy.py +++ b/rllib/agents/pg/pg_policy.py @@ -13,13 +13,15 @@ tf = try_import_tf() # The basic policy gradients loss -def policy_gradient_loss(policy, batch_tensors): - actions = batch_tensors[SampleBatch.ACTIONS] - advantages = batch_tensors[Postprocessing.ADVANTAGES] - return -tf.reduce_mean(policy.action_dist.logp(actions) * advantages) +def policy_gradient_loss(policy, model, dist_class, train_batch): + logits, _ = model.from_batch(train_batch) + action_dist = dist_class(logits, model) + return -tf.reduce_mean( + action_dist.logp(train_batch[SampleBatch.ACTIONS]) * + train_batch[Postprocessing.ADVANTAGES]) -# This adds the "advantages" column to the sample batch. +# This adds the "advantages" column to the sampletrain_batch. def postprocess_advantages(policy, sample_batch, other_agent_batches=None, diff --git a/rllib/agents/pg/torch_pg_policy.py b/rllib/agents/pg/torch_pg_policy.py index 1e1fca7c4..362a86b9e 100644 --- a/rllib/agents/pg/torch_pg_policy.py +++ b/rllib/agents/pg/torch_pg_policy.py @@ -9,14 +9,12 @@ from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.policy.torch_policy_template import build_torch_policy -def pg_torch_loss(policy, batch_tensors): - logits, _ = policy.model({ - SampleBatch.CUR_OBS: batch_tensors[SampleBatch.CUR_OBS] - }) - action_dist = policy.dist_class(logits, policy.model) - log_probs = action_dist.logp(batch_tensors[SampleBatch.ACTIONS]) +def pg_torch_loss(policy, model, dist_class, train_batch): + logits, _ = model.from_batch(train_batch) + action_dist = dist_class(logits, model) + log_probs = action_dist.logp(train_batch[SampleBatch.ACTIONS]) # save the error in the policy object - policy.pi_err = -batch_tensors[Postprocessing.ADVANTAGES].dot( + policy.pi_err = -train_batch[Postprocessing.ADVANTAGES].dot( log_probs.reshape(-1)) return policy.pi_err @@ -29,7 +27,7 @@ def postprocess_advantages(policy, sample_batch, 0.0, policy.config["gamma"], use_gae=False) -def pg_loss_stats(policy, batch_tensors): +def pg_loss_stats(policy, train_batch): # the error is recorded when computing the loss return {"policy_loss": policy.pi_err.item()} diff --git a/rllib/agents/ppo/appo_policy.py b/rllib/agents/ppo/appo_policy.py index 6ecc8189a..eb3f5ad7f 100644 --- a/rllib/agents/ppo/appo_policy.py +++ b/rllib/agents/ppo/appo_policy.py @@ -12,8 +12,7 @@ import gym from ray.rllib.agents.impala import vtrace from ray.rllib.agents.impala.vtrace_policy import _make_time_major, \ - BEHAVIOUR_LOGITS, clip_gradients, \ - validate_config, choose_optimizer, ValueNetworkMixin + BEHAVIOUR_LOGITS, clip_gradients, validate_config, choose_optimizer from ray.rllib.evaluation.postprocessing import Postprocessing from ray.rllib.models.tf.tf_action_dist import Categorical from ray.rllib.policy.sample_batch import SampleBatch @@ -21,9 +20,10 @@ from ray.rllib.evaluation.postprocessing import compute_advantages from ray.rllib.utils import try_import_tf from ray.rllib.policy.tf_policy_template import build_tf_policy from ray.rllib.policy.tf_policy import LearningRateSchedule -from ray.rllib.agents.ppo.ppo_policy import KLCoeffMixin +from ray.rllib.agents.ppo.ppo_policy import KLCoeffMixin, ValueNetworkMixin from ray.rllib.models import ModelCatalog from ray.rllib.utils.explained_variance import explained_variance +from ray.rllib.utils.tf_ops import make_tf_callable tf = try_import_tf() @@ -204,10 +204,12 @@ class VTraceSurrogateLoss(object): def build_appo_model(policy, obs_space, action_space, config): + _, logit_dim = ModelCatalog.get_action_dist(action_space, config["model"]) + policy.model = ModelCatalog.get_model_v2( obs_space, action_space, - policy.logit_dim, + logit_dim, config["model"], name=POLICY_SCOPE, framework="tf") @@ -215,7 +217,7 @@ def build_appo_model(policy, obs_space, action_space, config): policy.target_model = ModelCatalog.get_model_v2( obs_space, action_space, - policy.logit_dim, + logit_dim, config["model"], name=TARGET_POLICY_SCOPE, framework="tf") @@ -223,7 +225,10 @@ def build_appo_model(policy, obs_space, action_space, config): return policy.model -def build_appo_surrogate_loss(policy, batch_tensors): +def build_appo_surrogate_loss(policy, model, dist_class, train_batch): + model_out, _ = model.from_batch(train_batch) + action_dist = dist_class(model_out, model) + if isinstance(policy.action_space, gym.spaces.Discrete): is_multidiscrete = False output_hidden_shape = [policy.action_space.n] @@ -236,41 +241,38 @@ def build_appo_surrogate_loss(policy, batch_tensors): output_hidden_shape = 1 def make_time_major(*args, **kw): - return _make_time_major(policy, *args, **kw) + return _make_time_major(policy, train_batch.get("seq_lens"), *args, + **kw) - actions = batch_tensors[SampleBatch.ACTIONS] - dones = batch_tensors[SampleBatch.DONES] - rewards = batch_tensors[SampleBatch.REWARDS] + actions = train_batch[SampleBatch.ACTIONS] + dones = train_batch[SampleBatch.DONES] + rewards = train_batch[SampleBatch.REWARDS] + behaviour_logits = train_batch[BEHAVIOUR_LOGITS] - behaviour_logits = batch_tensors[BEHAVIOUR_LOGITS] - - policy.target_model_out, _ = policy.target_model( - policy.input_dict, policy.state_in, policy.seq_lens) - old_policy_behaviour_logits = tf.stop_gradient(policy.target_model_out) + target_model_out, _ = policy.target_model.from_batch(train_batch) + old_policy_behaviour_logits = tf.stop_gradient(target_model_out) unpacked_behaviour_logits = tf.split( behaviour_logits, output_hidden_shape, axis=1) unpacked_old_policy_behaviour_logits = tf.split( old_policy_behaviour_logits, output_hidden_shape, axis=1) - unpacked_outputs = tf.split(policy.model_out, output_hidden_shape, axis=1) - action_dist = policy.action_dist - old_policy_action_dist = policy.dist_class(old_policy_behaviour_logits, - policy.model) - prev_action_dist = policy.dist_class(behaviour_logits, policy.model) - values = policy.value_function + unpacked_outputs = tf.split(model_out, output_hidden_shape, axis=1) + old_policy_action_dist = dist_class(old_policy_behaviour_logits, model) + prev_action_dist = dist_class(behaviour_logits, policy.model) + values = policy.model.value_function() policy.model_vars = policy.model.variables() policy.target_model_vars = policy.target_model.variables() - if policy.state_in: - max_seq_len = tf.reduce_max(policy.seq_lens) - 1 - mask = tf.sequence_mask(policy.seq_lens, max_seq_len) + if policy.is_recurrent(): + max_seq_len = tf.reduce_max(train_batch["seq_lens"]) - 1 + mask = tf.sequence_mask(train_batch["seq_lens"], max_seq_len) mask = tf.reshape(mask, [-1]) else: mask = tf.ones_like(rewards) if policy.config["vtrace"]: - logger.info("Using V-Trace surrogate loss (vtrace=True)") + logger.debug("Using V-Trace surrogate loss (vtrace=True)") # Prepare actions for loss loss_actions = actions if is_multidiscrete else tf.expand_dims( @@ -302,7 +304,7 @@ def build_appo_surrogate_loss(policy, batch_tensors): rewards=make_time_major(rewards, drop_last=True), values=make_time_major(values, drop_last=True), bootstrap_value=make_time_major(values)[-1], - dist_class=Categorical if is_multidiscrete else policy.dist_class, + dist_class=Categorical if is_multidiscrete else dist_class, model=policy.model, valid_mask=make_time_major(mask, drop_last=True), vf_loss_coeff=policy.config["vf_loss_coeff"], @@ -314,7 +316,7 @@ def build_appo_surrogate_loss(policy, batch_tensors): cur_kl_coeff=policy.kl_coeff, use_kl_loss=policy.config["use_kl_loss"]) else: - logger.info("Using PPO surrogate loss (vtrace=False)") + logger.debug("Using PPO surrogate loss (vtrace=False)") # Prepare KL for Loss mean_kl = make_time_major(prev_action_dist.multi_kl(action_dist)) @@ -327,10 +329,9 @@ def build_appo_surrogate_loss(policy, batch_tensors): actions_entropy=make_time_major(action_dist.multi_entropy()), values=make_time_major(values), valid_mask=make_time_major(mask), - advantages=make_time_major( - batch_tensors[Postprocessing.ADVANTAGES]), + advantages=make_time_major(train_batch[Postprocessing.ADVANTAGES]), value_targets=make_time_major( - batch_tensors[Postprocessing.VALUE_TARGETS]), + train_batch[Postprocessing.VALUE_TARGETS]), vf_loss_coeff=policy.config["vf_loss_coeff"], entropy_coeff=policy.config["entropy_coeff"], clip_param=policy.config["clip_param"], @@ -340,15 +341,18 @@ def build_appo_surrogate_loss(policy, batch_tensors): return policy.loss.total_loss -def stats(policy, batch_tensors): +def stats(policy, train_batch): values_batched = _make_time_major( - policy, policy.value_function, drop_last=policy.config["vtrace"]) + policy, + train_batch.get("seq_lens"), + policy.model.value_function(), + drop_last=policy.config["vtrace"]) stats_dict = { "cur_lr": tf.cast(policy.cur_lr, tf.float64), "policy_loss": policy.loss.pi_loss, "entropy": policy.loss.entropy, - "var_gnorm": tf.global_norm(policy.var_list), + "var_gnorm": tf.global_norm(policy.model.trainable_variables()), "vf_loss": policy.loss.vf_loss, "vf_explained_var": explained_variance( tf.reshape(policy.loss.value_targets, [-1]), @@ -377,9 +381,12 @@ def postprocess_trajectory(policy, last_r = 0.0 else: next_state = [] - for i in range(len(policy.state_in)): + for i in range(policy.num_state_tensors()): next_state.append([sample_batch["state_out_{}".format(i)][-1]]) - last_r = policy.value(sample_batch["new_obs"][-1], *next_state) + last_r = policy._value(sample_batch[SampleBatch.NEXT_OBS][-1], + sample_batch[SampleBatch.ACTIONS][-1], + sample_batch[SampleBatch.REWARDS][-1], + *next_state) batch = compute_advantages( sample_batch, last_r, @@ -393,9 +400,9 @@ def postprocess_trajectory(policy, def add_values_and_logits(policy): - out = {BEHAVIOUR_LOGITS: policy.model_out} + out = {BEHAVIOUR_LOGITS: policy.model.last_output()} if not policy.config["vtrace"]: - out[SampleBatch.VF_PREDS] = policy.value_function + out[SampleBatch.VF_PREDS] = policy.model.value_function() return out @@ -406,20 +413,23 @@ class TargetNetworkMixin(object): are importance sampled w.r. to the target network to ensure a more stable pi_old in PPO. """ - assign_ops = [] - assert len(self.model_vars) == len(self.target_model_vars) - for var, var_target in zip(self.model_vars, self.target_model_vars): - assign_ops.append(var_target.assign(var)) - self.update_target_network = tf.group(*assign_ops) - def update_target(self): - return self.get_session().run(self.update_target_network) + @make_tf_callable(self.get_session()) + def do_update(): + assign_ops = [] + assert len(self.model_vars) == len(self.target_model_vars) + for var, var_target in zip(self.model_vars, + self.target_model_vars): + assign_ops.append(var_target.assign(var)) + return tf.group(*assign_ops) + + self.update_target = do_update def setup_mixins(policy, obs_space, action_space, config): LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"]) KLCoeffMixin.__init__(policy, config) - ValueNetworkMixin.__init__(policy) + ValueNetworkMixin.__init__(policy, obs_space, action_space, config) def setup_late_mixins(policy, obs_space, action_space, config): diff --git a/rllib/agents/ppo/ppo.py b/rllib/agents/ppo/ppo.py index 7ccb2eff1..8cdfb4ae2 100644 --- a/rllib/agents/ppo/ppo.py +++ b/rllib/agents/ppo/ppo.py @@ -8,7 +8,9 @@ from ray.rllib.agents import with_common_config from ray.rllib.agents.ppo.ppo_policy import PPOTFPolicy from ray.rllib.agents.trainer_template import build_trainer from ray.rllib.optimizers import SyncSamplesOptimizer, LocalMultiGPUOptimizer +from ray.rllib.utils import try_import_tf +tf = try_import_tf() logger = logging.getLogger(__name__) # yapf: disable @@ -71,7 +73,8 @@ def choose_policy_optimizer(workers, config): return SyncSamplesOptimizer( workers, num_sgd_iter=config["num_sgd_iter"], - train_batch_size=config["train_batch_size"]) + train_batch_size=config["train_batch_size"], + sgd_minibatch_size=config["sgd_minibatch_size"]) return LocalMultiGPUOptimizer( workers, @@ -140,8 +143,10 @@ def validate_config(config): "simple_optimizer=True if this doesn't work for you.") if config["simple_optimizer"]: logger.warning( - "Using the simple non-minibatch optimizer. This will greatly " + "Using the simple minibatch optimizer. This will significantly " "reduce performance, consider simple_optimizer=False.") + elif tf and tf.executing_eagerly(): + config["simple_optimizer"] = True # multi-gpu not supported PPOTrainer = build_trainer( diff --git a/rllib/agents/ppo/ppo_policy.py b/rllib/agents/ppo/ppo_policy.py index 60d05a5a6..16bb7dbd0 100644 --- a/rllib/agents/ppo/ppo_policy.py +++ b/rllib/agents/ppo/ppo_policy.py @@ -12,6 +12,7 @@ from ray.rllib.policy.tf_policy import LearningRateSchedule, \ EntropyCoeffSchedule, ACTION_LOGP from ray.rllib.policy.tf_policy_template import build_tf_policy from ray.rllib.utils.explained_variance import explained_variance +from ray.rllib.utils.tf_ops import make_tf_callable from ray.rllib.utils import try_import_tf tf = try_import_tf() @@ -111,27 +112,30 @@ class PPOLoss(object): self.loss = loss -def ppo_surrogate_loss(policy, batch_tensors): - if policy.state_in: - max_seq_len = tf.reduce_max(policy.seq_lens) - mask = tf.sequence_mask(policy.seq_lens, max_seq_len) +def ppo_surrogate_loss(policy, model, dist_class, train_batch): + logits, state = model.from_batch(train_batch) + action_dist = dist_class(logits, model) + + if state: + max_seq_len = tf.reduce_max(train_batch["seq_lens"]) + mask = tf.sequence_mask(train_batch["seq_lens"], max_seq_len) mask = tf.reshape(mask, [-1]) else: mask = tf.ones_like( - batch_tensors[Postprocessing.ADVANTAGES], dtype=tf.bool) + train_batch[Postprocessing.ADVANTAGES], dtype=tf.bool) policy.loss_obj = PPOLoss( policy.action_space, - policy.dist_class, - policy.model, - batch_tensors[Postprocessing.VALUE_TARGETS], - batch_tensors[Postprocessing.ADVANTAGES], - batch_tensors[SampleBatch.ACTIONS], - batch_tensors[BEHAVIOUR_LOGITS], - batch_tensors[ACTION_LOGP], - batch_tensors[SampleBatch.VF_PREDS], - policy.action_dist, - policy.value_function, + dist_class, + model, + train_batch[Postprocessing.VALUE_TARGETS], + train_batch[Postprocessing.ADVANTAGES], + train_batch[SampleBatch.ACTIONS], + train_batch[BEHAVIOUR_LOGITS], + train_batch[ACTION_LOGP], + train_batch[SampleBatch.VF_PREDS], + action_dist, + model.value_function(), policy.kl_coeff, mask, entropy_coeff=policy.entropy_coeff, @@ -144,7 +148,7 @@ def ppo_surrogate_loss(policy, batch_tensors): return policy.loss_obj.loss -def kl_and_loss_stats(policy, batch_tensors): +def kl_and_loss_stats(policy, train_batch): return { "cur_kl_coeff": tf.cast(policy.kl_coeff, tf.float64), "cur_lr": tf.cast(policy.cur_lr, tf.float64), @@ -152,8 +156,8 @@ def kl_and_loss_stats(policy, batch_tensors): "policy_loss": policy.loss_obj.mean_policy_loss, "vf_loss": policy.loss_obj.mean_vf_loss, "vf_explained_var": explained_variance( - batch_tensors[Postprocessing.VALUE_TARGETS], - policy.value_function), + train_batch[Postprocessing.VALUE_TARGETS], + policy.model.value_function()), "kl": policy.loss_obj.mean_kl, "entropy": policy.loss_obj.mean_entropy, "entropy_coeff": tf.cast(policy.entropy_coeff, tf.float64), @@ -161,10 +165,10 @@ def kl_and_loss_stats(policy, batch_tensors): def vf_preds_and_logits_fetches(policy): - """Adds value function and logits outputs to experience batches.""" + """Adds value function and logits outputs to experience train_batches.""" return { - SampleBatch.VF_PREDS: policy.value_function, - BEHAVIOUR_LOGITS: policy.model_out, + SampleBatch.VF_PREDS: policy.model.value_function(), + BEHAVIOUR_LOGITS: policy.model.last_output(), } @@ -179,7 +183,7 @@ def postprocess_ppo_gae(policy, last_r = 0.0 else: next_state = [] - for i in range(len(policy.state_in)): + for i in range(policy.num_state_tensors()): next_state.append([sample_batch["state_out_{}".format(i)][-1]]) last_r = policy._value(sample_batch[SampleBatch.NEXT_OBS][-1], sample_batch[SampleBatch.ACTIONS][-1], @@ -195,17 +199,16 @@ def postprocess_ppo_gae(policy, def clip_gradients(policy, optimizer, loss): + variables = policy.model.trainable_variables() if policy.config["grad_clip"] is not None: - policy.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, - tf.get_variable_scope().name) - grads = tf.gradients(loss, policy.var_list) + grads_and_vars = optimizer.compute_gradients(loss, variables) + grads = [g for (g, v) in grads_and_vars] policy.grads, _ = tf.clip_by_global_norm(grads, policy.config["grad_clip"]) - clipped_grads = list(zip(policy.grads, policy.var_list)) + clipped_grads = list(zip(policy.grads, variables)) return clipped_grads else: - return optimizer.compute_gradients( - loss, colocate_gradients_with_ops=True) + return optimizer.compute_gradients(loss, variables) class KLCoeffMixin(object): @@ -232,23 +235,27 @@ class KLCoeffMixin(object): class ValueNetworkMixin(object): def __init__(self, obs_space, action_space, config): if config["use_gae"]: - self.value_function = self.model.value_function() - else: - self.value_function = tf.zeros( - shape=tf.shape(self.get_placeholder(SampleBatch.CUR_OBS))[:1]) - def _value(self, ob, prev_action, prev_reward, *args): - feed_dict = { - self.get_placeholder(SampleBatch.CUR_OBS): [ob], - self.get_placeholder(SampleBatch.PREV_ACTIONS): [prev_action], - self.get_placeholder(SampleBatch.PREV_REWARDS): [prev_reward], - self.seq_lens: [1] - } - assert len(args) == len(self.state_in), (args, self.state_in) - for k, v in zip(self.state_in, args): - feed_dict[k] = v - vf = self.get_session().run(self.value_function, feed_dict) - return vf[0] + @make_tf_callable(self.get_session()) + def value(ob, prev_action, prev_reward, *state): + model_out, _ = self.model({ + SampleBatch.CUR_OBS: tf.convert_to_tensor([ob]), + SampleBatch.PREV_ACTIONS: tf.convert_to_tensor( + [prev_action]), + SampleBatch.PREV_REWARDS: tf.convert_to_tensor( + [prev_reward]), + "is_training": tf.convert_to_tensor(False), + }, [tf.convert_to_tensor([s]) for s in state], + tf.convert_to_tensor([1])) + return self.model.value_function()[0] + + else: + + @make_tf_callable(self.get_session()) + def value(ob, prev_action, prev_reward, *state): + return tf.constant(0.0) + + self._value = value def setup_config(policy, obs_space, action_space, config): diff --git a/rllib/agents/sac/sac.py b/rllib/agents/sac/sac.py index eff9cfe88..41e2b94b6 100644 --- a/rllib/agents/sac/sac.py +++ b/rllib/agents/sac/sac.py @@ -53,7 +53,7 @@ DEFAULT_CONFIG = with_common_config({ # === Exploration === # Number of env steps to optimize for before returning - "timesteps_per_iteration": 1000, + "timesteps_per_iteration": 100, "exploration_enabled": True, # === Replay buffer === diff --git a/rllib/agents/sac/sac_model.py b/rllib/agents/sac/sac_model.py index 5cc7e6cff..c10d54b0c 100644 --- a/rllib/agents/sac/sac_model.py +++ b/rllib/agents/sac/sac_model.py @@ -157,12 +157,6 @@ class SACModel(TFModelV2): self.register_variables([self.log_alpha]) - def forward(self, input_dict, state, seq_lens): - """This generates the model_out tensor input. - - You must implement this as documented in modelv2.py.""" - raise NotImplementedError - def get_policy_output(self, model_out, deterministic=False): """Return the (unscaled) output of the policy network. diff --git a/rllib/agents/sac/sac_policy.py b/rllib/agents/sac/sac_policy.py index 6aadbbba9..8fb4e36e6 100644 --- a/rllib/agents/sac/sac_policy.py +++ b/rllib/agents/sac/sac_policy.py @@ -10,6 +10,8 @@ import ray import ray.experimental.tf_utils from ray.rllib.agents.sac.sac_model import SACModel from ray.rllib.agents.ddpg.noop_model import NoopModel +from ray.rllib.agents.ddpg.ddpg_policy import ComputeTDErrorMixin, \ + TargetNetworkMixin from ray.rllib.agents.dqn.dqn_policy import _postprocess_dqn, PRIO_WEIGHTS from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.policy.tf_policy_template import build_tf_policy @@ -87,12 +89,6 @@ def postprocess_trajectory(policy, return _postprocess_dqn(policy, sample_batch) -def exploration_setting_inputs(policy): - return { - policy.stochastic: policy.config["exploration_enabled"], - } - - def build_action_output(policy, model, input_dict, obs_space, action_space, config): model_out, _ = model({ @@ -129,37 +125,36 @@ def build_action_output(policy, model, input_dict, obs_space, action_space, return actions, action_probabilities -def actor_critic_loss(policy, batch_tensors): - model_out_t, _ = policy.model({ - "obs": batch_tensors[SampleBatch.CUR_OBS], +def actor_critic_loss(policy, model, _, train_batch): + model_out_t, _ = model({ + "obs": train_batch[SampleBatch.CUR_OBS], "is_training": policy._get_is_training_placeholder(), }, [], None) - model_out_tp1, _ = policy.model({ - "obs": batch_tensors[SampleBatch.NEXT_OBS], + model_out_tp1, _ = model({ + "obs": train_batch[SampleBatch.NEXT_OBS], "is_training": policy._get_is_training_placeholder(), }, [], None) target_model_out_tp1, _ = policy.target_model({ - "obs": batch_tensors[SampleBatch.NEXT_OBS], + "obs": train_batch[SampleBatch.NEXT_OBS], "is_training": policy._get_is_training_placeholder(), }, [], None) # TODO(hartikainen): figure actions and log pis - policy_t, log_pis_t = policy.model.get_policy_output(model_out_t) - policy_tp1, log_pis_tp1 = policy.model.get_policy_output(model_out_tp1) + policy_t, log_pis_t = model.get_policy_output(model_out_t) + policy_tp1, log_pis_tp1 = model.get_policy_output(model_out_tp1) - log_alpha = policy.model.log_alpha - alpha = policy.model.alpha + log_alpha = model.log_alpha + alpha = model.alpha # q network evaluation - q_t = policy.model.get_q_values(model_out_t, - batch_tensors[SampleBatch.ACTIONS]) + q_t = model.get_q_values(model_out_t, train_batch[SampleBatch.ACTIONS]) if policy.config["twin_q"]: - twin_q_t = policy.model.get_twin_q_values( - model_out_t, batch_tensors[SampleBatch.ACTIONS]) + twin_q_t = model.get_twin_q_values(model_out_t, + train_batch[SampleBatch.ACTIONS]) # Q-values for current policy (no noise) in given current state - q_t_det_policy = policy.model.get_q_values(model_out_t, policy_t) + q_t_det_policy = model.get_q_values(model_out_t, policy_t) # target q network evaluation q_tp1 = policy.target_model.get_q_values(target_model_out_tp1, policy_tp1) @@ -175,14 +170,14 @@ def actor_critic_loss(policy, batch_tensors): q_tp1 -= tf.expand_dims(alpha * log_pis_t, 1) q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1) - q_tp1_best_masked = (1.0 - tf.cast(batch_tensors[SampleBatch.DONES], - tf.float32)) * q_tp1_best + q_tp1_best_masked = ( + 1.0 - tf.cast(train_batch[SampleBatch.DONES], tf.float32)) * q_tp1_best assert policy.config["n_step"] == 1, "TODO(hartikainen) n_step > 1" # compute RHS of bellman equation q_t_selected_target = tf.stop_gradient( - batch_tensors[SampleBatch.REWARDS] + + train_batch[SampleBatch.REWARDS] + policy.config["gamma"]**policy.config["n_step"] * q_tp1_best_masked) # compute the error (potentially clipped) @@ -195,8 +190,8 @@ def actor_critic_loss(policy, batch_tensors): td_error = q_t_selected - q_t_selected_target errors = 0.5 * tf.square(td_error) - critic_loss = policy.model.custom_loss( - tf.reduce_mean(batch_tensors[PRIO_WEIGHTS] * errors), batch_tensors) + critic_loss = model.custom_loss( + tf.reduce_mean(train_batch[PRIO_WEIGHTS] * errors), train_batch) actor_loss = tf.reduce_mean(alpha * log_pis_t - q_t_det_policy) target_entropy = (-np.prod(policy.action_space.shape) @@ -220,27 +215,27 @@ def actor_critic_loss(policy, batch_tensors): def gradients(policy, optimizer, loss): if policy.config["grad_norm_clipping"] is not None: actor_grads_and_vars = minimize_and_clip( - policy._actor_optimizer, + optimizer, policy.actor_loss, var_list=policy.model.policy_variables(), clip_val=policy.config["grad_norm_clipping"]) critic_grads_and_vars = minimize_and_clip( - policy._critic_optimizer, + optimizer, policy.critic_loss, var_list=policy.model.q_variables(), clip_val=policy.config["grad_norm_clipping"]) alpha_grads_and_vars = minimize_and_clip( - policy._alpha_optimizer, + optimizer, policy.alpha_loss, - var_list=policy.model.alpha, + var_list=[policy.model.log_alpha], clip_val=policy.config["grad_norm_clipping"]) else: - actor_grads_and_vars = policy._actor_optimizer.compute_gradients( + actor_grads_and_vars = optimizer.compute_gradients( policy.actor_loss, var_list=policy.model.policy_variables()) - critic_grads_and_vars = policy._critic_optimizer.compute_gradients( + critic_grads_and_vars = optimizer.compute_gradients( policy.critic_loss, var_list=policy.model.q_variables()) - alpha_grads_and_vars = policy._critic_optimizer.compute_gradients( - policy.alpha_loss, var_list=policy.model.alpha) + alpha_grads_and_vars = optimizer.compute_gradients( + policy.alpha_loss, var_list=[policy.model.log_alpha]) # save these for later use in build_apply_op policy._actor_grads_and_vars = [(g, v) for (g, v) in actor_grads_and_vars if g is not None] @@ -254,7 +249,7 @@ def gradients(policy, optimizer, loss): return grads_and_vars -def stats(policy, batch_tensors): +def stats(policy, train_batch): return { "td_error": tf.reduce_mean(policy.td_error), "actor_loss": tf.reduce_mean(policy.actor_loss), @@ -267,40 +262,17 @@ def stats(policy, batch_tensors): class ExplorationStateMixin(object): def __init__(self, obs_space, action_space, config): - self.stochastic = tf.placeholder(tf.bool, (), name="stochastic") + self.stochastic = tf.get_variable( + initializer=tf.constant_initializer(config["exploration_enabled"]), + name="stochastic", + shape=(), + trainable=False, + dtype=tf.bool) def set_epsilon(self, epsilon): pass -class TargetNetworkMixin(object): - def __init__(self, config): - # update_target_fn will be called periodically to copy Q network to - # target Q network - self.tau_value = config.get("tau") - self.tau = tf.placeholder(tf.float32, (), name="tau") - update_target_expr = [] - model_vars = self.model.trainable_variables() - target_model_vars = self.target_model.trainable_variables() - assert len(model_vars) == len(target_model_vars), \ - (model_vars, target_model_vars) - for var, var_target in zip(model_vars, target_model_vars): - update_target_expr.append( - var_target.assign(self.tau * var + - (1.0 - self.tau) * var_target)) - logger.debug("Update target op {}".format(var_target)) - self.update_target_expr = tf.group(*update_target_expr) - - # Hard initial update - self.update_target(tau=1.0) - - # support both hard and soft sync - def update_target(self, tau=None): - tau = tau or self.tau_value - return self.get_session().run( - self.update_target_expr, feed_dict={self.tau: tau}) - - class ActorCriticOptimizerMixin(object): def __init__(self, config): # create global step for counting the number of update operations @@ -315,34 +287,15 @@ class ActorCriticOptimizerMixin(object): learning_rate=config["optimization"]["entropy_learning_rate"]) -class ComputeTDErrorMixin(object): - def compute_td_error(self, obs_t, act_t, rew_t, obs_tp1, done_mask, - importance_weights): - if not self.loss_initialized(): - return np.zeros_like(rew_t) - - td_err = self.get_session().run( - self.td_error, - feed_dict={ - self.get_placeholder(SampleBatch.CUR_OBS): [ - np.array(ob) for ob in obs_t - ], - self.get_placeholder(SampleBatch.ACTIONS): act_t, - self.get_placeholder(SampleBatch.REWARDS): rew_t, - self.get_placeholder(SampleBatch.NEXT_OBS): [ - np.array(ob) for ob in obs_tp1 - ], - self.get_placeholder(SampleBatch.DONES): done_mask, - self.get_placeholder(PRIO_WEIGHTS): importance_weights - }) - return td_err - - def setup_early_mixins(policy, obs_space, action_space, config): ExplorationStateMixin.__init__(policy, obs_space, action_space, config) ActorCriticOptimizerMixin.__init__(policy, config) +def setup_mid_mixins(policy, obs_space, action_space, config): + ComputeTDErrorMixin.__init__(policy) + + def setup_late_mixins(policy, obs_space, action_space, config): TargetNetworkMixin.__init__(policy, config) @@ -352,7 +305,6 @@ SACTFPolicy = build_tf_policy( get_default_config=lambda: ray.rllib.agents.sac.sac.DEFAULT_CONFIG, make_model=build_sac_model, postprocess_fn=postprocess_trajectory, - extra_action_feed_fn=exploration_setting_inputs, action_sampler_fn=build_action_output, loss_fn=actor_critic_loss, stats_fn=stats, @@ -363,5 +315,6 @@ SACTFPolicy = build_tf_policy( ComputeTDErrorMixin ], before_init=setup_early_mixins, + before_loss_init=setup_mid_mixins, after_init=setup_late_mixins, obs_include_prev_action_reward=False) diff --git a/rllib/agents/trainer.py b/rllib/agents/trainer.py index 78d4cdf6e..fd4320e91 100644 --- a/rllib/agents/trainer.py +++ b/rllib/agents/trainer.py @@ -70,6 +70,8 @@ COMMON_CONFIG = { "ignore_worker_failures": False, # Log system resource metrics to results. "log_sys_usage": True, + # Enable TF eager execution (TF policies only) + "eager": False, # === Policy === # Arguments to pass to model. See models/catalog.py for a full list of the @@ -326,6 +328,14 @@ class Trainer(Trainable): config = config or {} + if tf and config.get("eager"): + tf.enable_eager_execution() + logger.info("Executing eagerly") + + if tf and not tf.executing_eagerly(): + logger.info("Tip: set 'eager': true or the --eager flag to enable " + "TensorFlow eager execution") + # Vars to synchronize to workers on each train call self.global_vars = {"timestep": 0} @@ -464,7 +474,7 @@ class Trainer(Trainable): logging.getLogger("ray.rllib").setLevel(self.config["log_level"]) def get_scope(): - if tf: + if tf and not tf.executing_eagerly(): return tf.Graph().as_default() else: return open("/dev/null") # fake a no-op scope diff --git a/rllib/agents/trainer_template.py b/rllib/agents/trainer_template.py index 70bdf254d..dae1de45e 100644 --- a/rllib/agents/trainer_template.py +++ b/rllib/agents/trainer_template.py @@ -8,6 +8,9 @@ from ray.rllib.agents.trainer import Trainer, COMMON_CONFIG from ray.rllib.optimizers import SyncSamplesOptimizer from ray.rllib.utils import add_mixins from ray.rllib.utils.annotations import override, DeveloperAPI +from ray.rllib.utils import try_import_tf + +tf = try_import_tf() @DeveloperAPI diff --git a/rllib/evaluation/rollout_worker.py b/rllib/evaluation/rollout_worker.py index dfa87773f..870a4309e 100644 --- a/rllib/evaluation/rollout_worker.py +++ b/rllib/evaluation/rollout_worker.py @@ -238,6 +238,10 @@ class RolloutWorker(EvaluatorInterface): global _global_worker _global_worker = self + policy_config = policy_config or {} + if tf and policy_config.get("eager"): + tf.enable_eager_execution() + if log_level: logging.getLogger("ray.rllib").setLevel(log_level) @@ -247,7 +251,6 @@ class RolloutWorker(EvaluatorInterface): enable_periodic_logging() env_context = EnvContext(env_config or {}, worker_index) - policy_config = policy_config or {} self.policy_config = policy_config self.callbacks = callbacks or {} self.worker_index = worker_index @@ -322,7 +325,8 @@ class RolloutWorker(EvaluatorInterface): torch.manual_seed(seed) except ImportError: logger.info("Could not seed torch") - if _has_tensorflow_graph(policy_dict): + if _has_tensorflow_graph(policy_dict) and not (tf and + tf.executing_eagerly()): if (ray.is_initialized() and ray.worker._mode() != ray.worker.LOCAL_MODE and not ray.get_gpu_ids()): @@ -608,7 +612,7 @@ class RolloutWorker(EvaluatorInterface): info_out = self.policy_map[DEFAULT_POLICY_ID].learn_on_batch( samples) if log_once("learn_out"): - logger.info("Training output:\n\n{}\n".format(summarize(info_out))) + logger.debug("Training out:\n\n{}\n".format(summarize(info_out))) return info_out @DeveloperAPI @@ -747,6 +751,14 @@ class RolloutWorker(EvaluatorInterface): "Found raw Tuple|Dict space as input to policy. " "Please preprocess these observations with a " "Tuple|DictFlatteningPreprocessor.") + if tf and tf.executing_eagerly(): + if hasattr(cls, "as_eager"): + cls = cls.as_eager() + elif not issubclass(cls, TFPolicy): + pass # could be some other type of policy + else: + raise ValueError("This policy does not support eager " + "execution: {}".format(cls)) if tf: with tf.variable_scope(name): policy_map[name] = cls(obs_space, act_space, merged_conf) diff --git a/rllib/examples/centralized_critic.py b/rllib/examples/centralized_critic.py index 9d07df5af..b26e11762 100644 --- a/rllib/examples/centralized_critic.py +++ b/rllib/examples/centralized_critic.py @@ -78,6 +78,9 @@ class CentralizedCriticModel(TFModelV2): [obs, opponent_obs, tf.one_hot(opponent_actions, 2)]), [-1]) + def value_function(self): + return self.model.value_function() # not used + class CentralizedValueMixin(object): """Add methods to evaluate the central value function from the model.""" @@ -97,7 +100,7 @@ class CentralizedValueMixin(object): return self.get_session().run(self.central_value_function, feed_dict) -# Grabs the opponent obs/act and includes it in the experience batch, +# Grabs the opponent obs/act and includes it in the experience train_batch, # and computes GAE using the central vf predictions. def centralized_critic_postprocessing(policy, sample_batch, @@ -105,7 +108,7 @@ def centralized_critic_postprocessing(policy, episode=None): if policy.loss_initialized(): assert sample_batch["dones"][-1], \ - "Not implemented for batch_mode=truncate_episodes" + "Not implemented for train_batch_mode=truncate_episodes" assert other_agent_batches is not None [(_, opponent_batch)] = list(other_agent_batches.values()) @@ -126,33 +129,36 @@ def centralized_critic_postprocessing(policy, sample_batch[SampleBatch.VF_PREDS] = np.zeros_like( sample_batch[SampleBatch.ACTIONS], dtype=np.float32) - batch = compute_advantages( + train_batch = compute_advantages( sample_batch, 0.0, policy.config["gamma"], policy.config["lambda"], use_gae=policy.config["use_gae"]) - return batch + return train_batch # Copied from PPO but optimizing the central value function -def loss_with_central_critic(policy, batch_tensors): +def loss_with_central_critic(policy, model, dist_class, train_batch): CentralizedValueMixin.__init__(policy) + logits, state = model.from_batch(train_batch) + action_dist = dist_class(logits, model) + policy.loss_obj = PPOLoss( policy.action_space, - policy.dist_class, - policy.model, - batch_tensors[Postprocessing.VALUE_TARGETS], - batch_tensors[Postprocessing.ADVANTAGES], - batch_tensors[SampleBatch.ACTIONS], - batch_tensors[BEHAVIOUR_LOGITS], - batch_tensors[ACTION_LOGP], - batch_tensors[SampleBatch.VF_PREDS], - policy.action_dist, + dist_class, + model, + train_batch[Postprocessing.VALUE_TARGETS], + train_batch[Postprocessing.ADVANTAGES], + train_batch[SampleBatch.ACTIONS], + train_batch[BEHAVIOUR_LOGITS], + train_batch[ACTION_LOGP], + train_batch[SampleBatch.VF_PREDS], + action_dist, policy.central_value_function, policy.kl_coeff, - tf.ones_like(batch_tensors[Postprocessing.ADVANTAGES], dtype=tf.bool), + tf.ones_like(train_batch[Postprocessing.ADVANTAGES], dtype=tf.bool), entropy_coeff=policy.entropy_coeff, clip_param=policy.config["clip_param"], vf_clip_param=policy.config["vf_clip_param"], @@ -174,11 +180,11 @@ def setup_mixins(policy, obs_space, action_space, config): tf.shape(policy.get_placeholder(SampleBatch.CUR_OBS))[0]) -def central_vf_stats(policy, batch_tensors, grads): +def central_vf_stats(policy, train_batch, grads): # Report the explained variance of the central value function. return { "vf_explained_var": explained_variance( - batch_tensors[Postprocessing.VALUE_TARGETS], + train_batch[Postprocessing.VALUE_TARGETS], policy.central_value_function), } diff --git a/rllib/examples/custom_tf_policy.py b/rllib/examples/custom_tf_policy.py index 0442dff83..a3e5698e9 100644 --- a/rllib/examples/custom_tf_policy.py +++ b/rllib/examples/custom_tf_policy.py @@ -7,7 +7,6 @@ import argparse import ray from ray import tune from ray.rllib.agents.trainer_template import build_trainer -from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.policy.tf_policy_template import build_tf_policy from ray.rllib.utils import try_import_tf @@ -17,10 +16,11 @@ parser = argparse.ArgumentParser() parser.add_argument("--iters", type=int, default=200) -def policy_gradient_loss(policy, batch_tensors): - actions = batch_tensors[SampleBatch.ACTIONS] - rewards = batch_tensors[SampleBatch.REWARDS] - return -tf.reduce_mean(policy.action_dist.logp(actions) * rewards) +def policy_gradient_loss(policy, model, dist_class, train_batch): + logits, _ = model.from_batch(train_batch) + action_dist = dist_class(logits, model) + return -tf.reduce_mean( + action_dist.logp(train_batch["actions"]) * train_batch["rewards"]) # diff --git a/rllib/examples/custom_torch_policy.py b/rllib/examples/custom_torch_policy.py index 4fdb3a064..7be747440 100644 --- a/rllib/examples/custom_torch_policy.py +++ b/rllib/examples/custom_torch_policy.py @@ -14,13 +14,11 @@ parser = argparse.ArgumentParser() parser.add_argument("--iters", type=int, default=200) -def policy_gradient_loss(policy, batch_tensors): - logits, _ = policy.model({ - SampleBatch.CUR_OBS: batch_tensors[SampleBatch.CUR_OBS] - }) - action_dist = policy.dist_class(logits, policy.model) - log_probs = action_dist.logp(batch_tensors[SampleBatch.ACTIONS]) - return -batch_tensors[SampleBatch.REWARDS].dot(log_probs) +def policy_gradient_loss(policy, model, dist_class, train_batch): + logits, _ = model({SampleBatch.CUR_OBS: train_batch[SampleBatch.CUR_OBS]}) + action_dist = dist_class(logits, model) + log_probs = action_dist.logp(train_batch[SampleBatch.ACTIONS]) + return -train_batch[SampleBatch.REWARDS].dot(log_probs) # diff --git a/rllib/examples/eager_execution.py b/rllib/examples/eager_execution.py index 710ea2bb5..fe2305207 100644 --- a/rllib/examples/eager_execution.py +++ b/rllib/examples/eager_execution.py @@ -47,7 +47,7 @@ class EagerModel(Model): return feature_layer -def policy_gradient_loss(policy, batch_tensors): +def policy_gradient_loss(policy, model, dist_class, train_batch): """Example of using embedded eager execution in a custom loss. Here `compute_penalty` prints the actions and rewards for debugging, and @@ -61,12 +61,15 @@ def policy_gradient_loss(policy, batch_tensors): print("The eagerly computed penalty is", penalty, actions, rewards) return penalty - actions = batch_tensors[SampleBatch.ACTIONS] - rewards = batch_tensors[SampleBatch.REWARDS] + logits, _ = model.from_batch(train_batch) + action_dist = dist_class(logits, model) + + actions = train_batch[SampleBatch.ACTIONS] + rewards = train_batch[SampleBatch.REWARDS] penalty = tf.py_function( compute_penalty, [actions, rewards], Tout=tf.float32) - return penalty - tf.reduce_mean(policy.action_dist.logp(actions) * rewards) + return penalty - tf.reduce_mean(action_dist.logp(actions) * rewards) # diff --git a/rllib/examples/rock_paper_scissors_multiagent.py b/rllib/examples/rock_paper_scissors_multiagent.py index 634f5ea1d..4a9feb1b7 100644 --- a/rllib/examples/rock_paper_scissors_multiagent.py +++ b/rllib/examples/rock_paper_scissors_multiagent.py @@ -7,7 +7,7 @@ This demonstrates running the following policies in competition: (1) heuristic policy of repeating the same move (2) heuristic policy of beating the last opponent move (3) LSTM/feedforward PG policies - (4) LSTM policy with custom safety loss + (4) LSTM policy with custom entropy loss """ import random @@ -195,11 +195,12 @@ def run_with_custom_entropy_loss(): This performs about the same as the default loss does.""" - def entropy_policy_gradient_loss(policy, batch_tensors): - actions = batch_tensors["actions"] - advantages = batch_tensors["advantages"] - return (-0.1 * policy.action_dist.entropy() - tf.reduce_mean( - policy.action_dist.logp(actions) * advantages)) + def entropy_policy_gradient_loss(policy, model, dist_class, train_batch): + logits, _ = model.from_batch(train_batch) + action_dist = dist_class(logits, model) + return (-0.1 * action_dist.entropy() - tf.reduce_mean( + action_dist.logp(train_batch["actions"]) * + train_batch["advantages"])) EntropyPolicy = PGTFPolicy.with_updates( loss_fn=entropy_policy_gradient_loss) diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py index 33b6b36cb..5cf40408e 100644 --- a/rllib/models/catalog.py +++ b/rllib/models/catalog.py @@ -13,6 +13,8 @@ from ray.tune.registry import RLLIB_MODEL, RLLIB_PREPROCESSOR, \ from ray.rllib.models.extra_spaces import Simplex from ray.rllib.models.torch.torch_action_dist import (TorchCategorical, TorchDiagGaussian) +from ray.rllib.models.tf.fcnet_v2 import FullyConnectedNetwork as FCNetV2 +from ray.rllib.models.tf.visionnet_v2 import VisionNetwork as VisionNetV2 from ray.rllib.models.tf.tf_action_dist import ( Categorical, MultiCategorical, Deterministic, DiagGaussian, MultiActionDistribution, Dirichlet) @@ -176,25 +178,22 @@ class ModelCatalog(object): @staticmethod @DeveloperAPI - def get_action_placeholder(action_space): - """Returns an action placeholder that is consistent with the action space + def get_action_shape(action_space): + """Returns action tensor dtype and shape for the action space. Args: action_space (Space): Action space of the target gym env. Returns: - action_placeholder (Tensor): A placeholder for the actions + (dtype, shape): Dtype and shape of the actions tensor. """ if isinstance(action_space, gym.spaces.Discrete): - return tf.placeholder(tf.int64, shape=(None, ), name="action") + return (tf.int64, (None, )) elif isinstance(action_space, (gym.spaces.Box, Simplex)): - return tf.placeholder( - tf.float32, shape=(None, ) + action_space.shape, name="action") + return (tf.float32, (None, ) + action_space.shape) elif isinstance(action_space, gym.spaces.MultiDiscrete): - return tf.placeholder( - tf.as_dtype(action_space.dtype), - shape=(None, ) + action_space.shape, - name="action") + return (tf.as_dtype(action_space.dtype), + (None, ) + action_space.shape) elif isinstance(action_space, gym.spaces.Tuple): size = 0 all_discrete = True @@ -204,14 +203,26 @@ class ModelCatalog(object): else: all_discrete = False size += np.product(action_space.spaces[i].shape) - return tf.placeholder( - tf.int64 if all_discrete else tf.float32, - shape=(None, size), - name="action") + return (tf.int64 if all_discrete else tf.float32, (None, size)) else: raise NotImplementedError("action space {}" " not supported".format(action_space)) + @staticmethod + @DeveloperAPI + def get_action_placeholder(action_space): + """Returns an action placeholder consistent with the action space + + Args: + action_space (Space): Action space of the target gym env. + Returns: + action_placeholder (Tensor): A placeholder for the actions + """ + + dtype, shape = ModelCatalog.get_action_shape(action_space) + + return tf.placeholder(dtype, shape=shape, name="action") + @staticmethod @DeveloperAPI def get_model_v2(obs_space, @@ -282,11 +293,29 @@ class ModelCatalog(object): instance = model_cls(obs_space, action_space, num_outputs, model_config, name, **model_kwargs) return instance + elif tf.executing_eagerly(): + raise ValueError( + "Eager execution requires a TFModelV2 model to be " + "used, however you specified a custom model {}".format( + model_cls)) if framework == "tf": - legacy_model_cls = default_model or ModelCatalog.get_model - wrapper = ModelCatalog._wrap_if_needed( - make_v1_wrapper(legacy_model_cls), model_interface) + v2_class = None + # try to get a default v2 model + if not model_config.get("custom_model"): + v2_class = default_model or ModelCatalog._get_v2_model( + obs_space, model_config) + # fallback to a default v1 model + if v2_class is None: + if tf.executing_eagerly(): + raise ValueError( + "Eager execution requires a TFModelV2 model to be " + "used, however there is no default V2 model for this " + "observation space: {}, use_lstm={}".format( + obs_space, model_config.get("use_lstm"))) + v2_class = make_v1_wrapper(ModelCatalog.get_model) + # wrap in the requested interface + wrapper = ModelCatalog._wrap_if_needed(v2_class, model_interface) return wrapper(obs_space, action_space, num_outputs, model_config, name, **model_kwargs) elif framework == "torch": @@ -388,7 +417,7 @@ class ModelCatalog(object): @staticmethod def _wrap_if_needed(model_cls, model_interface): - assert issubclass(model_cls, TFModelV2) + assert issubclass(model_cls, TFModelV2), model_cls if not model_interface or issubclass(model_cls, model_interface): return model_cls @@ -484,6 +513,19 @@ class ModelCatalog(object): return FullyConnectedNetwork(input_dict, obs_space, action_space, num_outputs, options) + @staticmethod + def _get_v2_model(obs_space, options): + options = options or MODEL_DEFAULTS + obs_rank = len(obs_space.shape) - 1 + + if options.get("use_lstm"): + return None # TODO: default LSTM v2 not implemented + + if obs_rank > 1: + return VisionNetV2 + + return FCNetV2 + @staticmethod def get_torch_model(obs_space, num_outputs, diff --git a/rllib/models/modelv2.py b/rllib/models/modelv2.py index f04bb4574..2c072a2f6 100644 --- a/rllib/models/modelv2.py +++ b/rllib/models/modelv2.py @@ -2,6 +2,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.models.model import restore_original_dimensions from ray.rllib.utils.annotations import PublicAPI @@ -41,6 +42,7 @@ class ModelV2(object): self.model_config = model_config self.name = name or "default_model" self.framework = framework + self._last_output = None def get_initial_state(self): """Get the initial recurrent state values for the model. @@ -165,4 +167,31 @@ class ModelV2(object): if not isinstance(state, list): raise ValueError("State output is not a list: {}".format(state)) + self._last_output = outputs return outputs, state + + def from_batch(self, train_batch, is_training=True): + """Convenience function that calls this model with a tensor batch. + + All this does is unpack the tensor batch to call this model with the + right input dict, state, and seq len arguments. + """ + + input_dict = { + "obs": train_batch[SampleBatch.CUR_OBS], + "is_training": is_training, + } + if SampleBatch.PREV_ACTIONS in train_batch: + input_dict["prev_actions"] = train_batch[SampleBatch.PREV_ACTIONS] + if SampleBatch.PREV_REWARDS in train_batch: + input_dict["prev_rewards"] = train_batch[SampleBatch.PREV_REWARDS] + states = [] + i = 0 + while "state_in_{}".format(i) in train_batch: + states.append(train_batch["state_in_{}".format(i)]) + i += 1 + return self.__call__(input_dict, states, train_batch.get("seq_lens")) + + def last_output(self): + """Returns the last output returned from calling the model.""" + return self._last_output diff --git a/rllib/models/tf/fcnet_v2.py b/rllib/models/tf/fcnet_v2.py index 2231b45a9..1201fa858 100644 --- a/rllib/models/tf/fcnet_v2.py +++ b/rllib/models/tf/fcnet_v2.py @@ -10,9 +10,7 @@ tf = try_import_tf() class FullyConnectedNetwork(TFModelV2): - """Generic fully connected network implemented in ModelV2 API. - - TODO(ekl): should make this the default fcnet in the future.""" + """Generic fully connected network implemented in ModelV2 API.""" def __init__(self, obs_space, action_space, num_outputs, model_config, name): @@ -65,7 +63,7 @@ class FullyConnectedNetwork(TFModelV2): for size in hiddens: last_layer = tf.keras.layers.Dense( size, - name="value_fc_{}".format(i), + name="fc_value_{}".format(i), activation=activation, kernel_initializer=normc_initializer(1.0))(last_layer) i += 1 diff --git a/rllib/models/tf/modelv1_compat.py b/rllib/models/tf/modelv1_compat.py index e5c70fc1f..a2b1b54ab 100644 --- a/rllib/models/tf/modelv1_compat.py +++ b/rllib/models/tf/modelv1_compat.py @@ -10,6 +10,7 @@ from ray.rllib.models.tf.tf_modelv2 import TFModelV2 from ray.rllib.models.tf.misc import linear, normc_initializer from ray.rllib.utils.annotations import override from ray.rllib.utils import try_import_tf +from ray.rllib.utils.debug import log_once from ray.rllib.utils.tf_ops import scope_vars tf = try_import_tf() @@ -47,9 +48,6 @@ def make_v1_wrapper(legacy_model_cls): else: self.initial_state = [] - # Tracks branches created so far - self.branches_created = set() - # Tracks update ops self._update_ops = None @@ -118,42 +116,40 @@ def make_v1_wrapper(legacy_model_cls): def value_function(self): assert self.cur_instance, "must call forward first" - with self._branch_variable_scope("value_function"): - # Simple case: sharing the feature layer - if self.model_config["vf_share_layers"]: - return tf.reshape( - linear(self.cur_instance.last_layer, 1, - "value_function", normc_initializer(1.0)), [-1]) - - # Create a new separate model with no RNN state, etc. - branch_model_config = self.model_config.copy() - branch_model_config["free_log_std"] = False - if branch_model_config["use_lstm"]: - branch_model_config["use_lstm"] = False - logger.warning( - "It is not recommended to use a LSTM model with " - "vf_share_layers=False (consider setting it to True). " - "If you want to not share layers, you can implement " - "a custom LSTM model that overrides the " - "value_function() method.") - branch_instance = self.legacy_model_cls( - self.cur_instance.input_dict, - self.obs_space, - self.action_space, - 1, - branch_model_config, - state_in=None, - seq_lens=None) - return tf.reshape(branch_instance.outputs, [-1]) - - def _branch_variable_scope(self, branch_type): - if branch_type in self.branches_created: - reuse = True - else: - self.branches_created.add(branch_type) - reuse = tf.AUTO_REUSE - with tf.variable_scope(self.variable_scope): - return tf.variable_scope(branch_type, reuse=reuse) + with tf.variable_scope("value_function", reuse=tf.AUTO_REUSE): + # Simple case: sharing the feature layer + if self.model_config["vf_share_layers"]: + return tf.reshape( + linear(self.cur_instance.last_layer, 1, + "value_function", normc_initializer(1.0)), + [-1]) + + # Create a new separate model with no RNN state, etc. + branch_model_config = self.model_config.copy() + branch_model_config["free_log_std"] = False + if branch_model_config["use_lstm"]: + branch_model_config["use_lstm"] = False + if log_once("vf_warn"): + logger.warning( + "It is not recommended to use a LSTM model " + "with vf_share_layers=False (consider setting " + "it to True). If you want to not share " + "layers, you can implement a custom LSTM " + "model that overrides the value_function() " + "method.") + branch_instance = self.legacy_model_cls( + self.cur_instance.input_dict, + self.obs_space, + self.action_space, + 1, + branch_model_config, + state_in=None, + seq_lens=None) + return tf.reshape(branch_instance.outputs, [-1]) + + @override(ModelV2) + def last_output(self): + return self.cur_instance.outputs return ModelV1Wrapper diff --git a/rllib/models/tf/visionnet_v1.py b/rllib/models/tf/visionnet_v1.py index 72a947028..8eca33950 100644 --- a/rllib/models/tf/visionnet_v1.py +++ b/rllib/models/tf/visionnet_v1.py @@ -10,7 +10,7 @@ from ray.rllib.utils import try_import_tf tf = try_import_tf() -# TODO(ekl) rewrite this using ModelV2 +# Deprecated: see as an alternative models/tf/visionnet_v2.py class VisionNetwork(Model): """Generic vision network.""" diff --git a/rllib/models/tf/visionnet_v2.py b/rllib/models/tf/visionnet_v2.py new file mode 100644 index 000000000..730cbab54 --- /dev/null +++ b/rllib/models/tf/visionnet_v2.py @@ -0,0 +1,110 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from ray.rllib.models.tf.tf_modelv2 import TFModelV2 +from ray.rllib.models.tf.visionnet_v1 import _get_filter_config +from ray.rllib.models.tf.misc import normc_initializer, get_activation_fn +from ray.rllib.utils import try_import_tf + +tf = try_import_tf() + + +class VisionNetwork(TFModelV2): + """Generic vision network implemented in ModelV2 API.""" + + def __init__(self, obs_space, action_space, num_outputs, model_config, + name): + super(VisionNetwork, self).__init__(obs_space, action_space, + num_outputs, model_config, name) + + activation = get_activation_fn(model_config.get("conv_activation")) + filters = model_config.get("conv_filters") + if not filters: + filters = _get_filter_config(obs_space.shape) + no_final_linear = model_config.get("no_final_linear") + vf_share_layers = model_config.get("vf_share_layers") + + inputs = tf.keras.layers.Input( + shape=obs_space.shape, name="observations") + last_layer = inputs + + # Build the action layers + for i, (out_size, kernel, stride) in enumerate(filters[:-1], 1): + last_layer = tf.keras.layers.Conv2D( + out_size, + kernel, + strides=(stride, stride), + activation=activation, + padding="same", + name="conv{}".format(i))(last_layer) + out_size, kernel, stride = filters[-1] + if no_final_linear: + # the last layer is adjusted to be of size num_outputs + last_layer = tf.keras.layers.Conv2D( + num_outputs, + kernel, + strides=(stride, stride), + activation=activation, + padding="valid", + name="conv_out")(last_layer) + conv_out = last_layer + else: + last_layer = tf.keras.layers.Conv2D( + out_size, + kernel, + strides=(stride, stride), + activation=activation, + padding="valid", + name="conv{}".format(i + 1))(last_layer) + conv_out = tf.keras.layers.Conv2D( + num_outputs, [1, 1], + activation=None, + padding="same", + name="conv_out")(last_layer) + + # Build the value layers + if vf_share_layers: + last_layer = tf.squeeze(last_layer, axis=[1, 2]) + value_out = tf.keras.layers.Dense( + 1, + name="value_out", + activation=None, + kernel_initializer=normc_initializer(0.01))(last_layer) + else: + # build a parallel set of hidden layers for the value net + last_layer = inputs + for i, (out_size, kernel, stride) in enumerate(filters[:-1], 1): + last_layer = tf.keras.layers.Conv2D( + out_size, + kernel, + strides=(stride, stride), + activation=activation, + padding="same", + name="conv_value_{}".format(i))(last_layer) + out_size, kernel, stride = filters[-1] + last_layer = tf.keras.layers.Conv2D( + out_size, + kernel, + strides=(stride, stride), + activation=activation, + padding="valid", + name="conv_value_{}".format(i + 1))(last_layer) + last_layer = tf.keras.layers.Conv2D( + 1, [1, 1], + activation=None, + padding="same", + name="conv_value_out")(last_layer) + value_out = tf.squeeze(last_layer, axis=[1, 2]) + + self.base_model = tf.keras.Model(inputs, [conv_out, value_out]) + self.register_variables(self.base_model.variables) + + def forward(self, input_dict, state, seq_lens): + # explicit cast to float32 needed in eager + model_out, self._value_out = self.base_model( + tf.cast(input_dict["obs"], tf.float32)) + return tf.squeeze(model_out, axis=[1, 2]), state + + def value_function(self): + return tf.reshape(self._value_out, [-1]) diff --git a/rllib/optimizers/multi_gpu_optimizer.py b/rllib/optimizers/multi_gpu_optimizer.py index 4d10b5027..3f6e19e5e 100644 --- a/rllib/optimizers/multi_gpu_optimizer.py +++ b/rllib/optimizers/multi_gpu_optimizer.py @@ -98,8 +98,8 @@ class LocalMultiGPUOptimizer(PolicyOptimizer): for policy_id, policy in self.policies.items(): if not isinstance(policy, TFPolicy): raise ValueError( - "Only TF policies are supported with multi-GPU. Try using " - "the simple optimizer instead.") + "Only TF graph policies are supported with multi-GPU. " + "Try setting `simple_optimizer=True` instead.") # per-GPU graph copies created below must share vars with the policy # reuse is set to AUTO_REUSE because Adam nodes are created after diff --git a/rllib/optimizers/sync_samples_optimizer.py b/rllib/optimizers/sync_samples_optimizer.py index 0f79062a3..1679e8c2c 100644 --- a/rllib/optimizers/sync_samples_optimizer.py +++ b/rllib/optimizers/sync_samples_optimizer.py @@ -2,11 +2,13 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import ray import logging +import random + +import ray from ray.rllib.evaluation.metrics import get_learner_stats from ray.rllib.optimizers.policy_optimizer import PolicyOptimizer -from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.policy.sample_batch import SampleBatch, MultiAgentBatch from ray.rllib.utils.annotations import override from ray.rllib.utils.filter import RunningStat from ray.rllib.utils.timer import TimerStat @@ -23,7 +25,11 @@ class SyncSamplesOptimizer(PolicyOptimizer): model weights are then broadcast to all remote workers. """ - def __init__(self, workers, num_sgd_iter=1, train_batch_size=1): + def __init__(self, + workers, + num_sgd_iter=1, + train_batch_size=1, + sgd_minibatch_size=0): PolicyOptimizer.__init__(self, workers) self.update_weights_timer = TimerStat() @@ -31,6 +37,7 @@ class SyncSamplesOptimizer(PolicyOptimizer): self.grad_timer = TimerStat() self.throughput = RunningStat() self.num_sgd_iter = num_sgd_iter + self.sgd_minibatch_size = sgd_minibatch_size self.train_batch_size = train_batch_size self.learner_stats = {} @@ -58,7 +65,9 @@ class SyncSamplesOptimizer(PolicyOptimizer): with self.grad_timer: for i in range(self.num_sgd_iter): - fetches = self.workers.local_worker().learn_on_batch(samples) + for minibatch in self._minibatches(samples): + fetches = self.workers.local_worker().learn_on_batch( + minibatch) self.learner_stats = get_learner_stats(fetches) if self.num_sgd_iter > 1: logger.debug("{} {}".format(i, fetches)) @@ -83,3 +92,27 @@ class SyncSamplesOptimizer(PolicyOptimizer): "opt_samples": round(self.grad_timer.mean_units_processed, 3), "learner": self.learner_stats, }) + + def _minibatches(self, samples): + if not self.sgd_minibatch_size: + yield samples + return + + if isinstance(samples, MultiAgentBatch): + raise NotImplementedError( + "Minibatching not implemented for multi-agent in simple mode") + + if "state_in_0" in samples.data: + logger.warn("Not shuffling RNN data for SGD in simple mode") + else: + samples.shuffle() + + i = 0 + slices = [] + while i < samples.count: + slices.append((i, i + self.sgd_minibatch_size)) + i += self.sgd_minibatch_size + random.shuffle(slices) + + for i, j in slices: + yield samples.slice(i, j) diff --git a/rllib/policy/dynamic_tf_policy.py b/rllib/policy/dynamic_tf_policy.py index f47cfc1d0..1e1e65399 100644 --- a/rllib/policy/dynamic_tf_policy.py +++ b/rllib/policy/dynamic_tf_policy.py @@ -82,11 +82,6 @@ class DynamicTFPolicy(TFPolicy): Attributes: config: config of the policy model: model instance, if any - model_out: output tensors of the model - action_dist: action distribution of the model, if any - state_in: state input tensors, if any - state_out: state output tensors, if any - seq_lens: tensor of sequence lengths """ self.config = config self._loss_fn = loss_fn @@ -113,13 +108,13 @@ class DynamicTFPolicy(TFPolicy): prev_rewards = tf.placeholder( tf.float32, [None], name="prev_reward") - self.input_dict = { + self._input_dict = { SampleBatch.CUR_OBS: obs, SampleBatch.PREV_ACTIONS: prev_actions, SampleBatch.PREV_REWARDS: prev_rewards, "is_training": self._get_is_training_placeholder(), } - self.seq_lens = tf.placeholder( + self._seq_lens = tf.placeholder( dtype=tf.int32, shape=[None], name="seq_lens") # Setup model @@ -127,11 +122,10 @@ class DynamicTFPolicy(TFPolicy): if not make_model: raise ValueError( "make_model is required if action_sampler_fn is given") - self.dist_class = None + self._dist_class = None else: - self.dist_class, logit_dim = ModelCatalog.get_action_dist( + self._dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) - self.logit_dim = logit_dim if existing_model: self.model = existing_model @@ -146,30 +140,30 @@ class DynamicTFPolicy(TFPolicy): framework="tf") if existing_inputs: - self.state_in = [ + self._state_in = [ v for k, v in existing_inputs.items() if k.startswith("state_in_") ] - if self.state_in: - self.seq_lens = existing_inputs["seq_lens"] + if self._state_in: + self._seq_lens = existing_inputs["seq_lens"] else: - self.state_in = [ + self._state_in = [ tf.placeholder(shape=(None, ) + s.shape, dtype=s.dtype) for s in self.model.get_initial_state() ] - self.model_out, self.state_out = self.model( - self.input_dict, self.state_in, self.seq_lens) + + model_out, self._state_out = self.model(self._input_dict, + self._state_in, self._seq_lens) # Setup action sampler if action_sampler_fn: - self.action_dist = None action_sampler, action_logp = action_sampler_fn( - self, self.model, self.input_dict, obs_space, action_space, + self, self.model, self._input_dict, obs_space, action_space, config) else: - self.action_dist = self.dist_class(self.model_out, self.model) - action_sampler = self.action_dist.sample() - action_logp = self.action_dist.sampled_action_logp() + action_dist = self._dist_class(model_out, self.model) + action_sampler = action_dist.sample() + action_logp = action_dist.sampled_action_logp() # Phase 1 init sess = tf.get_default_session() or tf.Session() @@ -188,11 +182,11 @@ class DynamicTFPolicy(TFPolicy): loss=None, # dynamically initialized on run loss_inputs=[], model=self.model, - state_inputs=self.state_in, - state_outputs=self.state_out, + state_inputs=self._state_in, + state_outputs=self._state_out, prev_action_input=prev_actions, prev_reward_input=prev_rewards, - seq_lens=self.seq_lens, + seq_lens=self._seq_lens, max_seq_len=config["model"]["max_seq_len"], batch_divisibility_req=batch_divisibility_req) @@ -201,13 +195,6 @@ class DynamicTFPolicy(TFPolicy): if not existing_inputs: self._initialize_loss() - def get_obs_input_dict(self): - """Returns the obs input dict used to build policy models. - - This dict includes the obs, prev actions, prev rewards, etc. tensors. - """ - return self.input_dict - @override(TFPolicy) def copy(self, existing_inputs): """Creates a copy of self using existing input placeholders.""" @@ -260,6 +247,12 @@ class DynamicTFPolicy(TFPolicy): else: return [] + def is_recurrent(self): + return len(self._state_in) > 0 + + def num_state_tensors(self): + return len(self._state_in) + def _initialize_loss(self): def fake_array(tensor): shape = tensor.shape.as_list() @@ -280,9 +273,11 @@ class DynamicTFPolicy(TFPolicy): SampleBatch.PREV_REWARDS: fake_array(self._prev_reward_input), }) state_init = self.get_initial_state() + state_batches = [] for i, h in enumerate(state_init): dummy_batch["state_in_{}".format(i)] = np.expand_dims(h, 0) dummy_batch["state_out_{}".format(i)] = np.expand_dims(h, 0) + state_batches.append(np.expand_dims(h, 0)) if state_init: dummy_batch["seq_lens"] = np.array([1], dtype=np.int32) for k, v in self.extra_compute_action_fetches().items(): @@ -290,11 +285,16 @@ class DynamicTFPolicy(TFPolicy): # postprocessing might depend on variable init, so run it first here self._sess.run(tf.global_variables_initializer()) + postprocessed_batch = self.postprocess_trajectory( SampleBatch(dummy_batch)) + # model forward pass for the loss (needed after postprocess to + # overwrite any tensor state from that call) + self.model(self._input_dict, self._state_in, self._seq_lens) + if self._obs_include_prev_action_reward: - batch_tensors = UsageTrackingDict({ + train_batch = UsageTrackingDict({ SampleBatch.PREV_ACTIONS: self._prev_action_input, SampleBatch.PREV_REWARDS: self._prev_reward_input, SampleBatch.CUR_OBS: self._obs_input, @@ -305,7 +305,7 @@ class DynamicTFPolicy(TFPolicy): (SampleBatch.CUR_OBS, self._obs_input), ] else: - batch_tensors = UsageTrackingDict({ + train_batch = UsageTrackingDict({ SampleBatch.CUR_OBS: self._obs_input, }) loss_inputs = [ @@ -313,35 +313,42 @@ class DynamicTFPolicy(TFPolicy): ] for k, v in postprocessed_batch.items(): - if k in batch_tensors: + if k in train_batch: continue elif v.dtype == np.object: continue # can't handle arbitrary objects in TF + elif k == "seq_lens" or k.startswith("state_in_"): + continue shape = (None, ) + v.shape[1:] dtype = np.float32 if v.dtype == np.float64 else v.dtype placeholder = tf.placeholder(dtype, shape=shape, name=k) - batch_tensors[k] = placeholder + train_batch[k] = placeholder + + for i, si in enumerate(self._state_in): + train_batch["state_in_{}".format(i)] = si + train_batch["seq_lens"] = self._seq_lens if log_once("loss_init"): logger.debug( "Initializing loss function with dummy input:\n\n{}\n".format( - summarize(batch_tensors))) + summarize(train_batch))) - self._loss_input_dict = batch_tensors - loss = self._do_loss_init(batch_tensors) - for k in sorted(batch_tensors.accessed_keys): - loss_inputs.append((k, batch_tensors[k])) + self._loss_input_dict = train_batch + loss = self._do_loss_init(train_batch) + for k in sorted(train_batch.accessed_keys): + if k != "seq_lens" and not k.startswith("state_in_"): + loss_inputs.append((k, train_batch[k])) TFPolicy._initialize_loss(self, loss, loss_inputs) if self._grad_stats_fn: self._stats_fetches.update( - self._grad_stats_fn(self, batch_tensors, self._grads)) + self._grad_stats_fn(self, train_batch, self._grads)) self._sess.run(tf.global_variables_initializer()) - def _do_loss_init(self, batch_tensors): - loss = self._loss_fn(self, batch_tensors) + def _do_loss_init(self, train_batch): + loss = self._loss_fn(self, self.model, self._dist_class, train_batch) if self._stats_fn: - self._stats_fetches.update(self._stats_fn(self, batch_tensors)) + self._stats_fetches.update(self._stats_fn(self, train_batch)) # override the update ops to be those of the model self._update_ops = self.model.update_ops() return loss diff --git a/rllib/policy/eager_tf_policy.py b/rllib/policy/eager_tf_policy.py new file mode 100644 index 000000000..b69e3f0a8 --- /dev/null +++ b/rllib/policy/eager_tf_policy.py @@ -0,0 +1,375 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import logging +import numpy as np + +from ray.rllib.evaluation.episode import _flatten_action +from ray.rllib.models.catalog import ModelCatalog +from ray.rllib.policy.policy import Policy, LEARNER_STATS_KEY +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.policy.tf_policy import ACTION_PROB, ACTION_LOGP +from ray.rllib.utils import add_mixins +from ray.rllib.utils.annotations import override +from ray.rllib.utils.debug import log_once +from ray.rllib.utils import try_import_tf + +tf = try_import_tf() +logger = logging.getLogger(__name__) + + +def _disallow_var_creation(next_creator, **kw): + v = next_creator(**kw) + raise ValueError("Detected a variable being created during an eager " + "forward pass. Variables should only be created during " + "model initialization: {}".format(v.name)) + + +def build_eager_tf_policy(name, + loss_fn, + get_default_config=None, + postprocess_fn=None, + stats_fn=None, + optimizer_fn=None, + gradients_fn=None, + apply_gradients_fn=None, + grad_stats_fn=None, + extra_learn_fetches_fn=None, + extra_action_fetches_fn=None, + before_init=None, + before_loss_init=None, + after_init=None, + make_model=None, + action_sampler_fn=None, + mixins=None, + obs_include_prev_action_reward=True, + get_batch_divisibility_req=None): + """Build an eager TF policy. + + An eager policy runs all operations in eager mode, which makes debugging + much simpler, but is lower performance. + + You shouldn't need to call this directly. Rather, prefer to build a TF + graph policy and use set {"eager": true} in the trainer config to have + it automatically be converted to an eager policy. + + This has the same signature as build_tf_policy().""" + + base = add_mixins(Policy, mixins) + + class eager_policy_cls(base): + def __init__(self, observation_space, action_space, config): + assert tf.executing_eagerly() + Policy.__init__(self, observation_space, action_space, config) + self._is_training = False + self._loss_initialized = False + self._sess = None + + if get_default_config: + config = dict(get_default_config(), **config) + + if before_init: + before_init(self, observation_space, action_space, config) + + self.config = config + + if action_sampler_fn: + if not make_model: + raise ValueError( + "make_model is required if action_sampler_fn is given") + self._dist_class = None + else: + self._dist_class, logit_dim = ModelCatalog.get_action_dist( + action_space, self.config["model"]) + + if make_model: + self.model = make_model(self, observation_space, action_space, + config) + else: + self.model = ModelCatalog.get_model_v2( + observation_space, + action_space, + logit_dim, + config["model"], + framework="tf", + ) + + self.model({ + SampleBatch.CUR_OBS: tf.convert_to_tensor( + np.array([observation_space.sample()])), + SampleBatch.PREV_ACTIONS: tf.convert_to_tensor( + [_flatten_action(action_space.sample())]), + SampleBatch.PREV_REWARDS: tf.convert_to_tensor([0.]), + }, [ + tf.convert_to_tensor([s]) + for s in self.model.get_initial_state() + ], tf.convert_to_tensor([1])) + + if before_loss_init: + before_loss_init(self, observation_space, action_space, config) + + self._initialize_loss_with_dummy_batch() + self._loss_initialized = True + + if optimizer_fn: + self._optimizer = optimizer_fn(self, config) + else: + self._optimizer = tf.train.AdamOptimizer(config["lr"]) + + if after_init: + after_init(self, observation_space, action_space, config) + + @override(Policy) + def postprocess_trajectory(self, + samples, + other_agent_batches=None, + episode=None): + assert tf.executing_eagerly() + if postprocess_fn: + return postprocess_fn(self, samples) + else: + return samples + + @override(Policy) + def learn_on_batch(self, samples): + with tf.variable_creator_scope(_disallow_var_creation): + grads_and_vars, stats = self._compute_gradients(samples) + self._apply_gradients(grads_and_vars) + return stats + + @override(Policy) + def compute_gradients(self, samples): + with tf.variable_creator_scope(_disallow_var_creation): + grads_and_vars, stats = self._compute_gradients(samples) + grads = [g for g, v in grads_and_vars] + grads = [(g.numpy() if g is not None else None) for g in grads] + return grads, stats + + @override(Policy) + def compute_actions(self, + obs_batch, + state_batches, + prev_action_batch=None, + prev_reward_batch=None, + info_batch=None, + episodes=None, + **kwargs): + + assert tf.executing_eagerly() + self._is_training = False + + self._seq_lens = tf.ones(len(obs_batch)) + self._input_dict = { + SampleBatch.CUR_OBS: tf.convert_to_tensor(obs_batch), + "is_training": tf.convert_to_tensor(False), + } + if obs_include_prev_action_reward: + self._input_dict.update({ + SampleBatch.PREV_ACTIONS: tf.convert_to_tensor( + prev_action_batch), + SampleBatch.PREV_REWARDS: tf.convert_to_tensor( + prev_reward_batch), + }) + self._state_in = state_batches + with tf.variable_creator_scope(_disallow_var_creation): + model_out, state_out = self.model( + self._input_dict, state_batches, self._seq_lens) + + if self._dist_class: + action_dist = self._dist_class(model_out, self.model) + action = action_dist.sample().numpy() + logp = action_dist.sampled_action_logp() + else: + action, logp = action_sampler_fn( + self, self.model, self._input_dict, self.observation_space, + self.action_space, self.config) + action = action.numpy() + + fetches = {} + if logp is not None: + fetches.update({ + ACTION_PROB: tf.exp(logp).numpy(), + ACTION_LOGP: logp.numpy(), + }) + if extra_action_fetches_fn: + fetches.update(extra_action_fetches_fn(self)) + return action, state_out, fetches + + @override(Policy) + def apply_gradients(self, gradients): + self._apply_gradients( + zip([(tf.convert_to_tensor(g) if g is not None else None) + for g in gradients], self.model.trainable_variables())) + + @override(Policy) + def get_weights(self): + return tf.nest.map_structure(lambda var: var.numpy(), + self.model.variables()) + + @override(Policy) + def set_weights(self, weights): + tf.nest.map_structure(lambda var, value: var.assign(value), + self.model.variables(), weights) + + def is_recurrent(self): + return len(self._state_in) > 0 + + def num_state_tensors(self): + return len(self._state_in) + + def get_session(self): + return None # None implies eager + + def loss_initialized(self): + return self._loss_initialized + + def _get_is_training_placeholder(self): + return tf.convert_to_tensor(self._is_training) + + def _apply_gradients(self, grads_and_vars): + if apply_gradients_fn: + apply_gradients_fn(self, self._optimizer, grads_and_vars) + else: + self._optimizer.apply_gradients(grads_and_vars) + + def _compute_gradients(self, samples): + """Computes and returns grads as eager tensors.""" + + self._is_training = True + + samples = { + k: tf.convert_to_tensor(v) + for k, v in samples.items() if v.dtype != np.object + } + + with tf.GradientTape(persistent=gradients_fn is not None) as tape: + # TODO: set seq len and state in properly + self._seq_lens = tf.ones(len(samples[SampleBatch.CUR_OBS])) + self._state_in = [] + model_out, _ = self.model(samples, self._state_in, + self._seq_lens) + loss = loss_fn(self, self.model, self._dist_class, samples) + + variables = self.model.trainable_variables() + + if gradients_fn: + + class OptimizerWrapper(object): + def __init__(self, tape): + self.tape = tape + + def compute_gradients(self, loss, var_list): + return list( + zip(self.tape.gradient(loss, var_list), var_list)) + + grads_and_vars = gradients_fn(self, OptimizerWrapper(tape), + loss) + else: + grads_and_vars = list( + zip(tape.gradient(loss, variables), variables)) + + if log_once("grad_vars"): + for _, v in grads_and_vars: + logger.info("Optimizing variable {}".format(v.name)) + + grads = [g for g, v in grads_and_vars] + stats = self._stats(self, samples, grads) + return grads_and_vars, stats + + def _stats(self, outputs, samples, grads): + assert tf.executing_eagerly() + fetches = {} + if stats_fn: + fetches[LEARNER_STATS_KEY] = { + k: v.numpy() + for k, v in stats_fn(outputs, samples).items() + } + else: + fetches[LEARNER_STATS_KEY] = {} + if extra_learn_fetches_fn: + fetches.update({ + k: v.numpy() + for k, v in extra_learn_fetches_fn(self).items() + }) + if grad_stats_fn: + fetches.update({ + k: v.numpy() + for k, v in grad_stats_fn(self, samples, grads).items() + }) + return fetches + + def _initialize_loss_with_dummy_batch(self): + # Dummy forward pass to initialize any policy attributes, etc. + action_dtype, action_shape = ModelCatalog.get_action_shape( + self.action_space) + dummy_batch = { + SampleBatch.CUR_OBS: tf.convert_to_tensor( + np.array([self.observation_space.sample()])), + SampleBatch.NEXT_OBS: tf.convert_to_tensor( + np.array([self.observation_space.sample()])), + SampleBatch.DONES: tf.convert_to_tensor( + np.array([False], dtype=np.bool)), + SampleBatch.ACTIONS: tf.convert_to_tensor( + np.zeros( + (1, ) + action_shape[1:], + dtype=action_dtype.as_numpy_dtype())), + SampleBatch.REWARDS: tf.convert_to_tensor( + np.array([0], dtype=np.float32)), + } + if obs_include_prev_action_reward: + dummy_batch.update({ + SampleBatch.PREV_ACTIONS: dummy_batch[SampleBatch.ACTIONS], + SampleBatch.PREV_REWARDS: dummy_batch[SampleBatch.REWARDS], + }) + state_init = self.get_initial_state() + state_batches = [] + for i, h in enumerate(state_init): + dummy_batch["state_in_{}".format(i)] = tf.convert_to_tensor( + np.expand_dims(h, 0)) + dummy_batch["state_out_{}".format(i)] = tf.convert_to_tensor( + np.expand_dims(h, 0)) + state_batches.append( + tf.convert_to_tensor(np.expand_dims(h, 0))) + if state_init: + dummy_batch["seq_lens"] = tf.convert_to_tensor( + np.array([1], dtype=np.int32)) + + # for IMPALA which expects a certain sample batch size + def tile_to(tensor, n): + return tf.tile(tensor, + [n] + [1 for _ in tensor.shape.as_list()[1:]]) + + if get_batch_divisibility_req: + dummy_batch = { + k: tile_to(v, get_batch_divisibility_req(self)) + for k, v in dummy_batch.items() + } + + # Execute a forward pass to get self.action_dist etc initialized, + # and also obtain the extra action fetches + _, _, fetches = self.compute_actions( + dummy_batch[SampleBatch.CUR_OBS], state_batches, + dummy_batch.get(SampleBatch.PREV_ACTIONS), + dummy_batch.get(SampleBatch.PREV_REWARDS)) + dummy_batch.update(fetches) + + postprocessed_batch = self.postprocess_trajectory( + SampleBatch(dummy_batch)) + + # model forward pass for the loss (needed after postprocess to + # overwrite any tensor state from that call) + self.model.from_batch(dummy_batch) + + postprocessed_batch = { + k: tf.convert_to_tensor(v) + for k, v in postprocessed_batch.items() + } + + loss_fn(self, self.model, self._dist_class, postprocessed_batch) + if stats_fn: + stats_fn(self, postprocessed_batch) + + eager_policy_cls.__name__ = name + "_eager" + eager_policy_cls.__qualname__ = name + "_eager" + return eager_policy_cls diff --git a/rllib/policy/policy.py b/rllib/policy/policy.py index dcf19db14..2458ce5b0 100644 --- a/rllib/policy/policy.py +++ b/rllib/policy/policy.py @@ -12,8 +12,15 @@ from ray.rllib.utils.annotations import DeveloperAPI # `grad_info` dict returned by learn_on_batch() / compute_grads() via this key. LEARNER_STATS_KEY = "learner_stats" -# Used to return tuple actions as a list of batches per tuple element -TupleActions = namedtuple("TupleActions", ["batches"]) + +class TupleActions(namedtuple("TupleActions", ["batches"])): + """Used to return tuple actions as a list of batches per tuple element.""" + + def __new__(cls, batches): + return super(TupleActions, cls).__new__(cls, batches) + + def numpy(self): + return TupleActions([b.numpy() for b in self.batches]) @DeveloperAPI diff --git a/rllib/policy/sample_batch.py b/rllib/policy/sample_batch.py index 01c1f9b8e..1f36ed6a2 100644 --- a/rllib/policy/sample_batch.py +++ b/rllib/policy/sample_batch.py @@ -58,7 +58,8 @@ class SampleBatch(object): self.data[k] = np.array(v, copy=False) if not lengths: raise ValueError("Empty sample batch") - assert len(set(lengths)) == 1, "data columns must be same length" + assert len(set(lengths)) == 1, ("data columns must be same length", + self.data, lengths) self.count = lengths[0] @staticmethod diff --git a/rllib/policy/tf_policy_template.py b/rllib/policy/tf_policy_template.py index 94db613fb..980f5e749 100644 --- a/rllib/policy/tf_policy_template.py +++ b/rllib/policy/tf_policy_template.py @@ -3,10 +3,14 @@ from __future__ import division from __future__ import print_function from ray.rllib.policy.dynamic_tf_policy import DynamicTFPolicy +from ray.rllib.policy import eager_tf_policy from ray.rllib.policy.policy import Policy, LEARNER_STATS_KEY from ray.rllib.policy.tf_policy import TFPolicy from ray.rllib.utils import add_mixins from ray.rllib.utils.annotations import override, DeveloperAPI +from ray.rllib.utils import try_import_tf + +tf = try_import_tf() @DeveloperAPI @@ -20,9 +24,7 @@ def build_tf_policy(name, apply_gradients_fn=None, grad_stats_fn=None, extra_action_fetches_fn=None, - extra_action_feed_fn=None, extra_learn_fetches_fn=None, - extra_learn_feed_fn=None, before_init=None, before_loss_init=None, after_init=None, @@ -42,8 +44,9 @@ def build_tf_policy(name, This means that you can e.g., depend on any policy attributes created in the running of `loss_fn` in later functions such as `stats_fn`. - In eager mode (to be implemented), the following functions will be run - repeatedly on each eager execution: loss_fn, stats_fn + In eager mode, the following functions will be run repeatedly on each + eager execution: loss_fn, stats_fn, gradients_fn, apply_gradients_fn, + and grad_stats_fn. This means that these functions should not define any variables internally, otherwise they will fail in eager mode execution. Variable should only @@ -51,8 +54,8 @@ def build_tf_policy(name, Arguments: name (str): name of the policy (e.g., "PPOTFPolicy") - loss_fn (func): function that returns a loss tensor the policy, - and dict of experience tensor placeholdes + loss_fn (func): function that returns a loss tensor as arguments + (policy, model, dist_class, train_batch) get_default_config (func): optional function that returns the default config to merge with any overrides postprocess_fn (func): optional experience postprocessing function @@ -70,12 +73,8 @@ def build_tf_policy(name, TF fetches given the policy, batch input, and gradient tensors extra_action_fetches_fn (func): optional function that returns a dict of TF fetches given the policy object - extra_action_feed_fn (func): optional function that returns a feed dict - to also feed to TF when computing actions extra_learn_fetches_fn (func): optional function that returns a dict of extra values to fetch and return when learning on a batch - extra_learn_feed_fn (func): optional function that returns a feed dict - to also feed to TF when learning on a batch before_init (func): optional function to run at the beginning of policy init that takes the same arguments as the policy constructor before_loss_init (func): optional function to run prior to loss @@ -183,13 +182,6 @@ def build_tf_policy(name, TFPolicy.extra_compute_action_fetches(self), **self._extra_action_fetches) - @override(TFPolicy) - def extra_compute_action_feed_dict(self): - if extra_action_feed_fn: - return extra_action_feed_fn(self) - else: - return TFPolicy.extra_compute_action_feed_dict(self) - @override(TFPolicy) def extra_compute_grad_fetches(self): if extra_learn_fetches_fn: @@ -200,18 +192,16 @@ def build_tf_policy(name, else: return TFPolicy.extra_compute_grad_fetches(self) - @override(TFPolicy) - def extra_compute_grad_feed_dict(self): - if extra_learn_feed_fn: - return extra_learn_feed_fn(self) - else: - return TFPolicy.extra_compute_grad_feed_dict(self) - @staticmethod def with_updates(**overrides): return build_tf_policy(**dict(original_kwargs, **overrides)) + @staticmethod + def as_eager(): + return eager_tf_policy.build_eager_tf_policy(**original_kwargs) + policy_cls.with_updates = with_updates + policy_cls.as_eager = as_eager policy_cls.__name__ = name policy_cls.__qualname__ = name return policy_cls diff --git a/rllib/policy/torch_policy.py b/rllib/policy/torch_policy.py index aff405e33..4fe7991ed 100644 --- a/rllib/policy/torch_policy.py +++ b/rllib/policy/torch_policy.py @@ -42,8 +42,8 @@ class TorchPolicy(Policy): model (nn.Module): PyTorch policy module. Given observations as input, this module must return a list of outputs where the first item is action logits, and the rest can be any value. - loss (func): Function that takes (policy, batch_tensors) - and returns a single scalar loss. + loss (func): Function that takes (policy, model, dist_class, + train_batch) and returns a single scalar loss. action_distribution_class (ActionDistribution): Class for action distribution. """ @@ -87,26 +87,28 @@ class TorchPolicy(Policy): @override(Policy) def learn_on_batch(self, postprocessed_batch): - batch_tensors = self._lazy_tensor_dict(postprocessed_batch) + train_batch = self._lazy_tensor_dict(postprocessed_batch) with self.lock: - loss_out = self._loss(self, batch_tensors) + loss_out = self._loss(self, self.model, self.dist_class, + train_batch) self._optimizer.zero_grad() loss_out.backward() grad_process_info = self.extra_grad_process() self._optimizer.step() - grad_info = self.extra_grad_info(batch_tensors) + grad_info = self.extra_grad_info(train_batch) grad_info.update(grad_process_info) return {LEARNER_STATS_KEY: grad_info} @override(Policy) def compute_gradients(self, postprocessed_batch): - batch_tensors = self._lazy_tensor_dict(postprocessed_batch) + train_batch = self._lazy_tensor_dict(postprocessed_batch) with self.lock: - loss_out = self._loss(self, batch_tensors) + loss_out = self._loss(self, self.model, self.dist_class, + train_batch) self._optimizer.zero_grad() loss_out.backward() @@ -121,7 +123,7 @@ class TorchPolicy(Policy): else: grads.append(None) - grad_info = self.extra_grad_info(batch_tensors) + grad_info = self.extra_grad_info(train_batch) grad_info.update(grad_process_info) return grads, {LEARNER_STATS_KEY: grad_info} @@ -161,7 +163,7 @@ class TorchPolicy(Policy): model (TorchModelV2): Reference to the model.""" return {} - def extra_grad_info(self, batch_tensors): + def extra_grad_info(self, train_batch): """Return dict of extra grad info.""" return {} @@ -175,7 +177,7 @@ class TorchPolicy(Policy): return torch.optim.Adam(self._model.parameters()) def _lazy_tensor_dict(self, postprocessed_batch): - batch_tensors = UsageTrackingDict(postprocessed_batch) + train_batch = UsageTrackingDict(postprocessed_batch) def convert(arr): tensor = torch.from_numpy(np.asarray(arr)) @@ -183,5 +185,5 @@ class TorchPolicy(Policy): tensor = tensor.float() return tensor.to(self.device) - batch_tensors.set_get_interceptor(convert) - return batch_tensors + train_batch.set_get_interceptor(convert) + return train_batch diff --git a/rllib/policy/torch_policy_template.py b/rllib/policy/torch_policy_template.py index d3d7e6987..b4e978e0b 100644 --- a/rllib/policy/torch_policy_template.py +++ b/rllib/policy/torch_policy_template.py @@ -26,8 +26,8 @@ def build_torch_policy(name, Arguments: name (str): name of the policy (e.g., "PPOTorchPolicy") - loss_fn (func): function that returns a loss tensor the policy, - and dict of experience tensor placeholders + loss_fn (func): function that returns a loss tensor as arguments + (policy, model, dist_class, train_batch) get_default_config (func): optional function that returns the default config to merge with any overrides stats_fn (func): optional function that returns a dict of @@ -121,11 +121,11 @@ def build_torch_policy(name, return TorchPolicy.optimizer(self) @override(TorchPolicy) - def extra_grad_info(self, batch_tensors): + def extra_grad_info(self, train_batch): if stats_fn: - return stats_fn(self, batch_tensors) + return stats_fn(self, train_batch) else: - return TorchPolicy.extra_grad_info(self, batch_tensors) + return TorchPolicy.extra_grad_info(self, train_batch) @staticmethod def with_updates(**overrides): diff --git a/rllib/tests/test_eager_support.py b/rllib/tests/test_eager_support.py new file mode 100644 index 000000000..9bcb7212b --- /dev/null +++ b/rllib/tests/test_eager_support.py @@ -0,0 +1,92 @@ +import unittest + +import ray +from ray import tune +from ray.rllib.agents.registry import get_agent_class + + +def check_support(alg, config): + config["eager"] = True + if alg in ["APEX_DDPG", "TD3", "DDPG", "SAC"]: + config["env"] = "Pendulum-v0" + else: + config["env"] = "CartPole-v0" + a = get_agent_class(alg) + tune.run(a, config=config, stop={"training_iteration": 0}) + + +class TestEagerSupport(unittest.TestCase): + def setUp(self): + ray.init(num_cpus=4) + + def tearDown(self): + ray.shutdown() + + def testSimpleQ(self): + check_support("SimpleQ", {"num_workers": 0, "learning_starts": 0}) + + def testDQN(self): + check_support("DQN", {"num_workers": 0, "learning_starts": 0}) + + def testA2C(self): + check_support("A2C", {"num_workers": 0}) + + def testA3C(self): + check_support("A3C", {"num_workers": 1}) + + def testPG(self): + check_support("PG", {"num_workers": 0}) + + def testPPO(self): + check_support("PPO", {"num_workers": 0}) + + def testAPPO(self): + check_support("APPO", {"num_workers": 1, "num_gpus": 0}) + + def testIMPALA(self): + check_support("IMPALA", {"num_workers": 1, "num_gpus": 0}) + + def testAPEX_DQN(self): + check_support( + "APEX", { + "num_workers": 2, + "learning_starts": 0, + "num_gpus": 0, + "min_iter_time_s": 1, + "timesteps_per_iteration": 100 + }) + + def testDDPG(self): + check_support("DDPG", { + "num_workers": 0, + "learning_starts": 0, + "timesteps_per_iteration": 10 + }) + + def testTD3(self): + check_support("TD3", { + "num_workers": 0, + "learning_starts": 0, + "timesteps_per_iteration": 10 + }) + + def testAPEX_DDPG(self): + check_support( + "APEX_DDPG", { + "num_workers": 2, + "learning_starts": 0, + "num_gpus": 0, + "min_iter_time_s": 1, + "timesteps_per_iteration": 100 + }) + + def testSAC(self): + check_support("SAC", { + "num_workers": 0, + "learning_starts": 0, + "timesteps_per_iteration": 100 + }) + + +if __name__ == "__main__": + unittest.main(verbosity=2) diff --git a/rllib/tests/test_supported_spaces.py b/rllib/tests/test_supported_spaces.py index b091776fa..b007679fb 100644 --- a/rllib/tests/test_supported_spaces.py +++ b/rllib/tests/test_supported_spaces.py @@ -69,6 +69,7 @@ def make_stub_env(action_space, obs_space, check_action_bounds): def check_support(alg, config, stats, check_bounds=False, name=None): covered_a = set() covered_o = set() + config["log_level"] = "ERROR" for a_name, action_space in ACTION_SPACES_TO_TEST.items(): for o_name, obs_space in OBSERVATION_SPACES_TO_TEST.items(): print("=== Testing", alg, action_space, obs_space, "===") diff --git a/rllib/train.py b/rllib/train.py index 8f6a0c2b3..c726e7dc0 100755 --- a/rllib/train.py +++ b/rllib/train.py @@ -92,6 +92,10 @@ def create_parser(parser_creator=None): "--resume", action="store_true", help="Whether to attempt to resume previous Tune experiments.") + parser.add_argument( + "--eager", + action="store_true", + help="Whether to attempt to enable TF eager execution.") parser.add_argument( "--env", default=None, type=str, help="The gym environment to use.") parser.add_argument( @@ -140,6 +144,8 @@ def run(args, parser): parser.error("the following arguments are required: --run") if not exp.get("env") and not exp.get("config", {}).get("env"): parser.error("the following arguments are required: --env") + if args.eager: + exp["config"]["eager"] = True if args.ray_num_nodes: cluster = Cluster() diff --git a/rllib/utils/tf_ops.py b/rllib/utils/tf_ops.py index 62c73e782..b15f1cf58 100644 --- a/rllib/utils/tf_ops.py +++ b/rllib/utils/tf_ops.py @@ -34,6 +34,68 @@ def minimize_and_clip(optimizer, objective, var_list, clip_val=10): return gradients +def make_tf_callable(session_or_none, dynamic_shape=False): + """Returns a function that can be executed in either graph or eager mode. + + The function must take only positional args. + + If eager is enabled, this will act as just a function. Otherwise, it + will build a function that executes a session run with placeholders + internally. + + Arguments: + session_or_none (tf.Session): tf.Session if in graph mode, else None. + dynamic_shape (bool): True if the placeholders should have a dynamic + batch dimension. Otherwise they will be fixed shape. + + Returns: + a Python function that can be called in either mode. + """ + + if tf.executing_eagerly(): + assert session_or_none is None + else: + assert session_or_none is not None + + def make_wrapper(fn): + if session_or_none: + placeholders = [] + symbolic_out = [None] + + def call(*args): + args_flat = [] + for a in args: + if type(a) is list: + args_flat.extend(a) + else: + args_flat.append(a) + args = args_flat + if not placeholders: + with session_or_none.graph.as_default(): + for i, v in enumerate(args): + if dynamic_shape: + if len(v.shape) > 0: + shape = (None, ) + v.shape[1:] + else: + shape = () + else: + shape = v.shape + placeholders.append( + tf.placeholder( + dtype=v.dtype, + shape=shape, + name="arg_{}".format(i))) + symbolic_out[0] = fn(*placeholders) + feed_dict = dict(zip(placeholders, args)) + return session_or_none.run(symbolic_out[0], feed_dict) + + return call + else: + return fn + + return make_wrapper + + def scope_vars(scope, trainable_only=False): """ Get variables inside a scope