From 9e328fbe6f94a069b31cd98511244694efc96f92 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Fri, 7 Jun 2019 16:42:37 -0700 Subject: [PATCH] [rllib] Add docs on how to use TF eager execution (#4927) --- ci/jenkins_tests/run_rllib_tests.sh | 10 ++ doc/source/rllib-concepts.rst | 31 ++++++ doc/source/rllib-examples.rst | 2 + doc/source/rllib-training.rst | 7 ++ doc/source/rllib.rst | 2 + python/ray/rllib/agents/a3c/a3c_tf_policy.py | 5 +- python/ray/rllib/agents/ppo/ppo_policy.py | 10 +- python/ray/rllib/agents/trainer.py | 3 + python/ray/rllib/examples/eager_execution.py | 101 +++++++++++++++++++ python/ray/rllib/policy/dynamic_tf_policy.py | 50 +++++++++ 10 files changed, 215 insertions(+), 6 deletions(-) create mode 100644 python/ray/rllib/examples/eager_execution.py diff --git a/ci/jenkins_tests/run_rllib_tests.sh b/ci/jenkins_tests/run_rllib_tests.sh index a97bf5517..13036ae7d 100644 --- a/ci/jenkins_tests/run_rllib_tests.sh +++ b/ci/jenkins_tests/run_rllib_tests.sh @@ -392,6 +392,16 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/ci/suppress_output python /ray/python/ray/rllib/examples/rollout_worker_custom_workflow.py +docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ + /ray/ci/suppress_output python /ray/python/ray/rllib/examples/eager_execution.py --iters=2 + +docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ + /ray/ci/suppress_output /ray/python/ray/rllib/train.py \ + --env CartPole-v0 \ + --run PPO \ + --stop '{"training_iteration": 1}' \ + --config '{"use_eager": true, "simple_optimizer": true}' + docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/ci/suppress_output python /ray/python/ray/rllib/examples/custom_tf_policy.py --iters=2 diff --git a/doc/source/rllib-concepts.rst b/doc/source/rllib-concepts.rst index b7b3ff823..4b00f5636 100644 --- 
a/doc/source/rllib-concepts.rst +++ b/doc/source/rllib-concepts.rst @@ -346,6 +346,37 @@ In PPO we run ``setup_mixins`` before the loss function is called (i.e., ``befor Finally, note that you do not have to use ``build_tf_policy`` to define a TensorFlow policy. You can alternatively subclass ``Policy``, ``TFPolicy``, or ``DynamicTFPolicy`` as convenient. +Building Policies in TensorFlow Eager +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +While RLlib runs all TF operations in graph mode, you can still leverage TensorFlow eager using `tf.py_function <https://www.tensorflow.org/api_docs/python/tf/py_function>`__. However, note that eager and non-eager tensors cannot be mixed within the ``py_function``. Here's an example of embedding eager execution within a policy loss function: + +.. code-block:: python + + def eager_loss(policy, batch_tensors): + """Example of using embedded eager execution in a custom loss. + + Here `compute_penalty` prints the actions and rewards for debugging, and + also computes a (dummy) penalty term to add to the loss. + """ + + def compute_penalty(actions, rewards): + penalty = tf.reduce_mean(tf.cast(actions, tf.float32)) + if random.random() > 0.9: + print("The eagerly computed penalty is", penalty, actions, rewards) + return penalty + + actions = batch_tensors[SampleBatch.ACTIONS] + rewards = batch_tensors[SampleBatch.REWARDS] + penalty = tf.py_function( + compute_penalty, [actions, rewards], Tout=tf.float32) + + return penalty - tf.reduce_mean(policy.action_dist.logp(actions) * rewards) + +You can find a runnable file for the above eager execution example `here <https://github.com/ray-project/ray/blob/master/python/ray/rllib/examples/eager_execution.py>`__. + +There is also experimental support for running the entire loss function in eager mode. This can be enabled with ``use_eager: True``, e.g., ``rllib train --env=CartPole-v0 --run=PPO --config='{"use_eager": true, "simple_optimizer": true}'``. However, this currently only works for a couple of algorithms.
+ Building Policies in PyTorch ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/rllib-examples.rst b/doc/source/rllib-examples.rst index 13bfdc68b..604abf394 100644 --- a/doc/source/rllib-examples.rst +++ b/doc/source/rllib-examples.rst @@ -38,6 +38,8 @@ Custom Envs and Models Example of adding batch norm layers to a custom model. - `Parametric actions `__: Example of how to handle variable-length or parametric action spaces. +- `Eager execution <https://github.com/ray-project/ray/blob/master/python/ray/rllib/examples/eager_execution.py>`__: + Example of how to leverage TensorFlow eager to simplify debugging and design of custom models and policies. Serving and Offline ------------------- diff --git a/doc/source/rllib-training.rst b/doc/source/rllib-training.rst index 824ef4c3d..9c365f8fb 100644 --- a/doc/source/rllib-training.rst +++ b/doc/source/rllib-training.rst @@ -367,6 +367,13 @@ The ``"monitor": true`` config can be used to save Gym episode videos to the res openaigym.video.0.31403.video000000.meta.json openaigym.video.0.31403.video000000.mp4 +TensorFlow Eager +~~~~~~~~~~~~~~~~ + +While RLlib uses TF graph mode for all computations, you can still leverage TF eager to inspect the intermediate state of computations using `tf.py_function <https://www.tensorflow.org/api_docs/python/tf/py_function>`__. Here's an example of using eager mode in `a custom RLlib model and loss <https://github.com/ray-project/ray/blob/master/python/ray/rllib/examples/eager_execution.py>`__. + +There is also experimental support for running the entire loss function in eager mode. This can be enabled with ``use_eager: True``, e.g., ``rllib train --env=CartPole-v0 --run=PPO --config='{"use_eager": true, "simple_optimizer": true}'``. However, this currently only works for a couple of algorithms.
+ Episode Traces ~~~~~~~~~~~~~~ diff --git a/doc/source/rllib.rst b/doc/source/rllib.rst index f0571b23c..d0d9d715a 100644 --- a/doc/source/rllib.rst +++ b/doc/source/rllib.rst @@ -101,6 +101,8 @@ Concepts and Building Custom Algorithms - `Building Policies in TensorFlow `__ + - `Building Policies in TensorFlow Eager `__ + - `Building Policies in PyTorch `__ - `Extending Existing Policies `__ diff --git a/python/ray/rllib/agents/a3c/a3c_tf_policy.py b/python/ray/rllib/agents/a3c/a3c_tf_policy.py index ed3676472..d05f496a7 100644 --- a/python/ray/rllib/agents/a3c/a3c_tf_policy.py +++ b/python/ray/rllib/agents/a3c/a3c_tf_policy.py @@ -41,8 +41,9 @@ def actor_critic_loss(policy, batch_tensors): policy.loss = A3CLoss( policy.action_dist, batch_tensors[SampleBatch.ACTIONS], batch_tensors[Postprocessing.ADVANTAGES], - batch_tensors[Postprocessing.VALUE_TARGETS], policy.vf, - policy.config["vf_loss_coeff"], policy.config["entropy_coeff"]) + batch_tensors[Postprocessing.VALUE_TARGETS], + policy.convert_to_eager(policy.vf), policy.config["vf_loss_coeff"], + policy.config["entropy_coeff"]) return policy.loss.total_loss diff --git a/python/ray/rllib/agents/ppo/ppo_policy.py b/python/ray/rllib/agents/ppo/ppo_policy.py index 4b391cab2..ad79d90fa 100644 --- a/python/ray/rllib/agents/ppo/ppo_policy.py +++ b/python/ray/rllib/agents/ppo/ppo_policy.py @@ -106,8 +106,10 @@ class PPOLoss(object): def ppo_surrogate_loss(policy, batch_tensors): if policy.model.state_in: - max_seq_len = tf.reduce_max(policy.model.seq_lens) - mask = tf.sequence_mask(policy.model.seq_lens, max_seq_len) + max_seq_len = tf.reduce_max( + policy.convert_to_eager(policy.model.seq_lens)) + mask = tf.sequence_mask( + policy.convert_to_eager(policy.model.seq_lens), max_seq_len) mask = tf.reshape(mask, [-1]) else: mask = tf.ones_like( @@ -121,8 +123,8 @@ def ppo_surrogate_loss(policy, batch_tensors): batch_tensors[BEHAVIOUR_LOGITS], batch_tensors[SampleBatch.VF_PREDS], policy.action_dist, - policy.value_function, - 
policy.kl_coeff, + policy.convert_to_eager(policy.value_function), + policy.convert_to_eager(policy.kl_coeff), mask, entropy_coeff=policy.config["entropy_coeff"], clip_param=policy.config["clip_param"], diff --git a/python/ray/rllib/agents/trainer.py b/python/ray/rllib/agents/trainer.py index f08b23e93..a0d48d2ef 100644 --- a/python/ray/rllib/agents/trainer.py +++ b/python/ray/rllib/agents/trainer.py @@ -67,6 +67,9 @@ COMMON_CONFIG = { }, # Whether to attempt to continue training if a worker crashes. "ignore_worker_failures": False, + # Execute TF loss functions in eager mode. This is currently experimental + # and only really works with the basic PG algorithm. + "use_eager": False, # === Policy === # Arguments to pass to model. See models/catalog.py for a full list of the diff --git a/python/ray/rllib/examples/eager_execution.py b/python/ray/rllib/examples/eager_execution.py new file mode 100644 index 000000000..a3c418a33 --- /dev/null +++ b/python/ray/rllib/examples/eager_execution.py @@ -0,0 +1,101 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import random + +import ray +from ray import tune +from ray.rllib.agents.trainer_template import build_trainer +from ray.rllib.models import FullyConnectedNetwork, Model, ModelCatalog +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.policy.tf_policy_template import build_tf_policy +from ray.rllib.utils import try_import_tf + +tf = try_import_tf() + +parser = argparse.ArgumentParser() +parser.add_argument("--iters", type=int, default=200) + + +class EagerModel(Model): + """Example of using embedded eager execution in a custom model. + + This shows how to use tf.py_function() to execute a snippet of TF code + in eager mode. Here the `self.forward_eager` method just prints out + the intermediate tensor for debug purposes, but you can in general + perform any TF eager operation in tf.py_function(). 
+ """ + + def _build_layers_v2(self, input_dict, num_outputs, options): + self.fcnet = FullyConnectedNetwork(input_dict, self.obs_space, + self.action_space, num_outputs, + options) + feature_out = tf.py_function(self.forward_eager, + [self.fcnet.last_layer], tf.float32) + + with tf.control_dependencies([feature_out]): + return tf.identity(self.fcnet.outputs), feature_out + + def forward_eager(self, feature_layer): + assert tf.executing_eagerly() + if random.random() > 0.99: + print("Eagerly printing the feature layer mean value", + tf.reduce_mean(feature_layer)) + return feature_layer + + +def policy_gradient_loss(policy, batch_tensors): + """Example of using embedded eager execution in a custom loss. + + Here `compute_penalty` prints the actions and rewards for debugging, and + also computes a (dummy) penalty term to add to the loss. + + Alternatively, you can set config["use_eager"] = True, which will try to + automatically eagerify the entire loss function. However, this only works + if your loss doesn't reference any non-eager tensors. It also won't work + with the multi-GPU optimizer used by PPO. 
+ """ + + def compute_penalty(actions, rewards): + assert tf.executing_eagerly() + penalty = tf.reduce_mean(tf.cast(actions, tf.float32)) + if random.random() > 0.9: + print("The eagerly computed penalty is", penalty, actions, rewards) + return penalty + + actions = batch_tensors[SampleBatch.ACTIONS] + rewards = batch_tensors[SampleBatch.REWARDS] + penalty = tf.py_function( + compute_penalty, [actions, rewards], Tout=tf.float32) + + return penalty - tf.reduce_mean(policy.action_dist.logp(actions) * rewards) + + +# +MyTFPolicy = build_tf_policy( + name="MyTFPolicy", + loss_fn=policy_gradient_loss, +) + +# +MyTrainer = build_trainer( + name="MyCustomTrainer", + default_policy=MyTFPolicy, +) + +if __name__ == "__main__": + ray.init() + args = parser.parse_args() + ModelCatalog.register_custom_model("eager_model", EagerModel) + tune.run( + MyTrainer, + stop={"training_iteration": args.iters}, + config={ + "env": "CartPole-v0", + "num_workers": 0, + "model": { + "custom_model": "eager_model" + }, + }) diff --git a/python/ray/rllib/policy/dynamic_tf_policy.py b/python/ray/rllib/policy/dynamic_tf_policy.py index 0240f275d..23014553b 100644 --- a/python/ray/rllib/policy/dynamic_tf_policy.py +++ b/python/ray/rllib/policy/dynamic_tf_policy.py @@ -167,6 +167,8 @@ class DynamicTFPolicy(TFPolicy): batch_divisibility_req=batch_divisibility_req) # Phase 2 init + self._needs_eager_conversion = set() + self._eager_tensors = {} before_loss_init(self, obs_space, action_space, config) if not existing_inputs: self._initialize_loss() @@ -178,10 +180,26 @@ class DynamicTFPolicy(TFPolicy): """ return self.input_dict + def convert_to_eager(self, tensor): + """Convert a graph tensor accessed in the loss to an eager tensor. + + Experimental. 
+ """ + if tf.executing_eagerly(): + return self._eager_tensors[tensor] + else: + self._needs_eager_conversion.add(tensor) + return tensor + @override(TFPolicy) def copy(self, existing_inputs): """Creates a copy of self using existing input placeholders.""" + if self.config["use_eager"]: + raise ValueError( + "eager not implemented for multi-GPU, try setting " + "`simple_optimizer: true`") + # Note that there might be RNN state inputs at the end of the list if self._state_inputs: num_state_inputs = len(self._state_inputs) + 1 @@ -297,6 +315,38 @@ class DynamicTFPolicy(TFPolicy): loss = self._do_loss_init(batch_tensors) for k in sorted(batch_tensors.accessed_keys): loss_inputs.append((k, batch_tensors[k])) + + # XXX experimental support for automatically eagerifying the loss. + # The main limitation right now is that TF doesn't support mixing eager + # and non-eager tensors, so losses that read non-eager tensors through + # `policy` need to use `policy.convert_to_eager(tensor)`. + if self.config["use_eager"]: + if not self.model: + raise ValueError("eager not implemented in this case") + graph_tensors = list(self._needs_eager_conversion) + + def gen_loss(model_outputs, *args): + # fill in the batch tensor dict with eager tensors + eager_inputs = dict( + zip([k for (k, v) in loss_inputs], + args[:len(loss_inputs)])) + # fill in the eager versions of all accessed graph tensors + self._eager_tensors = dict( + zip(graph_tensors, args[len(loss_inputs):])) + # patch the action dist to use eager mode tensors + self.action_dist.inputs = model_outputs + return self._loss_fn(self, eager_inputs) + + # TODO(ekl) also handle the stats funcs + loss = tf.py_function( + gen_loss, + # cast works around TypeError: Cannot convert provided value + # to EagerTensor.
Provided value: 0.0 Requested dtype: int64 + [self.model.outputs] + [ + tf.cast(v, tf.float32) for (k, v) in loss_inputs + ] + [tf.cast(t, tf.float32) for t in graph_tensors], + tf.float32) + TFPolicy._initialize_loss(self, loss, loss_inputs) if self._grad_stats_fn: self._stats_fetches.update(self._grad_stats_fn(self, self._grads))