From 9e328fbe6f94a069b31cd98511244694efc96f92 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekhliang@gmail.com>
Date: Fri, 7 Jun 2019 16:42:37 -0700
Subject: [PATCH] [rllib] Add docs on how to use TF eager execution (#4927)

---
 ci/jenkins_tests/run_rllib_tests.sh          |  10 ++
 doc/source/rllib-concepts.rst                |  31 ++++++
 doc/source/rllib-examples.rst                |   2 +
 doc/source/rllib-training.rst                |   7 ++
 doc/source/rllib.rst                         |   2 +
 python/ray/rllib/agents/a3c/a3c_tf_policy.py |   5 +-
 python/ray/rllib/agents/ppo/ppo_policy.py    |  10 +-
 python/ray/rllib/agents/trainer.py           |   3 +
 python/ray/rllib/examples/eager_execution.py | 101 +++++++++++++++++++
 python/ray/rllib/policy/dynamic_tf_policy.py |  50 +++++++++
 10 files changed, 215 insertions(+), 6 deletions(-)
 create mode 100644 python/ray/rllib/examples/eager_execution.py

diff --git a/ci/jenkins_tests/run_rllib_tests.sh b/ci/jenkins_tests/run_rllib_tests.sh
index a97bf5517..13036ae7d 100644
--- a/ci/jenkins_tests/run_rllib_tests.sh
+++ b/ci/jenkins_tests/run_rllib_tests.sh
@@ -392,6 +392,16 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     /ray/ci/suppress_output python /ray/python/ray/rllib/examples/rollout_worker_custom_workflow.py
 
+docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
+    /ray/ci/suppress_output python /ray/python/ray/rllib/examples/eager_execution.py --iters=2
+
+docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
+    /ray/ci/suppress_output python /ray/python/ray/rllib/train.py \
+    --env CartPole-v0 \
+    --run PPO \
+    --stop '{"training_iteration": 1}' \
+    --config '{"use_eager": true, "simple_optimizer": true}'
+
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     /ray/ci/suppress_output python /ray/python/ray/rllib/examples/custom_tf_policy.py --iters=2
 
diff --git a/doc/source/rllib-concepts.rst b/doc/source/rllib-concepts.rst
index b7b3ff823..4b00f5636 100644
--- a/doc/source/rllib-concepts.rst
+++ b/doc/source/rllib-concepts.rst
@@ -346,6 +346,37 @@ In PPO we run ``setup_mixins`` before the loss function is called (i.e., ``befor
 
 Finally, note that you do not have to use ``build_tf_policy`` to define a TensorFlow policy. You can alternatively subclass ``Policy``, ``TFPolicy``, or ``DynamicTFPolicy`` as convenient.
 
+Building Policies in TensorFlow Eager
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+While RLlib runs all TF operations in graph mode, you can still leverage TensorFlow eager using `tf.py_function <https://www.tensorflow.org/api_docs/python/tf/py_function>`__. However, note that eager and non-eager tensors cannot be mixed within the ``py_function``. Here's an example of embedding eager execution within a policy loss function:
+
+.. code-block:: python
+
+    def eager_loss(policy, batch_tensors):
+        """Example of using embedded eager execution in a custom loss.
+
+        Here `compute_penalty` prints the actions and rewards for debugging, and
+        also computes a (dummy) penalty term to add to the loss.
+        """
+
+        def compute_penalty(actions, rewards):
+            penalty = tf.reduce_mean(tf.cast(actions, tf.float32))
+            if random.random() > 0.9:
+                print("The eagerly computed penalty is", penalty, actions, rewards)
+            return penalty
+
+        actions = batch_tensors[SampleBatch.ACTIONS]
+        rewards = batch_tensors[SampleBatch.REWARDS]
+        penalty = tf.py_function(
+            compute_penalty, [actions, rewards], Tout=tf.float32)
+
+        return penalty - tf.reduce_mean(policy.action_dist.logp(actions) * rewards)
+
+You can find a runnable file for the above eager execution example `here <https://github.com/ray-project/ray/blob/master/python/ray/rllib/examples/eager_execution.py>`__.
+
+There is also experimental support for running the entire loss function in eager mode. This can be enabled with ``use_eager: True``, e.g., ``rllib train --env=CartPole-v0 --run=PPO --config='{"use_eager": true, "simple_optimizer": true}'``. However, this currently only works for a couple of algorithms.
+
 Building Policies in PyTorch
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/doc/source/rllib-examples.rst b/doc/source/rllib-examples.rst
index 13bfdc68b..604abf394 100644
--- a/doc/source/rllib-examples.rst
+++ b/doc/source/rllib-examples.rst
@@ -38,6 +38,8 @@ Custom Envs and Models
    Example of adding batch norm layers to a custom model.
 - `Parametric actions <https://github.com/ray-project/ray/blob/master/python/ray/rllib/examples/parametric_action_cartpole.py>`__:
    Example of how to handle variable-length or parametric action spaces.
+- `Eager execution <https://github.com/ray-project/ray/blob/master/python/ray/rllib/examples/eager_execution.py>`__:
+   Example of how to leverage TensorFlow eager to simplify debugging and design of custom models and policies.
 
 Serving and Offline
 -------------------
diff --git a/doc/source/rllib-training.rst b/doc/source/rllib-training.rst
index 824ef4c3d..9c365f8fb 100644
--- a/doc/source/rllib-training.rst
+++ b/doc/source/rllib-training.rst
@@ -367,6 +367,13 @@ The ``"monitor": true`` config can be used to save Gym episode videos to the res
     openaigym.video.0.31403.video000000.meta.json
     openaigym.video.0.31403.video000000.mp4
 
+TensorFlow Eager
+~~~~~~~~~~~~~~~~
+
+While RLlib uses TF graph mode for all computations, you can still leverage TF eager to inspect the intermediate state of computations using `tf.py_function <https://www.tensorflow.org/api_docs/python/tf/py_function>`__. Here's an example of using eager mode in `a custom RLlib model and loss <https://github.com/ray-project/ray/blob/master/python/ray/rllib/examples/eager_execution.py>`__.
+
+There is also experimental support for running the entire loss function in eager mode. This can be enabled with ``use_eager: True``, e.g., ``rllib train --env=CartPole-v0 --run=PPO --config='{"use_eager": true, "simple_optimizer": true}'``. However, this currently only works for a couple of algorithms.
+
 Episode Traces
 ~~~~~~~~~~~~~~
 
diff --git a/doc/source/rllib.rst b/doc/source/rllib.rst
index f0571b23c..d0d9d715a 100644
--- a/doc/source/rllib.rst
+++ b/doc/source/rllib.rst
@@ -101,6 +101,8 @@ Concepts and Building Custom Algorithms
 
    -  `Building Policies in TensorFlow <rllib-concepts.html#building-policies-in-tensorflow>`__
 
+   -  `Building Policies in TensorFlow Eager <rllib-concepts.html#building-policies-in-tensorflow-eager>`__
+
    -  `Building Policies in PyTorch <rllib-concepts.html#building-policies-in-pytorch>`__
 
    -  `Extending Existing Policies <rllib-concepts.html#extending-existing-policies>`__
diff --git a/python/ray/rllib/agents/a3c/a3c_tf_policy.py b/python/ray/rllib/agents/a3c/a3c_tf_policy.py
index ed3676472..d05f496a7 100644
--- a/python/ray/rllib/agents/a3c/a3c_tf_policy.py
+++ b/python/ray/rllib/agents/a3c/a3c_tf_policy.py
@@ -41,8 +41,9 @@ def actor_critic_loss(policy, batch_tensors):
     policy.loss = A3CLoss(
         policy.action_dist, batch_tensors[SampleBatch.ACTIONS],
         batch_tensors[Postprocessing.ADVANTAGES],
-        batch_tensors[Postprocessing.VALUE_TARGETS], policy.vf,
-        policy.config["vf_loss_coeff"], policy.config["entropy_coeff"])
+        batch_tensors[Postprocessing.VALUE_TARGETS],
+        policy.convert_to_eager(policy.vf), policy.config["vf_loss_coeff"],
+        policy.config["entropy_coeff"])
     return policy.loss.total_loss
 
 
diff --git a/python/ray/rllib/agents/ppo/ppo_policy.py b/python/ray/rllib/agents/ppo/ppo_policy.py
index 4b391cab2..ad79d90fa 100644
--- a/python/ray/rllib/agents/ppo/ppo_policy.py
+++ b/python/ray/rllib/agents/ppo/ppo_policy.py
@@ -106,8 +106,10 @@ class PPOLoss(object):
 
 def ppo_surrogate_loss(policy, batch_tensors):
     if policy.model.state_in:
-        max_seq_len = tf.reduce_max(policy.model.seq_lens)
-        mask = tf.sequence_mask(policy.model.seq_lens, max_seq_len)
+        max_seq_len = tf.reduce_max(
+            policy.convert_to_eager(policy.model.seq_lens))
+        mask = tf.sequence_mask(
+            policy.convert_to_eager(policy.model.seq_lens), max_seq_len)
         mask = tf.reshape(mask, [-1])
     else:
         mask = tf.ones_like(
@@ -121,8 +123,8 @@ def ppo_surrogate_loss(policy, batch_tensors):
         batch_tensors[BEHAVIOUR_LOGITS],
         batch_tensors[SampleBatch.VF_PREDS],
         policy.action_dist,
-        policy.value_function,
-        policy.kl_coeff,
+        policy.convert_to_eager(policy.value_function),
+        policy.convert_to_eager(policy.kl_coeff),
         mask,
         entropy_coeff=policy.config["entropy_coeff"],
         clip_param=policy.config["clip_param"],
diff --git a/python/ray/rllib/agents/trainer.py b/python/ray/rllib/agents/trainer.py
index f08b23e93..a0d48d2ef 100644
--- a/python/ray/rllib/agents/trainer.py
+++ b/python/ray/rllib/agents/trainer.py
@@ -67,6 +67,9 @@ COMMON_CONFIG = {
     },
     # Whether to attempt to continue training if a worker crashes.
     "ignore_worker_failures": False,
+    # Execute TF loss functions in eager mode. This is currently experimental
+    # and only really works with the basic PG algorithm.
+    "use_eager": False,
 
     # === Policy ===
     # Arguments to pass to model. See models/catalog.py for a full list of the
diff --git a/python/ray/rllib/examples/eager_execution.py b/python/ray/rllib/examples/eager_execution.py
new file mode 100644
index 000000000..a3c418a33
--- /dev/null
+++ b/python/ray/rllib/examples/eager_execution.py
@@ -0,0 +1,101 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import random
+
+import ray
+from ray import tune
+from ray.rllib.agents.trainer_template import build_trainer
+from ray.rllib.models import FullyConnectedNetwork, Model, ModelCatalog
+from ray.rllib.policy.sample_batch import SampleBatch
+from ray.rllib.policy.tf_policy_template import build_tf_policy
+from ray.rllib.utils import try_import_tf
+
+tf = try_import_tf()
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--iters", type=int, default=200)
+
+
+class EagerModel(Model):
+    """Example of using embedded eager execution in a custom model.
+
+    This shows how to use tf.py_function() to execute a snippet of TF code
+    in eager mode. Here the `self.forward_eager` method just prints out
+    the intermediate tensor for debug purposes, but you can in general
+    perform any TF eager operation in tf.py_function().
+    """
+
+    def _build_layers_v2(self, input_dict, num_outputs, options):
+        self.fcnet = FullyConnectedNetwork(input_dict, self.obs_space,
+                                           self.action_space, num_outputs,
+                                           options)
+        feature_out = tf.py_function(self.forward_eager,
+                                     [self.fcnet.last_layer], tf.float32)
+
+        with tf.control_dependencies([feature_out]):
+            return tf.identity(self.fcnet.outputs), feature_out
+
+    def forward_eager(self, feature_layer):
+        assert tf.executing_eagerly()
+        if random.random() > 0.99:
+            print("Eagerly printing the feature layer mean value",
+                  tf.reduce_mean(feature_layer))
+        return feature_layer
+
+
+def policy_gradient_loss(policy, batch_tensors):
+    """Example of using embedded eager execution in a custom loss.
+
+    Here `compute_penalty` prints the actions and rewards for debugging, and
+    also computes a (dummy) penalty term to add to the loss.
+
+    Alternatively, you can set config["use_eager"] = True, which will try to
+    automatically eagerify the entire loss function. However, this only works
+    if your loss doesn't reference any non-eager tensors. It also won't work
+    with the multi-GPU optimizer used by PPO.
+    """
+
+    def compute_penalty(actions, rewards):
+        assert tf.executing_eagerly()
+        penalty = tf.reduce_mean(tf.cast(actions, tf.float32))
+        if random.random() > 0.9:
+            print("The eagerly computed penalty is", penalty, actions, rewards)
+        return penalty
+
+    actions = batch_tensors[SampleBatch.ACTIONS]
+    rewards = batch_tensors[SampleBatch.REWARDS]
+    penalty = tf.py_function(
+        compute_penalty, [actions, rewards], Tout=tf.float32)
+
+    return penalty - tf.reduce_mean(policy.action_dist.logp(actions) * rewards)
+
+
+# <class 'ray.rllib.policy.tf_policy_template.MyTFPolicy'>
+MyTFPolicy = build_tf_policy(
+    name="MyTFPolicy",
+    loss_fn=policy_gradient_loss,
+)
+
+# <class 'ray.rllib.agents.trainer_template.MyCustomTrainer'>
+MyTrainer = build_trainer(
+    name="MyCustomTrainer",
+    default_policy=MyTFPolicy,
+)
+
+if __name__ == "__main__":
+    ray.init()
+    args = parser.parse_args()
+    ModelCatalog.register_custom_model("eager_model", EagerModel)
+    tune.run(
+        MyTrainer,
+        stop={"training_iteration": args.iters},
+        config={
+            "env": "CartPole-v0",
+            "num_workers": 0,
+            "model": {
+                "custom_model": "eager_model"
+            },
+        })
diff --git a/python/ray/rllib/policy/dynamic_tf_policy.py b/python/ray/rllib/policy/dynamic_tf_policy.py
index 0240f275d..23014553b 100644
--- a/python/ray/rllib/policy/dynamic_tf_policy.py
+++ b/python/ray/rllib/policy/dynamic_tf_policy.py
@@ -167,6 +167,8 @@ class DynamicTFPolicy(TFPolicy):
             batch_divisibility_req=batch_divisibility_req)
 
         # Phase 2 init
+        self._needs_eager_conversion = set()
+        self._eager_tensors = {}
         before_loss_init(self, obs_space, action_space, config)
         if not existing_inputs:
             self._initialize_loss()
@@ -178,10 +180,26 @@ class DynamicTFPolicy(TFPolicy):
         """
         return self.input_dict
 
+    def convert_to_eager(self, tensor):
+        """Return the eager counterpart of a graph tensor read by the loss.
+
+        Experimental. In graph mode the tensor is recorded and returned as-is.
+        """
+        if tf.executing_eagerly():
+            return self._eager_tensors[tensor]
+        else:
+            self._needs_eager_conversion.add(tensor)
+            return tensor
+
     @override(TFPolicy)
     def copy(self, existing_inputs):
         """Creates a copy of self using existing input placeholders."""
 
+        if self.config["use_eager"]:
+            raise ValueError(
+                "eager not implemented for multi-GPU, try setting "
+                "`simple_optimizer: true`")
+
         # Note that there might be RNN state inputs at the end of the list
         if self._state_inputs:
             num_state_inputs = len(self._state_inputs) + 1
@@ -297,6 +315,38 @@ class DynamicTFPolicy(TFPolicy):
         loss = self._do_loss_init(batch_tensors)
         for k in sorted(batch_tensors.accessed_keys):
             loss_inputs.append((k, batch_tensors[k]))
+
+        # XXX experimental support for automatically eagerifying the loss.
+        # The main limitation right now is that TF doesn't support mixing eager
+        # and non-eager tensors, so losses that read non-eager tensors through
+        # `policy` need to use `policy.convert_to_eager(tensor)`.
+        if self.config["use_eager"]:
+            if not self.model:
+                raise ValueError("eager not implemented in this case")
+            graph_tensors = list(self._needs_eager_conversion)
+
+            def gen_loss(model_outputs, *args):
+                # fill in the batch tensor dict with eager tensors
+                eager_inputs = dict(
+                    zip([k for (k, v) in loss_inputs],
+                        args[:len(loss_inputs)]))
+                # fill in the eager versions of all accessed graph tensors
+                self._eager_tensors = dict(
+                    zip(graph_tensors, args[len(loss_inputs):]))
+                # patch the action dist to use eager mode tensors
+                self.action_dist.inputs = model_outputs
+                return self._loss_fn(self, eager_inputs)
+
+            # TODO(ekl) also handle the stats funcs
+            loss = tf.py_function(
+                gen_loss,
+                # cast works around TypeError: Cannot convert provided value
+                # to EagerTensor. Provided value: 0.0 Requested dtype: int64
+                [self.model.outputs] + [
+                    tf.cast(v, tf.float32) for (k, v) in loss_inputs
+                ] + [tf.cast(t, tf.float32) for t in graph_tensors],
+                tf.float32)
+
         TFPolicy._initialize_loss(self, loss, loss_inputs)
         if self._grad_stats_fn:
             self._stats_fetches.update(self._grad_stats_fn(self, self._grads))