From f9043cc49abbfa6a3caf4a5b50d87c7a7b19c572 Mon Sep 17 00:00:00 2001
From: Eric Liang <ekhliang@gmail.com>
Date: Sun, 21 Jul 2019 12:27:17 -0700
Subject: [PATCH] [rllib] Remove experimental eager support

---
 ci/jenkins_tests/run_rllib_tests.sh           | 21 ------
 doc/source/rllib-concepts.rst                 |  2 -
 python/ray/rllib/agents/a3c/a3c_tf_policy.py  | 10 ++-
 python/ray/rllib/agents/ppo/ppo_policy.py     | 21 +++---
 python/ray/rllib/agents/trainer.py            |  4 +-
 python/ray/rllib/examples/eager_execution.py  |  5 --
 python/ray/rllib/policy/dynamic_tf_policy.py  | 65 +------------------
 python/ray/rllib/policy/tf_policy_template.py |  2 +-
 8 files changed, 16 insertions(+), 114 deletions(-)

diff --git a/ci/jenkins_tests/run_rllib_tests.sh b/ci/jenkins_tests/run_rllib_tests.sh
index 5f433e2b2..7e235c948 100644
--- a/ci/jenkins_tests/run_rllib_tests.sh
+++ b/ci/jenkins_tests/run_rllib_tests.sh
@@ -407,27 +407,6 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     /ray/ci/suppress_output python /ray/python/ray/rllib/examples/eager_execution.py --iters=2
 
-docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
-    /ray/ci/suppress_output /ray/python/ray/rllib/train.py \
-    --env CartPole-v0 \
-    --run PG \
-    --stop '{"training_iteration": 1}' \
-    --config '{"use_eager": true}'
-
-docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
-    /ray/ci/suppress_output /ray/python/ray/rllib/train.py \
-    --env CartPole-v0 \
-    --run A2C \
-    --stop '{"training_iteration": 1}' \
-    --config '{"use_eager": true}'
-
-docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
-    /ray/ci/suppress_output /ray/python/ray/rllib/train.py \
-    --env CartPole-v0 \
-    --run PPO \
-    --stop '{"training_iteration": 1}' \
-    --config '{"use_eager": true}'
-
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     /ray/ci/suppress_output python /ray/python/ray/rllib/examples/custom_tf_policy.py --iters=2
 
diff --git a/doc/source/rllib-concepts.rst b/doc/source/rllib-concepts.rst
index d5b5d8039..a14e42158 100644
--- a/doc/source/rllib-concepts.rst
+++ b/doc/source/rllib-concepts.rst
@@ -437,8 +437,6 @@ While RLlib runs all TF operations in graph mode, you can still leverage TensorF
 
 You can find a runnable file for the above eager execution example `here <https://github.com/ray-project/ray/blob/master/python/ray/rllib/examples/eager_execution.py>`__.
 
-There is also experimental support for running the entire loss function in eager mode. This can be enabled with ``use_eager: True``, e.g., ``rllib train --env=CartPole-v0 --run=PPO --config='{"use_eager": true}'``. However this currently only works for PG, A2C, and PPO.
-
 Building Policies in PyTorch
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/python/ray/rllib/agents/a3c/a3c_tf_policy.py b/python/ray/rllib/agents/a3c/a3c_tf_policy.py
index 2104d0b90..a614dc2d3 100644
--- a/python/ray/rllib/agents/a3c/a3c_tf_policy.py
+++ b/python/ray/rllib/agents/a3c/a3c_tf_policy.py
@@ -41,9 +41,8 @@ def actor_critic_loss(policy, batch_tensors):
     policy.loss = A3CLoss(
         policy.action_dist, batch_tensors[SampleBatch.ACTIONS],
         batch_tensors[Postprocessing.ADVANTAGES],
-        batch_tensors[Postprocessing.VALUE_TARGETS],
-        policy.convert_to_eager(policy.vf), policy.config["vf_loss_coeff"],
-        policy.config["entropy_coeff"])
+        batch_tensors[Postprocessing.VALUE_TARGETS], policy.vf,
+        policy.config["vf_loss_coeff"], policy.config["entropy_coeff"])
     return policy.loss.total_loss
 
 
@@ -91,11 +90,10 @@ class ValueNetworkMixin(object):
 
 def stats(policy, batch_tensors):
     return {
-        "cur_lr": tf.cast(policy.convert_to_eager(policy.cur_lr), tf.float64),
+        "cur_lr": tf.cast(policy.cur_lr, tf.float64),
         "policy_loss": policy.loss.pi_loss,
         "policy_entropy": policy.loss.entropy,
-        "var_gnorm": tf.global_norm(
-            [policy.convert_to_eager(x) for x in policy.var_list]),
+        "var_gnorm": tf.global_norm([x for x in policy.var_list]),
         "vf_loss": policy.loss.vf_loss,
     }
 
diff --git a/python/ray/rllib/agents/ppo/ppo_policy.py b/python/ray/rllib/agents/ppo/ppo_policy.py
index cf99debf3..e87b106fa 100644
--- a/python/ray/rllib/agents/ppo/ppo_policy.py
+++ b/python/ray/rllib/agents/ppo/ppo_policy.py
@@ -107,9 +107,8 @@ class PPOLoss(object):
 
 def ppo_surrogate_loss(policy, batch_tensors):
     if policy.state_in:
-        max_seq_len = tf.reduce_max(policy.convert_to_eager(policy.seq_lens))
-        mask = tf.sequence_mask(
-            policy.convert_to_eager(policy.seq_lens), max_seq_len)
+        max_seq_len = tf.reduce_max(policy.seq_lens)
+        mask = tf.sequence_mask(policy.seq_lens, max_seq_len)
         mask = tf.reshape(mask, [-1])
     else:
         mask = tf.ones_like(
@@ -123,10 +122,10 @@ def ppo_surrogate_loss(policy, batch_tensors):
         batch_tensors[BEHAVIOUR_LOGITS],
         batch_tensors[SampleBatch.VF_PREDS],
         policy.action_dist,
-        policy.convert_to_eager(policy.value_function),
-        policy.convert_to_eager(policy.kl_coeff),
+        policy.value_function,
+        policy.kl_coeff,
         mask,
-        entropy_coeff=policy.convert_to_eager(policy.entropy_coeff),
+        entropy_coeff=policy.entropy_coeff,
         clip_param=policy.config["clip_param"],
         vf_clip_param=policy.config["vf_clip_param"],
         vf_loss_coeff=policy.config["vf_loss_coeff"],
@@ -137,19 +136,17 @@ def ppo_surrogate_loss(policy, batch_tensors):
 
 def kl_and_loss_stats(policy, batch_tensors):
     return {
-        "cur_kl_coeff": tf.cast(
-            policy.convert_to_eager(policy.kl_coeff), tf.float64),
-        "cur_lr": tf.cast(policy.convert_to_eager(policy.cur_lr), tf.float64),
+        "cur_kl_coeff": tf.cast(policy.kl_coeff, tf.float64),
+        "cur_lr": tf.cast(policy.cur_lr, tf.float64),
         "total_loss": policy.loss_obj.loss,
         "policy_loss": policy.loss_obj.mean_policy_loss,
         "vf_loss": policy.loss_obj.mean_vf_loss,
         "vf_explained_var": explained_variance(
             batch_tensors[Postprocessing.VALUE_TARGETS],
-            policy.convert_to_eager(policy.value_function)),
+            policy.value_function),
         "kl": policy.loss_obj.mean_kl,
         "entropy": policy.loss_obj.mean_entropy,
-        "entropy_coeff": tf.cast(
-            policy.convert_to_eager(policy.entropy_coeff), tf.float64),
+        "entropy_coeff": tf.cast(policy.entropy_coeff, tf.float64),
     }
 
 
diff --git a/python/ray/rllib/agents/trainer.py b/python/ray/rllib/agents/trainer.py
index a95fb1721..5bd56910f 100644
--- a/python/ray/rllib/agents/trainer.py
+++ b/python/ray/rllib/agents/trainer.py
@@ -67,9 +67,7 @@ COMMON_CONFIG = {
     },
     # Whether to attempt to continue training if a worker crashes.
     "ignore_worker_failures": False,
-    # Execute TF loss functions in eager mode. This is currently experimental
-    # and only really works with the basic PG algorithm.
-    "use_eager": False,
+    # Log system resource metrics to results.
     "log_sys_usage": True,
 
     # === Policy ===
diff --git a/python/ray/rllib/examples/eager_execution.py b/python/ray/rllib/examples/eager_execution.py
index a3c418a33..ffcc0989c 100644
--- a/python/ray/rllib/examples/eager_execution.py
+++ b/python/ray/rllib/examples/eager_execution.py
@@ -51,11 +51,6 @@ def policy_gradient_loss(policy, batch_tensors):
 
     Here `compute_penalty` prints the actions and rewards for debugging, and
     also computes a (dummy) penalty term to add to the loss.
-
-    Alternatively, you can set config["use_eager"] = True, which will try to
-    automatically eagerify the entire loss function. However, this only works
-    if your loss doesn't reference any non-eager tensors. It also won't work
-    with the multi-GPU optimizer used by PPO.
     """
 
     def compute_penalty(actions, rewards):
diff --git a/python/ray/rllib/policy/dynamic_tf_policy.py b/python/ray/rllib/policy/dynamic_tf_policy.py
index c66fb7f53..256fb1a6c 100644
--- a/python/ray/rllib/policy/dynamic_tf_policy.py
+++ b/python/ray/rllib/policy/dynamic_tf_policy.py
@@ -23,9 +23,6 @@ logger = logging.getLogger(__name__)
 class DynamicTFPolicy(TFPolicy):
     """A TFPolicy that auto-defines placeholders dynamically at runtime.
 
-    This class also supports eager execution if config["use_eager"] is True.
-    Eager execution is implemented using a py_function op inside graph mode.
-
     Initialization of this class occurs in two phases.
       * Phase 1: the model is created and model variables are initialized.
       * Phase 2: a fake batch of data is created, sent to the trajectory
@@ -33,9 +30,7 @@ class DynamicTFPolicy(TFPolicy):
         function. The loss and stats functions are initialized with these
         placeholders.
 
-    Initialization defines the static graph. When using eager execution, a
-    corresponding imperative py_function is also generated as an embedded op
-    inside the static graph.
+    Initialization defines the static graph.
     """
 
     def __init__(self,
@@ -198,8 +193,6 @@ class DynamicTFPolicy(TFPolicy):
             batch_divisibility_req=batch_divisibility_req)
 
         # Phase 2 init
-        self._needs_eager_conversion = set()
-        self._eager_tensors = {}
         before_loss_init(self, obs_space, action_space, config)
         if not existing_inputs:
             self._initialize_loss()
@@ -211,17 +204,6 @@ class DynamicTFPolicy(TFPolicy):
         """
         return self.input_dict
 
-    def convert_to_eager(self, tensor):
-        """Convert a graph tensor accessed in the loss to an eager tensor.
-
-        Experimental.
-        """
-        if tf.executing_eagerly():
-            return self._eager_tensors[tensor]
-        else:
-            self._needs_eager_conversion.add(tensor)
-            return tensor
-
     @override(TFPolicy)
     def copy(self, existing_inputs):
         """Creates a copy of self using existing input placeholders."""
@@ -260,10 +242,6 @@ class DynamicTFPolicy(TFPolicy):
         loss_inputs = [(k, existing_inputs[i])
                        for i, (k, _) in enumerate(self._loss_inputs)]
 
-        if self.config["use_eager"]:
-            loss, new_stats = instance._gen_eager_loss_op(loss_inputs)
-            instance._stats_fetches = new_stats
-
         TFPolicy._initialize_loss(instance, loss, loss_inputs)
         if instance._grad_stats_fn:
             instance._stats_fetches.update(
@@ -348,14 +326,6 @@ class DynamicTFPolicy(TFPolicy):
         for k in sorted(batch_tensors.accessed_keys):
             loss_inputs.append((k, batch_tensors[k]))
 
-        # XXX experimental support for automatically eagerifying the loss.
-        # The main limitation right now is that TF doesn't support mixing eager
-        # and non-eager tensors, so losses that read non-eager tensors through
-        # `policy` need to use `policy.convert_to_eager(tensor)`.
-        if self.config["use_eager"]:
-            loss, new_stats = self._gen_eager_loss_op(loss_inputs)
-            self._stats_fetches = new_stats
-
         TFPolicy._initialize_loss(self, loss, loss_inputs)
         if self._grad_stats_fn:
             self._stats_fetches.update(self._grad_stats_fn(self, self._grads))
@@ -368,36 +338,3 @@ class DynamicTFPolicy(TFPolicy):
         if self._update_ops_fn:
             self._update_ops = self._update_ops_fn(self)
         return loss
-
-    def _gen_eager_loss_op(self, loss_inputs):
-        graph_tensors = list(self._needs_eager_conversion)
-        stat_items = list(self._stats_fetches.items())
-
-        def gen_loss(model_outputs, *args):
-            # fill in the batch tensor dict with eager ensors
-            eager_inputs = dict(
-                zip([k for (k, v) in loss_inputs], args[:len(loss_inputs)]))
-            # fill in the eager versions of all accessed graph tensors
-            self._eager_tensors = dict(
-                zip(graph_tensors, args[len(loss_inputs):]))
-            # patch the action dist to use eager mode tensors
-            self.action_dist.inputs = model_outputs
-            loss = self._loss_fn(self, eager_inputs)
-            if self._stats_fn:
-                stats = self._stats_fn(self, eager_inputs)
-            return [loss] + [stats[k] for (k, v) in stat_items]
-
-        eager_out = tf.py_function(
-            gen_loss,
-            # cast works around TypeError: Cannot convert provided value
-            # to EagerTensor. Provided value: 0.0 Requested dtype: int64
-            [self.model_out] + [
-                tf.cast(v, tf.float32) for (k, v) in loss_inputs
-            ] + [tf.cast(t, tf.float32) for t in graph_tensors],
-            Tout=[tf.float32] + [v.dtype for (k, v) in stat_items])
-        stats = {
-            k: stat_tensor
-            for (stat_tensor, (k, v)) in zip(eager_out[1:], stat_items)
-        }
-
-        return eager_out[0], stats
diff --git a/python/ray/rllib/policy/tf_policy_template.py b/python/ray/rllib/policy/tf_policy_template.py
index fde6cefb3..c75e618e0 100644
--- a/python/ray/rllib/policy/tf_policy_template.py
+++ b/python/ray/rllib/policy/tf_policy_template.py
@@ -41,7 +41,7 @@ def build_tf_policy(name,
     This means that you can e.g., depend on any policy attributes created in
     the running of `loss_fn` in later functions such as `stats_fn`.
 
-    In eager mode (experimental), the following functions will be run
+    In eager mode (to be implemented), the following functions will be run
     repeatedly on each eager execution: loss_fn, stats_fn
 
     This means that these functions should not define any variables internally,