[rllib] Remove experimental eager support

2026-06-30 19:08:52 +08:00 · 2019-07-21 12:27:17 -07:00
parent b0c0de49a2
commit f9043cc49a
8 changed files with 16 additions and 114 deletions
@@ -41,9 +41,8 @@ def actor_critic_loss(policy, batch_tensors):
    policy.loss = A3CLoss(
        policy.action_dist, batch_tensors[SampleBatch.ACTIONS],
        batch_tensors[Postprocessing.ADVANTAGES],
-        batch_tensors[Postprocessing.VALUE_TARGETS],
-        policy.convert_to_eager(policy.vf), policy.config["vf_loss_coeff"],
-        policy.config["entropy_coeff"])
+        batch_tensors[Postprocessing.VALUE_TARGETS], policy.vf,
+        policy.config["vf_loss_coeff"], policy.config["entropy_coeff"])
    return policy.loss.total_loss


@@ -91,11 +90,10 @@ class ValueNetworkMixin(object):

 def stats(policy, batch_tensors):
    return {
-        "cur_lr": tf.cast(policy.convert_to_eager(policy.cur_lr), tf.float64),
+        "cur_lr": tf.cast(policy.cur_lr, tf.float64),
        "policy_loss": policy.loss.pi_loss,
        "policy_entropy": policy.loss.entropy,
-        "var_gnorm": tf.global_norm(
-            [policy.convert_to_eager(x) for x in policy.var_list]),
+        "var_gnorm": tf.global_norm([x for x in policy.var_list]),
        "vf_loss": policy.loss.vf_loss,
    }

@@ -107,9 +107,8 @@ class PPOLoss(object):

 def ppo_surrogate_loss(policy, batch_tensors):
    if policy.state_in:
-        max_seq_len = tf.reduce_max(policy.convert_to_eager(policy.seq_lens))
-        mask = tf.sequence_mask(
-            policy.convert_to_eager(policy.seq_lens), max_seq_len)
+        max_seq_len = tf.reduce_max(policy.seq_lens)
+        mask = tf.sequence_mask(policy.seq_lens, max_seq_len)
        mask = tf.reshape(mask, [-1])
    else:
        mask = tf.ones_like(
@@ -123,10 +122,10 @@ def ppo_surrogate_loss(policy, batch_tensors):
        batch_tensors[BEHAVIOUR_LOGITS],
        batch_tensors[SampleBatch.VF_PREDS],
        policy.action_dist,
-        policy.convert_to_eager(policy.value_function),
-        policy.convert_to_eager(policy.kl_coeff),
+        policy.value_function,
+        policy.kl_coeff,
        mask,
-        entropy_coeff=policy.convert_to_eager(policy.entropy_coeff),
+        entropy_coeff=policy.entropy_coeff,
        clip_param=policy.config["clip_param"],
        vf_clip_param=policy.config["vf_clip_param"],
        vf_loss_coeff=policy.config["vf_loss_coeff"],
@@ -137,19 +136,17 @@ def ppo_surrogate_loss(policy, batch_tensors):

 def kl_and_loss_stats(policy, batch_tensors):
    return {
-        "cur_kl_coeff": tf.cast(
-            policy.convert_to_eager(policy.kl_coeff), tf.float64),
-        "cur_lr": tf.cast(policy.convert_to_eager(policy.cur_lr), tf.float64),
+        "cur_kl_coeff": tf.cast(policy.kl_coeff, tf.float64),
+        "cur_lr": tf.cast(policy.cur_lr, tf.float64),
        "total_loss": policy.loss_obj.loss,
        "policy_loss": policy.loss_obj.mean_policy_loss,
        "vf_loss": policy.loss_obj.mean_vf_loss,
        "vf_explained_var": explained_variance(
            batch_tensors[Postprocessing.VALUE_TARGETS],
-            policy.convert_to_eager(policy.value_function)),
+            policy.value_function),
        "kl": policy.loss_obj.mean_kl,
        "entropy": policy.loss_obj.mean_entropy,
-        "entropy_coeff": tf.cast(
-            policy.convert_to_eager(policy.entropy_coeff), tf.float64),
+        "entropy_coeff": tf.cast(policy.entropy_coeff, tf.float64),
    }


@@ -67,9 +67,7 @@ COMMON_CONFIG = {
    },
    # Whether to attempt to continue training if a worker crashes.
    "ignore_worker_failures": False,
-    # Execute TF loss functions in eager mode. This is currently experimental
-    # and only really works with the basic PG algorithm.
-    "use_eager": False,
+    # Log system resource metrics to results.
    "log_sys_usage": True,

    # === Policy ===
@@ -51,11 +51,6 @@ def policy_gradient_loss(policy, batch_tensors):

    Here `compute_penalty` prints the actions and rewards for debugging, and
    also computes a (dummy) penalty term to add to the loss.
-
-    Alternatively, you can set config["use_eager"] = True, which will try to
-    automatically eagerify the entire loss function. However, this only works
-    if your loss doesn't reference any non-eager tensors. It also won't work
-    with the multi-GPU optimizer used by PPO.
    """

    def compute_penalty(actions, rewards):
@@ -23,9 +23,6 @@ logger = logging.getLogger(__name__)
 class DynamicTFPolicy(TFPolicy):
    """A TFPolicy that auto-defines placeholders dynamically at runtime.

-    This class also supports eager execution if config["use_eager"] is True.
-    Eager execution is implemented using a py_function op inside graph mode.
-
    Initialization of this class occurs in two phases.
      * Phase 1: the model is created and model variables are initialized.
      * Phase 2: a fake batch of data is created, sent to the trajectory
@@ -33,9 +30,7 @@ class DynamicTFPolicy(TFPolicy):
        function. The loss and stats functions are initialized with these
        placeholders.

-    Initialization defines the static graph. When using eager execution, a
-    corresponding imperative py_function is also generated as an embedded op
-    inside the static graph.
+    Initialization defines the static graph.
    """

    def __init__(self,
@@ -198,8 +193,6 @@ class DynamicTFPolicy(TFPolicy):
            batch_divisibility_req=batch_divisibility_req)

        # Phase 2 init
-        self._needs_eager_conversion = set()
-        self._eager_tensors = {}
        before_loss_init(self, obs_space, action_space, config)
        if not existing_inputs:
            self._initialize_loss()
@@ -211,17 +204,6 @@ class DynamicTFPolicy(TFPolicy):
        """
        return self.input_dict

-    def convert_to_eager(self, tensor):
-        """Convert a graph tensor accessed in the loss to an eager tensor.
-
-        Experimental.
-        """
-        if tf.executing_eagerly():
-            return self._eager_tensors[tensor]
-        else:
-            self._needs_eager_conversion.add(tensor)
-            return tensor
-
    @override(TFPolicy)
    def copy(self, existing_inputs):
        """Creates a copy of self using existing input placeholders."""
@@ -260,10 +242,6 @@ class DynamicTFPolicy(TFPolicy):
        loss_inputs = [(k, existing_inputs[i])
                       for i, (k, _) in enumerate(self._loss_inputs)]

-        if self.config["use_eager"]:
-            loss, new_stats = instance._gen_eager_loss_op(loss_inputs)
-            instance._stats_fetches = new_stats
-
        TFPolicy._initialize_loss(instance, loss, loss_inputs)
        if instance._grad_stats_fn:
            instance._stats_fetches.update(
@@ -348,14 +326,6 @@ class DynamicTFPolicy(TFPolicy):
        for k in sorted(batch_tensors.accessed_keys):
            loss_inputs.append((k, batch_tensors[k]))

-        # XXX experimental support for automatically eagerifying the loss.
-        # The main limitation right now is that TF doesn't support mixing eager
-        # and non-eager tensors, so losses that read non-eager tensors through
-        # `policy` need to use `policy.convert_to_eager(tensor)`.
-        if self.config["use_eager"]:
-            loss, new_stats = self._gen_eager_loss_op(loss_inputs)
-            self._stats_fetches = new_stats
-
        TFPolicy._initialize_loss(self, loss, loss_inputs)
        if self._grad_stats_fn:
            self._stats_fetches.update(self._grad_stats_fn(self, self._grads))
@@ -368,36 +338,3 @@ class DynamicTFPolicy(TFPolicy):
        if self._update_ops_fn:
            self._update_ops = self._update_ops_fn(self)
        return loss
-
-    def _gen_eager_loss_op(self, loss_inputs):
-        graph_tensors = list(self._needs_eager_conversion)
-        stat_items = list(self._stats_fetches.items())
-
-        def gen_loss(model_outputs, *args):
-            # fill in the batch tensor dict with eager ensors
-            eager_inputs = dict(
-                zip([k for (k, v) in loss_inputs], args[:len(loss_inputs)]))
-            # fill in the eager versions of all accessed graph tensors
-            self._eager_tensors = dict(
-                zip(graph_tensors, args[len(loss_inputs):]))
-            # patch the action dist to use eager mode tensors
-            self.action_dist.inputs = model_outputs
-            loss = self._loss_fn(self, eager_inputs)
-            if self._stats_fn:
-                stats = self._stats_fn(self, eager_inputs)
-            return [loss] + [stats[k] for (k, v) in stat_items]
-
-        eager_out = tf.py_function(
-            gen_loss,
-            # cast works around TypeError: Cannot convert provided value
-            # to EagerTensor. Provided value: 0.0 Requested dtype: int64
-            [self.model_out] + [
-                tf.cast(v, tf.float32) for (k, v) in loss_inputs
-            ] + [tf.cast(t, tf.float32) for t in graph_tensors],
-            Tout=[tf.float32] + [v.dtype for (k, v) in stat_items])
-        stats = {
-            k: stat_tensor
-            for (stat_tensor, (k, v)) in zip(eager_out[1:], stat_items)
-        }
-
-        return eager_out[0], stats
@@ -41,7 +41,7 @@ def build_tf_policy(name,
    This means that you can e.g., depend on any policy attributes created in
    the running of `loss_fn` in later functions such as `stats_fn`.

-    In eager mode (experimental), the following functions will be run
+    In eager mode (to be implemented), the following functions will be run
    repeatedly on each eager execution: loss_fn, stats_fn

    This means that these functions should not define any variables internally,