From f9043cc49abbfa6a3caf4a5b50d87c7a7b19c572 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sun, 21 Jul 2019 12:27:17 -0700 Subject: [PATCH] [rllib] Remove experimental eager support --- ci/jenkins_tests/run_rllib_tests.sh | 21 ------ doc/source/rllib-concepts.rst | 2 - python/ray/rllib/agents/a3c/a3c_tf_policy.py | 10 ++- python/ray/rllib/agents/ppo/ppo_policy.py | 21 +++--- python/ray/rllib/agents/trainer.py | 4 +- python/ray/rllib/examples/eager_execution.py | 5 -- python/ray/rllib/policy/dynamic_tf_policy.py | 65 +------------------ python/ray/rllib/policy/tf_policy_template.py | 2 +- 8 files changed, 16 insertions(+), 114 deletions(-) diff --git a/ci/jenkins_tests/run_rllib_tests.sh b/ci/jenkins_tests/run_rllib_tests.sh index 5f433e2b2..7e235c948 100644 --- a/ci/jenkins_tests/run_rllib_tests.sh +++ b/ci/jenkins_tests/run_rllib_tests.sh @@ -407,27 +407,6 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/ci/suppress_output python /ray/python/ray/rllib/examples/eager_execution.py --iters=2 -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ - /ray/ci/suppress_output /ray/python/ray/rllib/train.py \ - --env CartPole-v0 \ - --run PG \ - --stop '{"training_iteration": 1}' \ - --config '{"use_eager": true}' - -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ - /ray/ci/suppress_output /ray/python/ray/rllib/train.py \ - --env CartPole-v0 \ - --run A2C \ - --stop '{"training_iteration": 1}' \ - --config '{"use_eager": true}' - -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ - /ray/ci/suppress_output /ray/python/ray/rllib/train.py \ - --env CartPole-v0 \ - --run PPO \ - --stop '{"training_iteration": 1}' \ - --config '{"use_eager": true}' - docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/ci/suppress_output python /ray/python/ray/rllib/examples/custom_tf_policy.py --iters=2 diff --git a/doc/source/rllib-concepts.rst b/doc/source/rllib-concepts.rst index d5b5d8039..a14e42158 100644 --- a/doc/source/rllib-concepts.rst +++ b/doc/source/rllib-concepts.rst @@ -437,8 +437,6 @@ While RLlib runs all TF operations in graph mode, you can still leverage TensorF You can find a runnable file for the above eager execution example `here `__. -There is also experimental support for running the entire loss function in eager mode. This can be enabled with ``use_eager: True``, e.g., ``rllib train --env=CartPole-v0 --run=PPO --config='{"use_eager": true}'``. However this currently only works for PG, A2C, and PPO. - Building Policies in PyTorch ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/python/ray/rllib/agents/a3c/a3c_tf_policy.py b/python/ray/rllib/agents/a3c/a3c_tf_policy.py index 2104d0b90..a614dc2d3 100644 --- a/python/ray/rllib/agents/a3c/a3c_tf_policy.py +++ b/python/ray/rllib/agents/a3c/a3c_tf_policy.py @@ -41,9 +41,8 @@ def actor_critic_loss(policy, batch_tensors): policy.loss = A3CLoss( policy.action_dist, batch_tensors[SampleBatch.ACTIONS], batch_tensors[Postprocessing.ADVANTAGES], - batch_tensors[Postprocessing.VALUE_TARGETS], - policy.convert_to_eager(policy.vf), policy.config["vf_loss_coeff"], - policy.config["entropy_coeff"]) + batch_tensors[Postprocessing.VALUE_TARGETS], policy.vf, + policy.config["vf_loss_coeff"], policy.config["entropy_coeff"]) return policy.loss.total_loss @@ -91,11 +90,10 @@ class ValueNetworkMixin(object): def stats(policy, batch_tensors): return { - "cur_lr": tf.cast(policy.convert_to_eager(policy.cur_lr), tf.float64), + "cur_lr": tf.cast(policy.cur_lr, tf.float64), "policy_loss": policy.loss.pi_loss, "policy_entropy": policy.loss.entropy, - "var_gnorm": tf.global_norm( - [policy.convert_to_eager(x) for x in policy.var_list]), + "var_gnorm": tf.global_norm([x for x in policy.var_list]), "vf_loss": policy.loss.vf_loss, } diff --git a/python/ray/rllib/agents/ppo/ppo_policy.py b/python/ray/rllib/agents/ppo/ppo_policy.py index cf99debf3..e87b106fa 100644 --- a/python/ray/rllib/agents/ppo/ppo_policy.py +++ b/python/ray/rllib/agents/ppo/ppo_policy.py @@ -107,9 +107,8 @@ class PPOLoss(object): def ppo_surrogate_loss(policy, batch_tensors): if policy.state_in: - max_seq_len = tf.reduce_max(policy.convert_to_eager(policy.seq_lens)) - mask = tf.sequence_mask( - policy.convert_to_eager(policy.seq_lens), max_seq_len) + max_seq_len = tf.reduce_max(policy.seq_lens) + mask = tf.sequence_mask(policy.seq_lens, max_seq_len) mask = tf.reshape(mask, [-1]) else: mask = tf.ones_like( @@ -123,10 +122,10 @@ def ppo_surrogate_loss(policy, batch_tensors): batch_tensors[BEHAVIOUR_LOGITS], batch_tensors[SampleBatch.VF_PREDS], policy.action_dist, - policy.convert_to_eager(policy.value_function), - policy.convert_to_eager(policy.kl_coeff), + policy.value_function, + policy.kl_coeff, mask, - entropy_coeff=policy.convert_to_eager(policy.entropy_coeff), + entropy_coeff=policy.entropy_coeff, clip_param=policy.config["clip_param"], vf_clip_param=policy.config["vf_clip_param"], vf_loss_coeff=policy.config["vf_loss_coeff"], @@ -137,19 +136,17 @@ def ppo_surrogate_loss(policy, batch_tensors): def kl_and_loss_stats(policy, batch_tensors): return { - "cur_kl_coeff": tf.cast( - policy.convert_to_eager(policy.kl_coeff), tf.float64), - "cur_lr": tf.cast(policy.convert_to_eager(policy.cur_lr), tf.float64), + "cur_kl_coeff": tf.cast(policy.kl_coeff, tf.float64), + "cur_lr": tf.cast(policy.cur_lr, tf.float64), "total_loss": policy.loss_obj.loss, "policy_loss": policy.loss_obj.mean_policy_loss, "vf_loss": policy.loss_obj.mean_vf_loss, "vf_explained_var": explained_variance( batch_tensors[Postprocessing.VALUE_TARGETS], - policy.convert_to_eager(policy.value_function)), + policy.value_function), "kl": policy.loss_obj.mean_kl, "entropy": policy.loss_obj.mean_entropy, - "entropy_coeff": tf.cast( - policy.convert_to_eager(policy.entropy_coeff), tf.float64), + "entropy_coeff": tf.cast(policy.entropy_coeff, tf.float64), } diff --git a/python/ray/rllib/agents/trainer.py b/python/ray/rllib/agents/trainer.py index a95fb1721..5bd56910f 100644 --- a/python/ray/rllib/agents/trainer.py +++ b/python/ray/rllib/agents/trainer.py @@ -67,9 +67,7 @@ COMMON_CONFIG = { }, # Whether to attempt to continue training if a worker crashes. "ignore_worker_failures": False, - # Execute TF loss functions in eager mode. This is currently experimental - # and only really works with the basic PG algorithm. - "use_eager": False, + # Log system resource metrics to results. "log_sys_usage": True, # === Policy === diff --git a/python/ray/rllib/examples/eager_execution.py b/python/ray/rllib/examples/eager_execution.py index a3c418a33..ffcc0989c 100644 --- a/python/ray/rllib/examples/eager_execution.py +++ b/python/ray/rllib/examples/eager_execution.py @@ -51,11 +51,6 @@ def policy_gradient_loss(policy, batch_tensors): Here `compute_penalty` prints the actions and rewards for debugging, and also computes a (dummy) penalty term to add to the loss. - - Alternatively, you can set config["use_eager"] = True, which will try to - automatically eagerify the entire loss function. However, this only works - if your loss doesn't reference any non-eager tensors. It also won't work - with the multi-GPU optimizer used by PPO. """ def compute_penalty(actions, rewards): diff --git a/python/ray/rllib/policy/dynamic_tf_policy.py b/python/ray/rllib/policy/dynamic_tf_policy.py index c66fb7f53..256fb1a6c 100644 --- a/python/ray/rllib/policy/dynamic_tf_policy.py +++ b/python/ray/rllib/policy/dynamic_tf_policy.py @@ -23,9 +23,6 @@ logger = logging.getLogger(__name__) class DynamicTFPolicy(TFPolicy): """A TFPolicy that auto-defines placeholders dynamically at runtime. - This class also supports eager execution if config["use_eager"] is True. - Eager execution is implemented using a py_function op inside graph mode. - Initialization of this class occurs in two phases. * Phase 1: the model is created and model variables are initialized. * Phase 2: a fake batch of data is created, sent to the trajectory @@ -33,9 +30,7 @@ class DynamicTFPolicy(TFPolicy): function. The loss and stats functions are initialized with these placeholders. - Initialization defines the static graph. When using eager execution, a - corresponding imperative py_function is also generated as an embedded op - inside the static graph. + Initialization defines the static graph. """ def __init__(self, @@ -198,8 +193,6 @@ class DynamicTFPolicy(TFPolicy): batch_divisibility_req=batch_divisibility_req) # Phase 2 init - self._needs_eager_conversion = set() - self._eager_tensors = {} before_loss_init(self, obs_space, action_space, config) if not existing_inputs: self._initialize_loss() @@ -211,17 +204,6 @@ class DynamicTFPolicy(TFPolicy): """ return self.input_dict - def convert_to_eager(self, tensor): - """Convert a graph tensor accessed in the loss to an eager tensor. - - Experimental. - """ - if tf.executing_eagerly(): - return self._eager_tensors[tensor] - else: - self._needs_eager_conversion.add(tensor) - return tensor - @override(TFPolicy) def copy(self, existing_inputs): """Creates a copy of self using existing input placeholders.""" @@ -260,10 +242,6 @@ class DynamicTFPolicy(TFPolicy): loss_inputs = [(k, existing_inputs[i]) for i, (k, _) in enumerate(self._loss_inputs)] - if self.config["use_eager"]: - loss, new_stats = instance._gen_eager_loss_op(loss_inputs) - instance._stats_fetches = new_stats - TFPolicy._initialize_loss(instance, loss, loss_inputs) if instance._grad_stats_fn: instance._stats_fetches.update( @@ -348,14 +326,6 @@ class DynamicTFPolicy(TFPolicy): for k in sorted(batch_tensors.accessed_keys): loss_inputs.append((k, batch_tensors[k])) - # XXX experimental support for automatically eagerifying the loss. - # The main limitation right now is that TF doesn't support mixing eager - # and non-eager tensors, so losses that read non-eager tensors through - # `policy` need to use `policy.convert_to_eager(tensor)`. - if self.config["use_eager"]: - loss, new_stats = self._gen_eager_loss_op(loss_inputs) - self._stats_fetches = new_stats - TFPolicy._initialize_loss(self, loss, loss_inputs) if self._grad_stats_fn: self._stats_fetches.update(self._grad_stats_fn(self, self._grads)) @@ -368,36 +338,3 @@ class DynamicTFPolicy(TFPolicy): if self._update_ops_fn: self._update_ops = self._update_ops_fn(self) return loss - - def _gen_eager_loss_op(self, loss_inputs): - graph_tensors = list(self._needs_eager_conversion) - stat_items = list(self._stats_fetches.items()) - - def gen_loss(model_outputs, *args): - # fill in the batch tensor dict with eager ensors - eager_inputs = dict( - zip([k for (k, v) in loss_inputs], args[:len(loss_inputs)])) - # fill in the eager versions of all accessed graph tensors - self._eager_tensors = dict( - zip(graph_tensors, args[len(loss_inputs):])) - # patch the action dist to use eager mode tensors - self.action_dist.inputs = model_outputs - loss = self._loss_fn(self, eager_inputs) - if self._stats_fn: - stats = self._stats_fn(self, eager_inputs) - return [loss] + [stats[k] for (k, v) in stat_items] - - eager_out = tf.py_function( - gen_loss, - # cast works around TypeError: Cannot convert provided value - # to EagerTensor. Provided value: 0.0 Requested dtype: int64 - [self.model_out] + [ - tf.cast(v, tf.float32) for (k, v) in loss_inputs - ] + [tf.cast(t, tf.float32) for t in graph_tensors], - Tout=[tf.float32] + [v.dtype for (k, v) in stat_items]) - stats = { - k: stat_tensor - for (stat_tensor, (k, v)) in zip(eager_out[1:], stat_items) - } - - return eager_out[0], stats diff --git a/python/ray/rllib/policy/tf_policy_template.py b/python/ray/rllib/policy/tf_policy_template.py index fde6cefb3..c75e618e0 100644 --- a/python/ray/rllib/policy/tf_policy_template.py +++ b/python/ray/rllib/policy/tf_policy_template.py @@ -41,7 +41,7 @@ def build_tf_policy(name, This means that you can e.g., depend on any policy attributes created in the running of `loss_fn` in later functions such as `stats_fn`. - In eager mode (experimental), the following functions will be run + In eager mode (to be implemented), the following functions will be run repeatedly on each eager execution: loss_fn, stats_fn This means that these functions should not define any variables internally,