[rllib] Remove experimental eager support

This commit is contained in:
Eric Liang
2019-07-21 12:27:17 -07:00
committed by GitHub
parent b0c0de49a2
commit f9043cc49a
8 changed files with 16 additions and 114 deletions
+4 -6
View File
@@ -41,9 +41,8 @@ def actor_critic_loss(policy, batch_tensors):
policy.loss = A3CLoss(
policy.action_dist, batch_tensors[SampleBatch.ACTIONS],
batch_tensors[Postprocessing.ADVANTAGES],
batch_tensors[Postprocessing.VALUE_TARGETS],
policy.convert_to_eager(policy.vf), policy.config["vf_loss_coeff"],
policy.config["entropy_coeff"])
batch_tensors[Postprocessing.VALUE_TARGETS], policy.vf,
policy.config["vf_loss_coeff"], policy.config["entropy_coeff"])
return policy.loss.total_loss
@@ -91,11 +90,10 @@ class ValueNetworkMixin(object):
def stats(policy, batch_tensors):
return {
"cur_lr": tf.cast(policy.convert_to_eager(policy.cur_lr), tf.float64),
"cur_lr": tf.cast(policy.cur_lr, tf.float64),
"policy_loss": policy.loss.pi_loss,
"policy_entropy": policy.loss.entropy,
"var_gnorm": tf.global_norm(
[policy.convert_to_eager(x) for x in policy.var_list]),
"var_gnorm": tf.global_norm([x for x in policy.var_list]),
"vf_loss": policy.loss.vf_loss,
}
+9 -12
View File
@@ -107,9 +107,8 @@ class PPOLoss(object):
def ppo_surrogate_loss(policy, batch_tensors):
if policy.state_in:
max_seq_len = tf.reduce_max(policy.convert_to_eager(policy.seq_lens))
mask = tf.sequence_mask(
policy.convert_to_eager(policy.seq_lens), max_seq_len)
max_seq_len = tf.reduce_max(policy.seq_lens)
mask = tf.sequence_mask(policy.seq_lens, max_seq_len)
mask = tf.reshape(mask, [-1])
else:
mask = tf.ones_like(
@@ -123,10 +122,10 @@ def ppo_surrogate_loss(policy, batch_tensors):
batch_tensors[BEHAVIOUR_LOGITS],
batch_tensors[SampleBatch.VF_PREDS],
policy.action_dist,
policy.convert_to_eager(policy.value_function),
policy.convert_to_eager(policy.kl_coeff),
policy.value_function,
policy.kl_coeff,
mask,
entropy_coeff=policy.convert_to_eager(policy.entropy_coeff),
entropy_coeff=policy.entropy_coeff,
clip_param=policy.config["clip_param"],
vf_clip_param=policy.config["vf_clip_param"],
vf_loss_coeff=policy.config["vf_loss_coeff"],
@@ -137,19 +136,17 @@ def ppo_surrogate_loss(policy, batch_tensors):
def kl_and_loss_stats(policy, batch_tensors):
return {
"cur_kl_coeff": tf.cast(
policy.convert_to_eager(policy.kl_coeff), tf.float64),
"cur_lr": tf.cast(policy.convert_to_eager(policy.cur_lr), tf.float64),
"cur_kl_coeff": tf.cast(policy.kl_coeff, tf.float64),
"cur_lr": tf.cast(policy.cur_lr, tf.float64),
"total_loss": policy.loss_obj.loss,
"policy_loss": policy.loss_obj.mean_policy_loss,
"vf_loss": policy.loss_obj.mean_vf_loss,
"vf_explained_var": explained_variance(
batch_tensors[Postprocessing.VALUE_TARGETS],
policy.convert_to_eager(policy.value_function)),
policy.value_function),
"kl": policy.loss_obj.mean_kl,
"entropy": policy.loss_obj.mean_entropy,
"entropy_coeff": tf.cast(
policy.convert_to_eager(policy.entropy_coeff), tf.float64),
"entropy_coeff": tf.cast(policy.entropy_coeff, tf.float64),
}
+1 -3
View File
@@ -67,9 +67,7 @@ COMMON_CONFIG = {
},
# Whether to attempt to continue training if a worker crashes.
"ignore_worker_failures": False,
# Execute TF loss functions in eager mode. This is currently experimental
# and only really works with the basic PG algorithm.
"use_eager": False,
# Log system resource metrics to results.
"log_sys_usage": True,
# === Policy ===
@@ -51,11 +51,6 @@ def policy_gradient_loss(policy, batch_tensors):
Here `compute_penalty` prints the actions and rewards for debugging, and
also computes a (dummy) penalty term to add to the loss.
Alternatively, you can set config["use_eager"] = True, which will try to
automatically eagerify the entire loss function. However, this only works
if your loss doesn't reference any non-eager tensors. It also won't work
with the multi-GPU optimizer used by PPO.
"""
def compute_penalty(actions, rewards):
+1 -64
View File
@@ -23,9 +23,6 @@ logger = logging.getLogger(__name__)
class DynamicTFPolicy(TFPolicy):
"""A TFPolicy that auto-defines placeholders dynamically at runtime.
This class also supports eager execution if config["use_eager"] is True.
Eager execution is implemented using a py_function op inside graph mode.
Initialization of this class occurs in two phases.
* Phase 1: the model is created and model variables are initialized.
* Phase 2: a fake batch of data is created, sent to the trajectory
@@ -33,9 +30,7 @@ class DynamicTFPolicy(TFPolicy):
function. The loss and stats functions are initialized with these
placeholders.
Initialization defines the static graph. When using eager execution, a
corresponding imperative py_function is also generated as an embedded op
inside the static graph.
Initialization defines the static graph.
"""
def __init__(self,
@@ -198,8 +193,6 @@ class DynamicTFPolicy(TFPolicy):
batch_divisibility_req=batch_divisibility_req)
# Phase 2 init
self._needs_eager_conversion = set()
self._eager_tensors = {}
before_loss_init(self, obs_space, action_space, config)
if not existing_inputs:
self._initialize_loss()
@@ -211,17 +204,6 @@ class DynamicTFPolicy(TFPolicy):
"""
return self.input_dict
def convert_to_eager(self, tensor):
"""Convert a graph tensor accessed in the loss to an eager tensor.
Experimental.
"""
# Behaves differently depending on whether TF is currently executing
# eagerly or building a graph.
if tf.executing_eagerly():
# Look up the value stored for this graph tensor in the
# self._eager_tensors dict; a tensor with no entry raises KeyError.
return self._eager_tensors[tensor]
else:
# Record the tensor in the self._needs_eager_conversion set and hand
# the same graph tensor back unchanged.
self._needs_eager_conversion.add(tensor)
return tensor
@override(TFPolicy)
def copy(self, existing_inputs):
"""Creates a copy of self using existing input placeholders."""
@@ -260,10 +242,6 @@ class DynamicTFPolicy(TFPolicy):
loss_inputs = [(k, existing_inputs[i])
for i, (k, _) in enumerate(self._loss_inputs)]
if self.config["use_eager"]:
loss, new_stats = instance._gen_eager_loss_op(loss_inputs)
instance._stats_fetches = new_stats
TFPolicy._initialize_loss(instance, loss, loss_inputs)
if instance._grad_stats_fn:
instance._stats_fetches.update(
@@ -348,14 +326,6 @@ class DynamicTFPolicy(TFPolicy):
for k in sorted(batch_tensors.accessed_keys):
loss_inputs.append((k, batch_tensors[k]))
# XXX experimental support for automatically eagerifying the loss.
# The main limitation right now is that TF doesn't support mixing eager
# and non-eager tensors, so losses that read non-eager tensors through
# `policy` need to use `policy.convert_to_eager(tensor)`.
if self.config["use_eager"]:
loss, new_stats = self._gen_eager_loss_op(loss_inputs)
self._stats_fetches = new_stats
TFPolicy._initialize_loss(self, loss, loss_inputs)
if self._grad_stats_fn:
self._stats_fetches.update(self._grad_stats_fn(self, self._grads))
@@ -368,36 +338,3 @@ class DynamicTFPolicy(TFPolicy):
if self._update_ops_fn:
self._update_ops = self._update_ops_fn(self)
return loss
def _gen_eager_loss_op(self, loss_inputs):
# Wraps the loss (and stats) computation in a tf.py_function op.
# `loss_inputs` is iterated as (key, tensor) pairs.
# Returns a pair: the first py_function output (the loss) and a dict that
# maps each existing stats key to the corresponding py_function output.
# Snapshot the tensors recorded in self._needs_eager_conversion and the
# current stats fetches so their ordering is fixed for the zips below.
graph_tensors = list(self._needs_eager_conversion)
stat_items = list(self._stats_fetches.items())
# `args` carries the loss input values first, then the graph tensor values,
# in the same order they are passed to tf.py_function below.
def gen_loss(model_outputs, *args):
# fill in the batch tensor dict with eager tensors
eager_inputs = dict(
zip([k for (k, v) in loss_inputs], args[:len(loss_inputs)]))
# fill in the eager versions of all accessed graph tensors
self._eager_tensors = dict(
zip(graph_tensors, args[len(loss_inputs):]))
# patch the action dist to use eager mode tensors
self.action_dist.inputs = model_outputs
loss = self._loss_fn(self, eager_inputs)
# NOTE(review): `stats` is only bound when self._stats_fn is set; if
# stat_items is non-empty without it, the return below would raise
# NameError -- confirm this combination cannot occur.
if self._stats_fn:
stats = self._stats_fn(self, eager_inputs)
# Output order: loss first, then one value per entry of stat_items.
return [loss] + [stats[k] for (k, v) in stat_items]
eager_out = tf.py_function(
gen_loss,
# cast works around TypeError: Cannot convert provided value
# to EagerTensor. Provided value: 0.0 Requested dtype: int64
[self.model_out] + [
tf.cast(v, tf.float32) for (k, v) in loss_inputs
] + [tf.cast(t, tf.float32) for t in graph_tensors],
# Declared output dtypes: float32 for the loss, then the dtype of each
# existing stats fetch tensor.
Tout=[tf.float32] + [v.dtype for (k, v) in stat_items])
# Re-key the outputs after the loss by their stats names.
stats = {
k: stat_tensor
for (stat_tensor, (k, v)) in zip(eager_out[1:], stat_items)
}
return eager_out[0], stats
@@ -41,7 +41,7 @@ def build_tf_policy(name,
This means that you can e.g., depend on any policy attributes created in
the running of `loss_fn` in later functions such as `stats_fn`.
In eager mode (experimental), the following functions will be run
In eager mode (to be implemented), the following functions will be run
repeatedly on each eager execution: loss_fn, stats_fn
This means that these functions should not define any variables internally,