mirror of
https://github.com/wassname/ray.git
synced 2026-06-30 19:08:52 +08:00
[rllib] Remove experimental eager support
This commit is contained in:
@@ -41,9 +41,8 @@ def actor_critic_loss(policy, batch_tensors):
|
||||
policy.loss = A3CLoss(
|
||||
policy.action_dist, batch_tensors[SampleBatch.ACTIONS],
|
||||
batch_tensors[Postprocessing.ADVANTAGES],
|
||||
batch_tensors[Postprocessing.VALUE_TARGETS],
|
||||
policy.convert_to_eager(policy.vf), policy.config["vf_loss_coeff"],
|
||||
policy.config["entropy_coeff"])
|
||||
batch_tensors[Postprocessing.VALUE_TARGETS], policy.vf,
|
||||
policy.config["vf_loss_coeff"], policy.config["entropy_coeff"])
|
||||
return policy.loss.total_loss
|
||||
|
||||
|
||||
@@ -91,11 +90,10 @@ class ValueNetworkMixin(object):
|
||||
|
||||
def stats(policy, batch_tensors):
|
||||
return {
|
||||
"cur_lr": tf.cast(policy.convert_to_eager(policy.cur_lr), tf.float64),
|
||||
"cur_lr": tf.cast(policy.cur_lr, tf.float64),
|
||||
"policy_loss": policy.loss.pi_loss,
|
||||
"policy_entropy": policy.loss.entropy,
|
||||
"var_gnorm": tf.global_norm(
|
||||
[policy.convert_to_eager(x) for x in policy.var_list]),
|
||||
"var_gnorm": tf.global_norm([x for x in policy.var_list]),
|
||||
"vf_loss": policy.loss.vf_loss,
|
||||
}
|
||||
|
||||
|
||||
@@ -107,9 +107,8 @@ class PPOLoss(object):
|
||||
|
||||
def ppo_surrogate_loss(policy, batch_tensors):
|
||||
if policy.state_in:
|
||||
max_seq_len = tf.reduce_max(policy.convert_to_eager(policy.seq_lens))
|
||||
mask = tf.sequence_mask(
|
||||
policy.convert_to_eager(policy.seq_lens), max_seq_len)
|
||||
max_seq_len = tf.reduce_max(policy.seq_lens)
|
||||
mask = tf.sequence_mask(policy.seq_lens, max_seq_len)
|
||||
mask = tf.reshape(mask, [-1])
|
||||
else:
|
||||
mask = tf.ones_like(
|
||||
@@ -123,10 +122,10 @@ def ppo_surrogate_loss(policy, batch_tensors):
|
||||
batch_tensors[BEHAVIOUR_LOGITS],
|
||||
batch_tensors[SampleBatch.VF_PREDS],
|
||||
policy.action_dist,
|
||||
policy.convert_to_eager(policy.value_function),
|
||||
policy.convert_to_eager(policy.kl_coeff),
|
||||
policy.value_function,
|
||||
policy.kl_coeff,
|
||||
mask,
|
||||
entropy_coeff=policy.convert_to_eager(policy.entropy_coeff),
|
||||
entropy_coeff=policy.entropy_coeff,
|
||||
clip_param=policy.config["clip_param"],
|
||||
vf_clip_param=policy.config["vf_clip_param"],
|
||||
vf_loss_coeff=policy.config["vf_loss_coeff"],
|
||||
@@ -137,19 +136,17 @@ def ppo_surrogate_loss(policy, batch_tensors):
|
||||
|
||||
def kl_and_loss_stats(policy, batch_tensors):
|
||||
return {
|
||||
"cur_kl_coeff": tf.cast(
|
||||
policy.convert_to_eager(policy.kl_coeff), tf.float64),
|
||||
"cur_lr": tf.cast(policy.convert_to_eager(policy.cur_lr), tf.float64),
|
||||
"cur_kl_coeff": tf.cast(policy.kl_coeff, tf.float64),
|
||||
"cur_lr": tf.cast(policy.cur_lr, tf.float64),
|
||||
"total_loss": policy.loss_obj.loss,
|
||||
"policy_loss": policy.loss_obj.mean_policy_loss,
|
||||
"vf_loss": policy.loss_obj.mean_vf_loss,
|
||||
"vf_explained_var": explained_variance(
|
||||
batch_tensors[Postprocessing.VALUE_TARGETS],
|
||||
policy.convert_to_eager(policy.value_function)),
|
||||
policy.value_function),
|
||||
"kl": policy.loss_obj.mean_kl,
|
||||
"entropy": policy.loss_obj.mean_entropy,
|
||||
"entropy_coeff": tf.cast(
|
||||
policy.convert_to_eager(policy.entropy_coeff), tf.float64),
|
||||
"entropy_coeff": tf.cast(policy.entropy_coeff, tf.float64),
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -67,9 +67,7 @@ COMMON_CONFIG = {
|
||||
},
|
||||
# Whether to attempt to continue training if a worker crashes.
|
||||
"ignore_worker_failures": False,
|
||||
# Execute TF loss functions in eager mode. This is currently experimental
|
||||
# and only really works with the basic PG algorithm.
|
||||
"use_eager": False,
|
||||
# Log system resource metrics to results.
|
||||
"log_sys_usage": True,
|
||||
|
||||
# === Policy ===
|
||||
|
||||
@@ -51,11 +51,6 @@ def policy_gradient_loss(policy, batch_tensors):
|
||||
|
||||
Here `compute_penalty` prints the actions and rewards for debugging, and
|
||||
also computes a (dummy) penalty term to add to the loss.
|
||||
|
||||
Alternatively, you can set config["use_eager"] = True, which will try to
|
||||
automatically eagerify the entire loss function. However, this only works
|
||||
if your loss doesn't reference any non-eager tensors. It also won't work
|
||||
with the multi-GPU optimizer used by PPO.
|
||||
"""
|
||||
|
||||
def compute_penalty(actions, rewards):
|
||||
|
||||
@@ -23,9 +23,6 @@ logger = logging.getLogger(__name__)
|
||||
class DynamicTFPolicy(TFPolicy):
|
||||
"""A TFPolicy that auto-defines placeholders dynamically at runtime.
|
||||
|
||||
This class also supports eager execution if config["use_eager"] is True.
|
||||
Eager execution is implemented using a py_function op inside graph mode.
|
||||
|
||||
Initialization of this class occurs in two phases.
|
||||
* Phase 1: the model is created and model variables are initialized.
|
||||
* Phase 2: a fake batch of data is created, sent to the trajectory
|
||||
@@ -33,9 +30,7 @@ class DynamicTFPolicy(TFPolicy):
|
||||
function. The loss and stats functions are initialized with these
|
||||
placeholders.
|
||||
|
||||
Initialization defines the static graph. When using eager execution, a
|
||||
corresponding imperative py_function is also generated as an embedded op
|
||||
inside the static graph.
|
||||
Initialization defines the static graph.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
@@ -198,8 +193,6 @@ class DynamicTFPolicy(TFPolicy):
|
||||
batch_divisibility_req=batch_divisibility_req)
|
||||
|
||||
# Phase 2 init
|
||||
self._needs_eager_conversion = set()
|
||||
self._eager_tensors = {}
|
||||
before_loss_init(self, obs_space, action_space, config)
|
||||
if not existing_inputs:
|
||||
self._initialize_loss()
|
||||
@@ -211,17 +204,6 @@ class DynamicTFPolicy(TFPolicy):
|
||||
"""
|
||||
return self.input_dict
|
||||
|
||||
def convert_to_eager(self, tensor):
|
||||
"""Convert a graph tensor accessed in the loss to an eager tensor.
|
||||
|
||||
Experimental.
|
||||
"""
|
||||
if tf.executing_eagerly():
|
||||
return self._eager_tensors[tensor]
|
||||
else:
|
||||
self._needs_eager_conversion.add(tensor)
|
||||
return tensor
|
||||
|
||||
@override(TFPolicy)
|
||||
def copy(self, existing_inputs):
|
||||
"""Creates a copy of self using existing input placeholders."""
|
||||
@@ -260,10 +242,6 @@ class DynamicTFPolicy(TFPolicy):
|
||||
loss_inputs = [(k, existing_inputs[i])
|
||||
for i, (k, _) in enumerate(self._loss_inputs)]
|
||||
|
||||
if self.config["use_eager"]:
|
||||
loss, new_stats = instance._gen_eager_loss_op(loss_inputs)
|
||||
instance._stats_fetches = new_stats
|
||||
|
||||
TFPolicy._initialize_loss(instance, loss, loss_inputs)
|
||||
if instance._grad_stats_fn:
|
||||
instance._stats_fetches.update(
|
||||
@@ -348,14 +326,6 @@ class DynamicTFPolicy(TFPolicy):
|
||||
for k in sorted(batch_tensors.accessed_keys):
|
||||
loss_inputs.append((k, batch_tensors[k]))
|
||||
|
||||
# XXX experimental support for automatically eagerifying the loss.
|
||||
# The main limitation right now is that TF doesn't support mixing eager
|
||||
# and non-eager tensors, so losses that read non-eager tensors through
|
||||
# `policy` need to use `policy.convert_to_eager(tensor)`.
|
||||
if self.config["use_eager"]:
|
||||
loss, new_stats = self._gen_eager_loss_op(loss_inputs)
|
||||
self._stats_fetches = new_stats
|
||||
|
||||
TFPolicy._initialize_loss(self, loss, loss_inputs)
|
||||
if self._grad_stats_fn:
|
||||
self._stats_fetches.update(self._grad_stats_fn(self, self._grads))
|
||||
@@ -368,36 +338,3 @@ class DynamicTFPolicy(TFPolicy):
|
||||
if self._update_ops_fn:
|
||||
self._update_ops = self._update_ops_fn(self)
|
||||
return loss
|
||||
|
||||
def _gen_eager_loss_op(self, loss_inputs):
|
||||
graph_tensors = list(self._needs_eager_conversion)
|
||||
stat_items = list(self._stats_fetches.items())
|
||||
|
||||
def gen_loss(model_outputs, *args):
|
||||
# fill in the batch tensor dict with eager ensors
|
||||
eager_inputs = dict(
|
||||
zip([k for (k, v) in loss_inputs], args[:len(loss_inputs)]))
|
||||
# fill in the eager versions of all accessed graph tensors
|
||||
self._eager_tensors = dict(
|
||||
zip(graph_tensors, args[len(loss_inputs):]))
|
||||
# patch the action dist to use eager mode tensors
|
||||
self.action_dist.inputs = model_outputs
|
||||
loss = self._loss_fn(self, eager_inputs)
|
||||
if self._stats_fn:
|
||||
stats = self._stats_fn(self, eager_inputs)
|
||||
return [loss] + [stats[k] for (k, v) in stat_items]
|
||||
|
||||
eager_out = tf.py_function(
|
||||
gen_loss,
|
||||
# cast works around TypeError: Cannot convert provided value
|
||||
# to EagerTensor. Provided value: 0.0 Requested dtype: int64
|
||||
[self.model_out] + [
|
||||
tf.cast(v, tf.float32) for (k, v) in loss_inputs
|
||||
] + [tf.cast(t, tf.float32) for t in graph_tensors],
|
||||
Tout=[tf.float32] + [v.dtype for (k, v) in stat_items])
|
||||
stats = {
|
||||
k: stat_tensor
|
||||
for (stat_tensor, (k, v)) in zip(eager_out[1:], stat_items)
|
||||
}
|
||||
|
||||
return eager_out[0], stats
|
||||
|
||||
@@ -41,7 +41,7 @@ def build_tf_policy(name,
|
||||
This means that you can e.g., depend on any policy attributes created in
|
||||
the running of `loss_fn` in later functions such as `stats_fn`.
|
||||
|
||||
In eager mode (experimental), the following functions will be run
|
||||
In eager mode (to be implemented), the following functions will be run
|
||||
repeatedly on each eager execution: loss_fn, stats_fn
|
||||
|
||||
This means that these functions should not define any variables internally,
|
||||
|
||||
Reference in New Issue
Block a user