mirror of
https://github.com/wassname/ray.git
synced 2026-07-05 05:05:21 +08:00
[RLlib] Minor cleanup in preparation to tf2.x support. (#9130)
* WIP. * Fixes. * LINT. * Fixes. * Fixes and LINT. * WIP.
This commit is contained in:
@@ -42,8 +42,8 @@ class TensorFlowVariables:
|
||||
Args:
|
||||
output (tf.Operation, List[tf.Operation]): The tensorflow
|
||||
operation to extract all variables from.
|
||||
sess (tf.Session): Session used for running the get and set
|
||||
methods.
|
||||
sess (Optional[tf.Session]): Optional tf.Session used for running
|
||||
the get and set methods in tf graph mode.
|
||||
input_variables (List[tf.Variable]): Variables to include in the
|
||||
list.
|
||||
"""
|
||||
|
||||
+11
-10
@@ -496,7 +496,7 @@ py_test(
|
||||
"agents/ppo/tests/test.py"] # TODO(sven): Move down once PR 6889 merged
|
||||
)
|
||||
|
||||
# DDPPO
|
||||
# PPO: DDPPO
|
||||
py_test(
|
||||
name = "test_ddppo",
|
||||
tags = ["agents_dir"],
|
||||
@@ -504,7 +504,7 @@ py_test(
|
||||
srcs = ["agents/ppo/tests/test_ddppo.py"]
|
||||
)
|
||||
|
||||
# APPO
|
||||
# PPO: APPO
|
||||
py_test(
|
||||
name = "test_appo",
|
||||
tags = ["agents_dir"],
|
||||
@@ -512,7 +512,15 @@ py_test(
|
||||
srcs = ["agents/ppo/tests/test_appo.py"]
|
||||
)
|
||||
|
||||
# SAC
|
||||
# QMixTrainer
|
||||
py_test(
|
||||
name = "test_qmix",
|
||||
tags = ["agents_dir"],
|
||||
size = "medium",
|
||||
srcs = ["agents/qmix/tests/test_qmix.py"]
|
||||
)
|
||||
|
||||
# SACTrainer
|
||||
py_test(
|
||||
name = "test_sac",
|
||||
tags = ["agents_dir"],
|
||||
@@ -1103,13 +1111,6 @@ py_test(
|
||||
srcs = ["tests/test_attention_net_learning.py"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "tests/test_avail_actions_qmix",
|
||||
tags = ["tests_dir", "tests_dir_A"],
|
||||
size = "medium",
|
||||
srcs = ["tests/test_avail_actions_qmix.py"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "tests/test_catalog",
|
||||
tags = ["tests_dir", "tests_dir_C"],
|
||||
|
||||
@@ -27,7 +27,7 @@ class A3CLoss:
|
||||
self.pi_loss = -tf.reduce_sum(log_prob * advantages)
|
||||
|
||||
delta = vf - v_target
|
||||
self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta))
|
||||
self.vf_loss = 0.5 * tf.reduce_sum(tf.math.square(delta))
|
||||
self.entropy = tf.reduce_sum(action_dist.entropy())
|
||||
self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff -
|
||||
self.entropy * entropy_coeff)
|
||||
@@ -90,14 +90,15 @@ def stats(policy, train_batch):
|
||||
"cur_lr": tf.cast(policy.cur_lr, tf.float64),
|
||||
"policy_loss": policy.loss.pi_loss,
|
||||
"policy_entropy": policy.loss.entropy,
|
||||
"var_gnorm": tf.global_norm(list(policy.model.trainable_variables())),
|
||||
"var_gnorm": tf.linalg.global_norm(
|
||||
list(policy.model.trainable_variables())),
|
||||
"vf_loss": policy.loss.vf_loss,
|
||||
}
|
||||
|
||||
|
||||
def grad_stats(policy, train_batch, grads):
|
||||
return {
|
||||
"grad_gnorm": tf.global_norm(grads),
|
||||
"grad_gnorm": tf.linalg.global_norm(grads),
|
||||
"vf_explained_var": explained_variance(
|
||||
train_batch[Postprocessing.VALUE_TARGETS],
|
||||
policy.model.value_function()),
|
||||
|
||||
@@ -157,6 +157,12 @@ DEFAULT_CONFIG = with_common_config({
|
||||
|
||||
|
||||
def validate_config(config):
|
||||
if config["model"]["custom_model"]:
|
||||
logger.warning(
|
||||
"Setting use_state_preprocessor=True since a custom model "
|
||||
"was specified.")
|
||||
config["use_state_preprocessor"] = True
|
||||
|
||||
# TODO(sven): Remove at some point.
|
||||
# Backward compatibility of noise-based exploration config.
|
||||
schedule_max_timesteps = None
|
||||
@@ -191,8 +197,7 @@ def validate_config(config):
|
||||
|
||||
if config.get("parameter_noise", DEPRECATED_VALUE) != DEPRECATED_VALUE:
|
||||
deprecation_warning("parameter_noise", "exploration_config={"
|
||||
"type=ParameterNoise"
|
||||
"}")
|
||||
"type=ParameterNoise}")
|
||||
|
||||
if config["exploration_config"]["type"] == "ParameterNoise":
|
||||
if config["batch_mode"] != "complete_episodes":
|
||||
|
||||
@@ -15,9 +15,9 @@ from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.models.tf.tf_action_dist import Deterministic
|
||||
from ray.rllib.models.torch.torch_action_dist import TorchDeterministic
|
||||
from ray.rllib.utils.annotations import override
|
||||
from ray.rllib.utils.error import UnsupportedSpaceException
|
||||
from ray.rllib.policy.tf_policy import TFPolicy
|
||||
from ray.rllib.policy.tf_policy_template import build_tf_policy
|
||||
from ray.rllib.utils.error import UnsupportedSpaceException
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
from ray.rllib.utils.tf_ops import huber_loss, minimize_and_clip, \
|
||||
make_tf_callable
|
||||
@@ -36,22 +36,6 @@ TWIN_Q_TARGET_SCOPE = "twin_target_critic"
|
||||
|
||||
|
||||
def build_ddpg_models(policy, observation_space, action_space, config):
|
||||
if config["model"]["custom_model"]:
|
||||
logger.warning(
|
||||
"Setting use_state_preprocessor=True since a custom model "
|
||||
"was specified.")
|
||||
config["use_state_preprocessor"] = True
|
||||
|
||||
if not isinstance(action_space, Box):
|
||||
raise UnsupportedSpaceException(
|
||||
"Action space {} is not supported for DDPG.".format(action_space))
|
||||
elif len(action_space.shape) > 1:
|
||||
raise UnsupportedSpaceException(
|
||||
"Action space has multiple dimensions "
|
||||
"{}. ".format(action_space.shape) +
|
||||
"Consider reshaping this into a single dimension, "
|
||||
"using a Tuple action space, or the multi-agent API.")
|
||||
|
||||
if policy.config["use_state_preprocessor"]:
|
||||
default_model = None # catalog decides
|
||||
num_outputs = 256 # arbitrary
|
||||
@@ -157,7 +141,7 @@ def ddpg_actor_critic_loss(policy, model, _, train_batch):
|
||||
if policy.config["smooth_target_policy"]:
|
||||
target_noise_clip = policy.config["target_noise_clip"]
|
||||
clipped_normal_sample = tf.clip_by_value(
|
||||
tf.random_normal(
|
||||
tf.random.normal(
|
||||
tf.shape(policy_tp1),
|
||||
stddev=policy.config["target_noise"]), -target_noise_clip,
|
||||
target_noise_clip)
|
||||
@@ -219,15 +203,17 @@ def ddpg_actor_critic_loss(policy, model, _, train_batch):
|
||||
errors = huber_loss(td_error, huber_threshold) + \
|
||||
huber_loss(twin_td_error, huber_threshold)
|
||||
else:
|
||||
errors = 0.5 * tf.square(td_error) + 0.5 * tf.square(twin_td_error)
|
||||
errors = 0.5 * tf.math.square(td_error) + \
|
||||
0.5 * tf.math.square(twin_td_error)
|
||||
else:
|
||||
td_error = q_t_selected - q_t_selected_target
|
||||
if use_huber:
|
||||
errors = huber_loss(td_error, huber_threshold)
|
||||
else:
|
||||
errors = 0.5 * tf.square(td_error)
|
||||
errors = 0.5 * tf.math.square(td_error)
|
||||
|
||||
critic_loss = tf.reduce_mean(train_batch[PRIO_WEIGHTS] * errors)
|
||||
critic_loss = tf.reduce_mean(
|
||||
tf.cast(train_batch[PRIO_WEIGHTS], tf.float32) * errors)
|
||||
actor_loss = -tf.reduce_mean(q_t_det_policy)
|
||||
|
||||
# Add l2-regularization if required.
|
||||
@@ -417,6 +403,19 @@ def setup_late_mixins(policy, obs_space, action_space, config):
|
||||
TargetNetworkMixin.__init__(policy, config)
|
||||
|
||||
|
||||
def validate_spaces(pid, observation_space, action_space, config):
|
||||
if not isinstance(action_space, Box):
|
||||
raise UnsupportedSpaceException(
|
||||
"Action space ({}) of {} is not supported for "
|
||||
"DDPG.".format(action_space, pid))
|
||||
elif len(action_space.shape) > 1:
|
||||
raise UnsupportedSpaceException(
|
||||
"Action space ({}) of {} has multiple dimensions "
|
||||
"{}. ".format(action_space, pid, action_space.shape) +
|
||||
"Consider reshaping this into a single dimension, "
|
||||
"using a Tuple action space, or the multi-agent API.")
|
||||
|
||||
|
||||
DDPGTFPolicy = build_tf_policy(
|
||||
name="DDPGTFPolicy",
|
||||
get_default_config=lambda: ray.rllib.agents.ddpg.ddpg.DEFAULT_CONFIG,
|
||||
@@ -429,6 +428,7 @@ DDPGTFPolicy = build_tf_policy(
|
||||
gradients_fn=gradients_fn,
|
||||
apply_gradients_fn=build_apply_op,
|
||||
extra_learn_fetches_fn=lambda policy: {"td_error": policy.td_error},
|
||||
validate_spaces=validate_spaces,
|
||||
before_init=before_init_fn,
|
||||
before_loss_init=setup_mid_mixins,
|
||||
after_init=setup_late_mixins,
|
||||
|
||||
@@ -2,7 +2,7 @@ import logging
|
||||
|
||||
import ray
|
||||
from ray.rllib.agents.ddpg.ddpg_tf_policy import build_ddpg_models, \
|
||||
get_distribution_inputs_and_class
|
||||
get_distribution_inputs_and_class, validate_spaces
|
||||
from ray.rllib.agents.dqn.dqn_tf_policy import postprocess_nstep_and_prio, \
|
||||
PRIO_WEIGHTS
|
||||
from ray.rllib.models.torch.torch_action_dist import TorchDeterministic
|
||||
@@ -269,6 +269,7 @@ DDPGTorchPolicy = build_torch_policy(
|
||||
postprocess_fn=postprocess_nstep_and_prio,
|
||||
extra_grad_process_fn=gradients_fn,
|
||||
optimizer_fn=make_ddpg_optimizers,
|
||||
validate_spaces=validate_spaces,
|
||||
before_init=before_init_fn,
|
||||
after_init=setup_late_mixins,
|
||||
action_distribution_fn=get_distribution_inputs_and_class,
|
||||
|
||||
@@ -234,8 +234,8 @@ class DistributionalQTFModel(TFModelV2):
|
||||
"""
|
||||
in_size = int(action_in.shape[1])
|
||||
|
||||
epsilon_in = tf.random_normal(shape=[in_size])
|
||||
epsilon_out = tf.random_normal(shape=[out_size])
|
||||
epsilon_in = tf.random.normal(shape=[in_size])
|
||||
epsilon_out = tf.random.normal(shape=[out_size])
|
||||
epsilon_in = self._f_epsilon(epsilon_in)
|
||||
epsilon_out = self._f_epsilon(epsilon_out)
|
||||
epsilon_w = tf.matmul(
|
||||
@@ -279,4 +279,4 @@ class DistributionalQTFModel(TFModelV2):
|
||||
return tf.nn.relu(action_activation)
|
||||
|
||||
def _f_epsilon(self, x):
|
||||
return tf.sign(x) * tf.sqrt(tf.abs(x))
|
||||
return tf.math.sign(x) * tf.math.sqrt(tf.math.abs(x))
|
||||
|
||||
@@ -54,11 +54,11 @@ class QLoss:
|
||||
r_tau = tf.clip_by_value(r_tau, v_min, v_max)
|
||||
b = (r_tau - v_min) / ((v_max - v_min) / float(num_atoms - 1))
|
||||
lb = tf.floor(b)
|
||||
ub = tf.ceil(b)
|
||||
ub = tf.math.ceil(b)
|
||||
# indispensable judgement which is missed in most implementations
|
||||
# when b happens to be an integer, lb == ub, so pr_j(s', a*) will
|
||||
# be discarded because (ub-b) == (b-lb) == 0
|
||||
floor_equal_ceil = tf.to_float(tf.less(ub - lb, 0.5))
|
||||
floor_equal_ceil = tf.cast(tf.less(ub - lb, 0.5), tf.float32)
|
||||
|
||||
l_project = tf.one_hot(
|
||||
tf.cast(lb, dtype=tf.int32),
|
||||
|
||||
@@ -53,7 +53,7 @@ def build_q_losses(policy, model, dist_class, train_batch):
|
||||
is_training=True)
|
||||
|
||||
# q scores for actions which we know were selected in the given state.
|
||||
one_hot_selection = F.one_hot(train_batch[SampleBatch.ACTIONS],
|
||||
one_hot_selection = F.one_hot(train_batch[SampleBatch.ACTIONS].long(),
|
||||
policy.action_space.n)
|
||||
q_t_selected = torch.sum(q_t * one_hot_selection, 1)
|
||||
|
||||
|
||||
@@ -50,7 +50,7 @@ class DYNATorchModel(TorchModelV2, nn.Module):
|
||||
|
||||
# One-hot the actions.
|
||||
actions_flat = nn.functional.one_hot(
|
||||
actions, num_classes=self.action_space.n).float()
|
||||
actions.long(), num_classes=self.action_space.n).float()
|
||||
# Push through our underlying Model.
|
||||
next_obs, _ = self.forward({
|
||||
"obs_flat": torch.cat([observations, actions_flat], -1)
|
||||
|
||||
@@ -80,7 +80,7 @@ class VTraceLoss:
|
||||
behaviour_policy_logits=behaviour_logits,
|
||||
target_policy_logits=target_logits,
|
||||
actions=tf.unstack(actions, axis=2),
|
||||
discounts=tf.to_float(~dones) * discount,
|
||||
discounts=tf.cast(~dones, tf.float32) * discount,
|
||||
rewards=rewards,
|
||||
values=values,
|
||||
bootstrap_value=bootstrap_value,
|
||||
@@ -98,7 +98,7 @@ class VTraceLoss:
|
||||
|
||||
# The baseline loss.
|
||||
delta = tf.boolean_mask(values - self.vtrace_returns.vs, valid_mask)
|
||||
self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta))
|
||||
self.vf_loss = 0.5 * tf.reduce_sum(tf.math.square(delta))
|
||||
|
||||
# The entropy loss.
|
||||
self.entropy = tf.reduce_sum(
|
||||
@@ -228,7 +228,7 @@ def stats(policy, train_batch):
|
||||
"policy_loss": policy.loss.pi_loss,
|
||||
"entropy": policy.loss.entropy,
|
||||
"entropy_coeff": tf.cast(policy.entropy_coeff, tf.float64),
|
||||
"var_gnorm": tf.global_norm(policy.model.trainable_variables()),
|
||||
"var_gnorm": tf.linalg.global_norm(policy.model.trainable_variables()),
|
||||
"vf_loss": policy.loss.vf_loss,
|
||||
"vf_explained_var": explained_variance(
|
||||
tf.reshape(policy.loss.value_targets, [-1]),
|
||||
@@ -238,7 +238,7 @@ def stats(policy, train_batch):
|
||||
|
||||
def grad_stats(policy, train_batch, grads):
|
||||
return {
|
||||
"grad_gnorm": tf.global_norm(grads),
|
||||
"grad_gnorm": tf.linalg.global_norm(grads),
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -28,7 +28,7 @@ class ValueNetworkMixin:
|
||||
class ValueLoss:
|
||||
def __init__(self, state_values, cumulative_rewards):
|
||||
self.loss = 0.5 * tf.reduce_mean(
|
||||
tf.square(state_values - cumulative_rewards))
|
||||
tf.math.square(state_values - cumulative_rewards))
|
||||
|
||||
|
||||
class ReweightedImitationLoss:
|
||||
@@ -39,13 +39,13 @@ class ReweightedImitationLoss:
|
||||
# update averaged advantage norm
|
||||
update_adv_norm = tf.assign_add(
|
||||
ref=policy._ma_adv_norm,
|
||||
value=1e-6 *
|
||||
(tf.reduce_mean(tf.square(adv)) - policy._ma_adv_norm))
|
||||
value=1e-6 * (
|
||||
tf.reduce_mean(tf.math.square(adv)) - policy._ma_adv_norm))
|
||||
|
||||
# exponentially weighted advantages
|
||||
with tf.control_dependencies([update_adv_norm]):
|
||||
exp_advs = tf.exp(
|
||||
beta * tf.divide(adv, 1e-8 + tf.sqrt(policy._ma_adv_norm)))
|
||||
exp_advs = tf.math.exp(beta * tf.math.divide(
|
||||
adv, 1e-8 + tf.math.sqrt(policy._ma_adv_norm)))
|
||||
|
||||
# log\pi_\theta(a|s)
|
||||
logprobs = action_dist.logp(actions)
|
||||
|
||||
@@ -78,7 +78,7 @@ class PPOSurrogateLoss:
|
||||
# The baseline loss
|
||||
delta = values - value_targets
|
||||
self.value_targets = value_targets
|
||||
self.vf_loss = 0.5 * reduce_mean_valid(tf.square(delta))
|
||||
self.vf_loss = 0.5 * reduce_mean_valid(tf.math.square(delta))
|
||||
|
||||
# The entropy loss
|
||||
self.entropy = reduce_mean_valid(actions_entropy)
|
||||
@@ -159,7 +159,7 @@ class VTraceSurrogateLoss:
|
||||
behaviour_policy_logits=behaviour_logits,
|
||||
target_policy_logits=old_policy_behaviour_logits,
|
||||
actions=tf.unstack(actions, axis=2),
|
||||
discounts=tf.to_float(~dones) * discount,
|
||||
discounts=tf.cast(~dones, tf.float32) * discount,
|
||||
rewards=rewards,
|
||||
values=values,
|
||||
bootstrap_value=bootstrap_value,
|
||||
@@ -185,7 +185,7 @@ class VTraceSurrogateLoss:
|
||||
# The baseline loss
|
||||
delta = values - self.vtrace_returns.vs
|
||||
self.value_targets = self.vtrace_returns.vs
|
||||
self.vf_loss = 0.5 * reduce_mean_valid(tf.square(delta))
|
||||
self.vf_loss = 0.5 * reduce_mean_valid(tf.math.square(delta))
|
||||
|
||||
# The entropy loss
|
||||
self.entropy = reduce_mean_valid(actions_entropy)
|
||||
@@ -350,7 +350,7 @@ def stats(policy, train_batch):
|
||||
"cur_lr": tf.cast(policy.cur_lr, tf.float64),
|
||||
"policy_loss": policy.loss.pi_loss,
|
||||
"entropy": policy.loss.entropy,
|
||||
"var_gnorm": tf.global_norm(policy.model.trainable_variables()),
|
||||
"var_gnorm": tf.linalg.global_norm(policy.model.trainable_variables()),
|
||||
"vf_loss": policy.loss.vf_loss,
|
||||
"vf_explained_var": explained_variance(
|
||||
tf.reshape(policy.loss.value_targets, [-1]),
|
||||
|
||||
@@ -89,10 +89,10 @@ class PPOLoss:
|
||||
self.mean_policy_loss = reduce_mean_valid(-surrogate_loss)
|
||||
|
||||
if use_gae:
|
||||
vf_loss1 = tf.square(value_fn - value_targets)
|
||||
vf_loss1 = tf.math.square(value_fn - value_targets)
|
||||
vf_clipped = vf_preds + tf.clip_by_value(
|
||||
value_fn - vf_preds, -vf_clip_param, vf_clip_param)
|
||||
vf_loss2 = tf.square(vf_clipped - value_targets)
|
||||
vf_loss2 = tf.math.square(vf_clipped - value_targets)
|
||||
vf_loss = tf.maximum(vf_loss1, vf_loss2)
|
||||
self.mean_vf_loss = reduce_mean_valid(vf_loss)
|
||||
loss = reduce_mean_valid(
|
||||
|
||||
+11
-1
@@ -1,8 +1,12 @@
|
||||
import logging
|
||||
|
||||
from ray.rllib.agents.trainer import with_common_config
|
||||
from ray.rllib.agents.dqn.dqn import GenericOffPolicyTrainer
|
||||
from ray.rllib.agents.sac.sac_tf_policy import SACTFPolicy
|
||||
from ray.rllib.utils.deprecation import deprecation_warning, DEPRECATED_VALUE
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
OPTIMIZER_SHARED_CONFIGS = [
|
||||
"buffer_size", "prioritized_replay", "prioritized_replay_alpha",
|
||||
"prioritized_replay_beta", "prioritized_replay_eps",
|
||||
@@ -131,6 +135,12 @@ def get_policy_class(config):
|
||||
|
||||
|
||||
def validate_config(config):
|
||||
if config["model"].get("custom_model"):
|
||||
logger.warning(
|
||||
"Setting use_state_preprocessor=True since a custom model "
|
||||
"was specified.")
|
||||
config["use_state_preprocessor"] = True
|
||||
|
||||
if config.get("grad_norm_clipping", DEPRECATED_VALUE) != DEPRECATED_VALUE:
|
||||
deprecation_warning("grad_norm_clipping", "grad_clip")
|
||||
config["grad_clip"] = config.pop("grad_norm_clipping")
|
||||
@@ -154,7 +164,7 @@ def validate_config(config):
|
||||
SACTrainer = GenericOffPolicyTrainer.with_updates(
|
||||
name="SAC",
|
||||
default_config=DEFAULT_CONFIG,
|
||||
validate_config=validate_config,
|
||||
default_policy=SACTFPolicy,
|
||||
get_policy_class=get_policy_class,
|
||||
validate_config=validate_config,
|
||||
)
|
||||
|
||||
@@ -24,21 +24,6 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def build_sac_model(policy, obs_space, action_space, config):
|
||||
if config["model"].get("custom_model"):
|
||||
logger.warning(
|
||||
"Setting use_state_preprocessor=True since a custom model "
|
||||
"was specified.")
|
||||
config["use_state_preprocessor"] = True
|
||||
if not isinstance(action_space, (Box, Discrete)):
|
||||
raise UnsupportedSpaceException(
|
||||
"Action space {} is not supported for SAC.".format(action_space))
|
||||
if isinstance(action_space, Box) and len(action_space.shape) > 1:
|
||||
raise UnsupportedSpaceException(
|
||||
"Action space has multiple dimensions "
|
||||
"{}. ".format(action_space.shape) +
|
||||
"Consider reshaping this into a single dimension, "
|
||||
"using a Tuple action space, or the multi-agent API.")
|
||||
|
||||
# 2 cases:
|
||||
# 1) with separate state-preprocessor (before obs+action concat).
|
||||
# 2) no separate state-preprocessor: concat obs+actions right away.
|
||||
@@ -425,6 +410,19 @@ def setup_late_mixins(policy, obs_space, action_space, config):
|
||||
TargetNetworkMixin.__init__(policy, config)
|
||||
|
||||
|
||||
def validate_spaces(pid, observation_space, action_space, config):
|
||||
if not isinstance(action_space, (Box, Discrete)):
|
||||
raise UnsupportedSpaceException(
|
||||
"Action space ({}) of {} is not supported for "
|
||||
"SAC.".format(action_space, pid))
|
||||
if isinstance(action_space, Box) and len(action_space.shape) > 1:
|
||||
raise UnsupportedSpaceException(
|
||||
"Action space ({}) of {} has multiple dimensions "
|
||||
"{}. ".format(action_space, pid, action_space.shape) +
|
||||
"Consider reshaping this into a single dimension, "
|
||||
"using a Tuple action space, or the multi-agent API.")
|
||||
|
||||
|
||||
SACTFPolicy = build_tf_policy(
|
||||
name="SACTFPolicy",
|
||||
get_default_config=lambda: ray.rllib.agents.sac.sac.DEFAULT_CONFIG,
|
||||
@@ -439,6 +437,7 @@ SACTFPolicy = build_tf_policy(
|
||||
mixins=[
|
||||
TargetNetworkMixin, ActorCriticOptimizerMixin, ComputeTDErrorMixin
|
||||
],
|
||||
validate_spaces=validate_spaces,
|
||||
before_init=setup_early_mixins,
|
||||
before_loss_init=setup_mid_mixins,
|
||||
after_init=setup_late_mixins,
|
||||
|
||||
@@ -5,7 +5,7 @@ import ray
|
||||
import ray.experimental.tf_utils
|
||||
from ray.rllib.agents.a3c.a3c_torch_policy import apply_grad_clipping
|
||||
from ray.rllib.agents.sac.sac_tf_policy import build_sac_model, \
|
||||
postprocess_trajectory
|
||||
postprocess_trajectory, validate_spaces
|
||||
from ray.rllib.agents.dqn.dqn_tf_policy import PRIO_WEIGHTS
|
||||
from ray.rllib.policy.sample_batch import SampleBatch
|
||||
from ray.rllib.policy.torch_policy_template import build_torch_policy
|
||||
@@ -336,6 +336,7 @@ SACTorchPolicy = build_torch_policy(
|
||||
postprocess_fn=postprocess_trajectory,
|
||||
extra_grad_process_fn=apply_grad_clipping,
|
||||
optimizer_fn=optimizer_fn,
|
||||
validate_spaces=validate_spaces,
|
||||
after_init=setup_late_mixins,
|
||||
make_model_and_action_dist=build_sac_model_and_action_dist,
|
||||
mixins=[TargetNetworkMixin, ComputeTDErrorMixin],
|
||||
|
||||
@@ -68,6 +68,7 @@ class TestSAC(unittest.TestCase):
|
||||
results = trainer.train()
|
||||
print(results)
|
||||
check_compute_single_action(trainer)
|
||||
trainer.stop()
|
||||
|
||||
def test_sac_loss_function(self):
|
||||
"""Tests SAC loss function results across all frameworks."""
|
||||
@@ -164,7 +165,7 @@ class TestSAC(unittest.TestCase):
|
||||
|
||||
# Set all weights (of all nets) to fixed values.
|
||||
if weights_dict is None:
|
||||
assert fw == "tf" # Start with the tf vars-dict.
|
||||
assert fw in ["tf", "tfe"] # Start with the tf vars-dict.
|
||||
weights_dict = policy.get_weights()
|
||||
else:
|
||||
assert fw == "torch" # Then transfer that to torch Model.
|
||||
@@ -176,7 +177,7 @@ class TestSAC(unittest.TestCase):
|
||||
if fw == "tf":
|
||||
log_alpha = weights_dict["default_policy/log_alpha"]
|
||||
elif fw == "torch":
|
||||
# Actually convert to torch tensors.
|
||||
# Actually convert to torch tensors (by accessing everything).
|
||||
input_ = policy._lazy_tensor_dict(input_)
|
||||
input_ = {k: input_[k] for k in input_.keys()}
|
||||
log_alpha = policy.model.log_alpha.detach().numpy()[0]
|
||||
|
||||
+11
-13
@@ -7,7 +7,7 @@ import os
|
||||
import pickle
|
||||
import time
|
||||
import tempfile
|
||||
from typing import Callable, List, Dict, Union, Any
|
||||
from typing import Callable, List, Dict, Union
|
||||
|
||||
import ray
|
||||
from ray.exceptions import RayError
|
||||
@@ -701,9 +701,6 @@ class Trainer(Trainable):
|
||||
config (dict): The Trainer's config.
|
||||
num_workers (int): Number of remote rollout workers to create.
|
||||
0 for local only.
|
||||
remote_config_updates (Optional[List[dict]]): A list of config
|
||||
dicts to update `config` with for each Worker (len must be
|
||||
same as `num_workers`).
|
||||
|
||||
Returns:
|
||||
WorkerSet: The created WorkerSet.
|
||||
@@ -778,9 +775,9 @@ class Trainer(Trainable):
|
||||
@PublicAPI
|
||||
def compute_action(self,
|
||||
observation: TensorStructType,
|
||||
state: List[Any] = None,
|
||||
state: List[TensorStructType] = None,
|
||||
prev_action: TensorStructType = None,
|
||||
prev_reward: int = None,
|
||||
prev_reward: float = None,
|
||||
info: EnvInfoDict = None,
|
||||
policy_id: PolicyID = DEFAULT_POLICY_ID,
|
||||
full_fetch: bool = False,
|
||||
@@ -791,16 +788,17 @@ class Trainer(Trainable):
|
||||
self.get_policy(policy_id) and call compute_actions() on it directly.
|
||||
|
||||
Arguments:
|
||||
observation (obj): observation from the environment.
|
||||
state (list): RNN hidden state, if any. If state is not None,
|
||||
then all of compute_single_action(...) is returned
|
||||
observation (TensorStructType): Observation from the environment.
|
||||
state (List[TensorStructType]): RNN hidden state, if any. If state
|
||||
is not None, then all of compute_single_action(...) is returned
|
||||
(computed action, rnn state(s), logits dictionary).
|
||||
Otherwise compute_single_action(...)[0] is returned
|
||||
(computed action).
|
||||
prev_action (obj): previous action value, if any
|
||||
prev_reward (int): previous reward, if any
|
||||
info (dict): info object, if any
|
||||
policy_id (str): Policy to query (only applies to multi-agent).
|
||||
prev_action (TensorStructType): Previous action value, if any.
|
||||
prev_reward (float): Previous reward, if any.
|
||||
info (EnvInfoDict): Info object, if any.
|
||||
policy_id (PolicyID): Policy to query (only applies to
|
||||
multi-agent).
|
||||
full_fetch (bool): Whether to return extra action fetch results.
|
||||
This is always set to True if RNN state is specified.
|
||||
explore (bool): Whether to pick an exploitation or exploration
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from typing import Callable, Optional, List, Iterable
|
||||
import logging
|
||||
import time
|
||||
from typing import Callable, Optional, List, Iterable
|
||||
|
||||
from ray.rllib.agents.trainer import Trainer, COMMON_CONFIG
|
||||
from ray.rllib.evaluation.worker_set import WorkerSet
|
||||
@@ -34,20 +34,21 @@ def default_execution_plan(workers: WorkerSet, config: TrainerConfigDict):
|
||||
|
||||
@DeveloperAPI
|
||||
def build_trainer(
|
||||
name: str,
|
||||
default_policy: Optional[Policy],
|
||||
default_config: TrainerConfigDict = None,
|
||||
validate_config: Callable[[TrainerConfigDict], None] = None,
|
||||
get_initial_state=None, # DEPRECATED
|
||||
get_policy_class: Callable[[TrainerConfigDict], Policy] = None,
|
||||
before_init: Callable[[Trainer], None] = None,
|
||||
make_workers=None, # DEPRECATED
|
||||
make_policy_optimizer=None, # DEPRECATED
|
||||
after_init: Callable[[Trainer], None] = None,
|
||||
before_train_step=None, # DEPRECATED
|
||||
after_optimizer_step=None, # DEPRECATED
|
||||
after_train_result=None, # DEPRECATED
|
||||
collect_metrics_fn=None, # DEPRECATED
|
||||
name: str,
|
||||
default_policy: Optional[Policy],
|
||||
*,
|
||||
default_config: TrainerConfigDict = None,
|
||||
validate_config: Callable[[TrainerConfigDict], None] = None,
|
||||
get_initial_state=None, # DEPRECATED
|
||||
get_policy_class: Callable[[TrainerConfigDict], Policy] = None,
|
||||
before_init: Callable[[Trainer], None] = None,
|
||||
make_workers=None, # DEPRECATED
|
||||
make_policy_optimizer=None, # DEPRECATED
|
||||
after_init: Callable[[Trainer], None] = None,
|
||||
before_train_step=None, # DEPRECATED
|
||||
after_optimizer_step=None, # DEPRECATED
|
||||
after_train_result=None, # DEPRECATED
|
||||
collect_metrics_fn=None, # DEPRECATED
|
||||
before_evaluate_fn: Callable[[Trainer], None] = None,
|
||||
mixins: List[type] = None,
|
||||
execution_plan: Callable[[WorkerSet, TrainerConfigDict], Iterable[
|
||||
@@ -64,19 +65,20 @@ def build_trainer(
|
||||
default_policy (cls): the default Policy class to use
|
||||
default_config (dict): The default config dict of the algorithm,
|
||||
otherwise uses the Trainer default config.
|
||||
validate_config (func): optional callback that checks a given config
|
||||
for correctness. It may mutate the config as needed.
|
||||
get_policy_class (func): optional callback that takes a config and
|
||||
returns the policy class to override the default with
|
||||
before_init (func): optional function to run at the start of trainer
|
||||
init that takes the trainer instance as argument
|
||||
after_init (func): optional function to run at the end of trainer init
|
||||
that takes the trainer instance as argument
|
||||
before_evaluate_fn (func): callback to run before evaluation. This
|
||||
takes the trainer instance as argument.
|
||||
validate_config (Optional[callable]): Optional callable that takes the
|
||||
config to check for correctness. It may mutate the config as
|
||||
needed.
|
||||
get_policy_class (Optional[callable]): Optional callable that takes a
|
||||
config and returns the policy class to override the default with.
|
||||
before_init (Optional[callable]): Optional callable to run at the start
|
||||
of trainer init that takes the trainer instance as argument.
|
||||
after_init (Optional[callable]): Optional callable to run at the end of
|
||||
trainer init that takes the trainer instance as argument.
|
||||
before_evaluate_fn (Optional[callable]): callback to run before
|
||||
evaluation. This takes the trainer instance as argument.
|
||||
mixins (list): list of any class mixins for the returned trainer class.
|
||||
These mixins will be applied in order and will have higher
|
||||
precedence than the Trainer class
|
||||
precedence than the Trainer class.
|
||||
execution_plan (func): Setup the distributed execution workflow.
|
||||
|
||||
Returns:
|
||||
|
||||
@@ -11,29 +11,29 @@ class BinaryAutoregressiveDistribution(ActionDistribution):
|
||||
"""Action distribution P(a1, a2) = P(a1) * P(a2 | a1)"""
|
||||
|
||||
def deterministic_sample(self):
|
||||
# first, sample a1
|
||||
# First, sample a1.
|
||||
a1_dist = self._a1_distribution()
|
||||
a1 = a1_dist.deterministic_sample()
|
||||
|
||||
# sample a2 conditioned on a1
|
||||
# Sample a2 conditioned on a1.
|
||||
a2_dist = self._a2_distribution(a1)
|
||||
a2 = a2_dist.deterministic_sample()
|
||||
self._action_logp = a1_dist.logp(a1) + a2_dist.logp(a2)
|
||||
|
||||
# return the action tuple
|
||||
# Return the action tuple.
|
||||
return (a1, a2)
|
||||
|
||||
def sample(self):
|
||||
# first, sample a1
|
||||
# First, sample a1.
|
||||
a1_dist = self._a1_distribution()
|
||||
a1 = a1_dist.sample()
|
||||
|
||||
# sample a2 conditioned on a1
|
||||
# Sample a2 conditioned on a1.
|
||||
a2_dist = self._a2_distribution(a1)
|
||||
a2 = a2_dist.sample()
|
||||
self._action_logp = a1_dist.logp(a1) + a2_dist.logp(a2)
|
||||
|
||||
# return the action tuple
|
||||
# Return the action tuple.
|
||||
return (a1, a2)
|
||||
|
||||
def logp(self, actions):
|
||||
@@ -81,29 +81,29 @@ class TorchBinaryAutoregressiveDistribution(TorchDistributionWrapper):
|
||||
"""Action distribution P(a1, a2) = P(a1) * P(a2 | a1)"""
|
||||
|
||||
def deterministic_sample(self):
|
||||
# first, sample a1
|
||||
# First, sample a1.
|
||||
a1_dist = self._a1_distribution()
|
||||
a1 = a1_dist.deterministic_sample()
|
||||
|
||||
# sample a2 conditioned on a1
|
||||
# Sample a2 conditioned on a1.
|
||||
a2_dist = self._a2_distribution(a1)
|
||||
a2 = a2_dist.deterministic_sample()
|
||||
self._action_logp = a1_dist.logp(a1) + a2_dist.logp(a2)
|
||||
|
||||
# return the action tuple
|
||||
# Return the action tuple.
|
||||
return (a1, a2)
|
||||
|
||||
def sample(self):
|
||||
# first, sample a1
|
||||
# First, sample a1.
|
||||
a1_dist = self._a1_distribution()
|
||||
a1 = a1_dist.sample()
|
||||
|
||||
# sample a2 conditioned on a1
|
||||
# Sample a2 conditioned on a1.
|
||||
a2_dist = self._a2_distribution(a1)
|
||||
a2 = a2_dist.sample()
|
||||
self._action_logp = a1_dist.logp(a1) + a2_dist.logp(a2)
|
||||
|
||||
# return the action tuple
|
||||
# Return the action tuple.
|
||||
return (a1, a2)
|
||||
|
||||
def logp(self, actions):
|
||||
|
||||
@@ -56,7 +56,7 @@ class ParametricActionsModel(DistributionalQTFModel):
|
||||
action_logits = tf.reduce_sum(avail_actions * intent_vector, axis=2)
|
||||
|
||||
# Mask out invalid actions (use tf.float32.min for stability)
|
||||
inf_mask = tf.maximum(tf.log(action_mask), tf.float32.min)
|
||||
inf_mask = tf.maximum(tf.math.log(action_mask), tf.float32.min)
|
||||
return action_logits + inf_mask, state
|
||||
|
||||
def value_function(self):
|
||||
|
||||
@@ -416,7 +416,8 @@ class ModelCatalog:
|
||||
name, **model_kwargs)
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
"Framework must be 'tf' or 'torch': {}".format(framework))
|
||||
"`framework` must be 'tf|tfe|torch', but is "
|
||||
"{}!".format(framework))
|
||||
|
||||
@staticmethod
|
||||
@DeveloperAPI
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
|
||||
from ray.rllib.models.tf.fcnet import FullyConnectedNetwork
|
||||
from ray.rllib.models.tf.recurrent_net import \
|
||||
RecurrentNetwork
|
||||
from ray.rllib.models.tf.recurrent_net import RecurrentNetwork
|
||||
from ray.rllib.models.tf.visionnet import VisionNetwork
|
||||
|
||||
__all__ = [
|
||||
|
||||
@@ -1,10 +1,14 @@
|
||||
from ray.rllib.models.tf.layers.gru_gate import GRUGate
|
||||
from ray.rllib.models.tf.layers.noisy_layer import NoisyLayer
|
||||
from ray.rllib.models.tf.layers.relative_multi_head_attention import \
|
||||
RelativeMultiHeadAttention
|
||||
from ray.rllib.models.tf.layers.skip_connection import SkipConnection
|
||||
from ray.rllib.models.tf.layers.multi_head_attention import MultiHeadAttention
|
||||
|
||||
__all__ = [
|
||||
"GRUGate", "RelativeMultiHeadAttention", "SkipConnection",
|
||||
"MultiHeadAttention"
|
||||
"GRUGate",
|
||||
"MultiHeadAttention",
|
||||
"NoisyLayer",
|
||||
"RelativeMultiHeadAttention",
|
||||
"SkipConnection"
|
||||
]
|
||||
|
||||
@@ -0,0 +1,105 @@
|
||||
import numpy as np
|
||||
|
||||
from ray.rllib.utils.framework import get_activation_fn, get_variable, \
|
||||
try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
|
||||
|
||||
class NoisyLayer(tf.keras.layers.Layer):
|
||||
"""A Layer that adds learnable Noise
|
||||
a common dense layer: y = w^{T}x + b
|
||||
a noisy layer: y = (w + \\epsilon_w*\\sigma_w)^{T}x +
|
||||
(b+\\epsilon_b*\\sigma_b)
|
||||
where \\epsilon are random variables sampled from factorized normal
|
||||
distributions and \\sigma are trainable variables which are expected to
|
||||
vanish along the training procedure
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
prefix,
|
||||
out_size,
|
||||
sigma0,
|
||||
activation="relu"):
|
||||
"""Initializes a NoisyLayer object.
|
||||
|
||||
Args:
|
||||
prefix (str): Name prefix for the variables created by this layer.
|
||||
out_size (int): Size of the layer's output dimension.
|
||||
sigma0 (float): Initial noise scale; sigma_b starts at sigma0 / sqrt(in_size).
|
||||
activation (str): Name of the activation function applied to the output.
|
||||
"""
|
||||
super().__init__()
|
||||
self.prefix = prefix
|
||||
self.out_size = out_size
|
||||
# TF noise generation can be unreliable on GPU
|
||||
# If generating the noise on the CPU,
|
||||
# lowering sigma0 to 0.1 may be helpful
|
||||
self.sigma0 = sigma0 # 0.5~GPU, 0.1~CPU
|
||||
self.activation = activation
|
||||
# Variables.
|
||||
self.w = None # Weight matrix.
|
||||
self.b = None # Biases.
|
||||
self.sigma_w = None # Noise for weight matrix
|
||||
self.sigma_b = None # Noise for biases.
|
||||
|
||||
def build(self, input_shape):
|
||||
in_size = int(input_shape[1])
|
||||
|
||||
self.sigma_w = get_variable(
|
||||
value=tf.keras.initializers.RandomUniform(
|
||||
minval=-1.0 / np.sqrt(float(in_size)),
|
||||
maxval=1.0 / np.sqrt(float(in_size))),
|
||||
trainable=True,
|
||||
tf_name=self.prefix + "_sigma_w",
|
||||
shape=[in_size, self.out_size],
|
||||
dtype=tf.float32
|
||||
)
|
||||
|
||||
self.sigma_b = get_variable(
|
||||
value=tf.keras.initializers.Constant(
|
||||
self.sigma0 / np.sqrt(float(in_size))),
|
||||
trainable=True,
|
||||
tf_name=self.prefix + "_sigma_b",
|
||||
shape=[self.out_size],
|
||||
dtype=tf.float32,
|
||||
)
|
||||
|
||||
self.w = get_variable(
|
||||
value=tf.keras.initializers.GlorotUniform(),
|
||||
tf_name=self.prefix + "_fc_w",
|
||||
trainable=True,
|
||||
shape=[in_size, self.out_size],
|
||||
dtype=tf.float32,
|
||||
)
|
||||
|
||||
self.b = get_variable(
|
||||
value=tf.keras.initializers.Zeros(),
|
||||
tf_name=self.prefix + "_fc_b",
|
||||
trainable=True,
|
||||
shape=[self.out_size],
|
||||
dtype=tf.float32,
|
||||
)
|
||||
|
||||
def call(self, inputs):
|
||||
in_size = int(inputs.shape[1])
|
||||
epsilon_in = tf.random.normal(shape=[in_size])
|
||||
epsilon_out = tf.random.normal(shape=[self.out_size])
|
||||
epsilon_in = self._f_epsilon(epsilon_in)
|
||||
epsilon_out = self._f_epsilon(epsilon_out)
|
||||
epsilon_w = tf.matmul(
|
||||
a=tf.expand_dims(epsilon_in, -1), b=tf.expand_dims(epsilon_out, 0))
|
||||
epsilon_b = epsilon_out
|
||||
|
||||
action_activation = tf.matmul(
|
||||
inputs,
|
||||
self.w + self.sigma_w * epsilon_w) + \
|
||||
self.b + self.sigma_b * epsilon_b
|
||||
|
||||
fn = get_activation_fn(self.activation, framework="tf")
|
||||
if fn is not None:
|
||||
action_activation = fn(action_activation)
|
||||
return action_activation
|
||||
|
||||
def _f_epsilon(self, x):
|
||||
return tf.math.sign(x) * tf.math.sqrt(tf.math.abs(x))
|
||||
@@ -65,22 +65,23 @@ class Categorical(TFActionDistribution):
|
||||
|
||||
@override(ActionDistribution)
|
||||
def entropy(self):
|
||||
a0 = self.inputs - tf.reduce_max(self.inputs, axis=1, keep_dims=True)
|
||||
a0 = self.inputs - tf.reduce_max(self.inputs, axis=1, keepdims=True)
|
||||
ea0 = tf.exp(a0)
|
||||
z0 = tf.reduce_sum(ea0, axis=1, keep_dims=True)
|
||||
z0 = tf.reduce_sum(ea0, axis=1, keepdims=True)
|
||||
p0 = ea0 / z0
|
||||
return tf.reduce_sum(p0 * (tf.log(z0) - a0), axis=1)
|
||||
return tf.reduce_sum(p0 * (tf.math.log(z0) - a0), axis=1)
|
||||
|
||||
@override(ActionDistribution)
|
||||
def kl(self, other):
|
||||
a0 = self.inputs - tf.reduce_max(self.inputs, axis=1, keep_dims=True)
|
||||
a1 = other.inputs - tf.reduce_max(other.inputs, axis=1, keep_dims=True)
|
||||
a0 = self.inputs - tf.reduce_max(self.inputs, axis=1, keepdims=True)
|
||||
a1 = other.inputs - tf.reduce_max(other.inputs, axis=1, keepdims=True)
|
||||
ea0 = tf.exp(a0)
|
||||
ea1 = tf.exp(a1)
|
||||
z0 = tf.reduce_sum(ea0, axis=1, keep_dims=True)
|
||||
z1 = tf.reduce_sum(ea1, axis=1, keep_dims=True)
|
||||
z0 = tf.reduce_sum(ea0, axis=1, keepdims=True)
|
||||
z1 = tf.reduce_sum(ea1, axis=1, keepdims=True)
|
||||
p0 = ea0 / z0
|
||||
return tf.reduce_sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=1)
|
||||
return tf.reduce_sum(
|
||||
p0 * (a0 - tf.math.log(z0) - a1 + tf.math.log(z1)), axis=1)
|
||||
|
||||
@override(TFActionDistribution)
|
||||
def _build_sample_op(self):
|
||||
@@ -230,8 +231,9 @@ class DiagGaussian(TFActionDistribution):
|
||||
@override(ActionDistribution)
|
||||
def logp(self, x):
|
||||
return -0.5 * tf.reduce_sum(
|
||||
tf.square((tf.to_float(x) - self.mean) / self.std), axis=1) - \
|
||||
0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[1]) - \
|
||||
tf.math.square((tf.cast(x, tf.float32) - self.mean) / self.std),
|
||||
axis=1
|
||||
) - 0.5 * np.log(2.0 * np.pi) * tf.cast(tf.shape(x)[1], tf.float32) - \
|
||||
tf.reduce_sum(self.log_std, axis=1)
|
||||
|
||||
@override(ActionDistribution)
|
||||
@@ -239,8 +241,9 @@ class DiagGaussian(TFActionDistribution):
|
||||
assert isinstance(other, DiagGaussian)
|
||||
return tf.reduce_sum(
|
||||
other.log_std - self.log_std +
|
||||
(tf.square(self.std) + tf.square(self.mean - other.mean)) /
|
||||
(2.0 * tf.square(other.std)) - 0.5,
|
||||
(tf.math.square(self.std) +
|
||||
tf.math.square(self.mean - other.mean)) /
|
||||
(2.0 * tf.math.square(other.std)) - 0.5,
|
||||
axis=1)
|
||||
|
||||
@override(ActionDistribution)
|
||||
@@ -250,7 +253,7 @@ class DiagGaussian(TFActionDistribution):
|
||||
|
||||
@override(TFActionDistribution)
|
||||
def _build_sample_op(self):
|
||||
return self.mean + self.std * tf.random_normal(tf.shape(self.mean))
|
||||
return self.mean + self.std * tf.random.normal(tf.shape(self.mean))
|
||||
|
||||
@staticmethod
|
||||
@override(ActionDistribution)
|
||||
|
||||
@@ -174,6 +174,7 @@ def build_eager_tf_policy(name,
|
||||
grad_stats_fn=None,
|
||||
extra_learn_fetches_fn=None,
|
||||
extra_action_fetches_fn=None,
|
||||
validate_spaces=None,
|
||||
before_init=None,
|
||||
before_loss_init=None,
|
||||
after_init=None,
|
||||
@@ -208,6 +209,9 @@ def build_eager_tf_policy(name,
|
||||
if get_default_config:
|
||||
config = dict(get_default_config(), **config)
|
||||
|
||||
if validate_spaces:
|
||||
validate_spaces(self, observation_space, action_space, config)
|
||||
|
||||
if before_init:
|
||||
before_init(self, observation_space, action_space, config)
|
||||
|
||||
|
||||
@@ -22,6 +22,7 @@ def build_tf_policy(name,
|
||||
grad_stats_fn=None,
|
||||
extra_action_fetches_fn=None,
|
||||
extra_learn_fetches_fn=None,
|
||||
validate_spaces=None,
|
||||
before_init=None,
|
||||
before_loss_init=None,
|
||||
after_init=None,
|
||||
@@ -73,6 +74,9 @@ def build_tf_policy(name,
|
||||
a dict of TF fetches given the policy object
|
||||
extra_learn_fetches_fn (func): optional function that returns a dict of
|
||||
extra values to fetch and return when learning on a batch
|
||||
validate_spaces (Optional[callable]): Optional callable that takes the
|
||||
Policy, observation_space, action_space, and config to check for
|
||||
correctness.
|
||||
before_init (func): optional function to run at the beginning of
|
||||
policy init that takes the same arguments as the policy constructor
|
||||
before_loss_init (func): optional function to run prior to loss
|
||||
@@ -113,6 +117,9 @@ def build_tf_policy(name,
|
||||
if get_default_config:
|
||||
config = dict(get_default_config(), **config)
|
||||
|
||||
if validate_spaces:
|
||||
validate_spaces(self, obs_space, action_space, config)
|
||||
|
||||
if before_init:
|
||||
before_init(self, obs_space, action_space, config)
|
||||
|
||||
|
||||
@@ -20,6 +20,7 @@ def build_torch_policy(name,
|
||||
extra_action_out_fn=None,
|
||||
extra_grad_process_fn=None,
|
||||
optimizer_fn=None,
|
||||
validate_spaces=None,
|
||||
before_init=None,
|
||||
after_init=None,
|
||||
action_sampler_fn=None,
|
||||
@@ -48,6 +49,9 @@ def build_torch_policy(name,
|
||||
called after gradients are computed and returns processing info.
|
||||
optimizer_fn (Optional[callable]): Optional callable that returns a
|
||||
torch optimizer given the policy and config.
|
||||
validate_spaces (Optional[callable]): Optional callable that takes the
|
||||
Policy, observation_space, action_space, and config to check for
|
||||
correctness.
|
||||
before_init (Optional[callable]): Optional callable to run at the
|
||||
beginning of `Policy.__init__` that takes the same arguments as
|
||||
the Policy constructor.
|
||||
@@ -94,8 +98,11 @@ def build_torch_policy(name,
|
||||
config = dict(get_default_config(), **config)
|
||||
self.config = config
|
||||
|
||||
if validate_spaces:
|
||||
validate_spaces(self, obs_space, action_space, self.config)
|
||||
|
||||
if before_init:
|
||||
before_init(self, obs_space, action_space, config)
|
||||
before_init(self, obs_space, action_space, self.config)
|
||||
|
||||
# Model is customized (use default action dist class).
|
||||
if make_model:
|
||||
|
||||
@@ -23,3 +23,5 @@ if __name__ == "__main__":
|
||||
|
||||
# Clean up.
|
||||
del os.environ["RLLIB_TEST_NO_TF_IMPORT"]
|
||||
|
||||
print("ok")
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import numpy as np
|
||||
import pytest
|
||||
import time
|
||||
import gym
|
||||
import queue
|
||||
@@ -252,5 +251,6 @@ def test_store_to_replay_actor(ray_start_regular_shared):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
import sys
|
||||
sys.exit(pytest.main(["-v", __file__]))
|
||||
|
||||
@@ -158,6 +158,7 @@ class TestRolloutWorker(unittest.TestCase):
|
||||
self.assertEqual(batch["prev_actions"].tolist(),
|
||||
to_prev(batch["actions"]))
|
||||
self.assertGreater(batch["advantages"][0], 1)
|
||||
ev.stop()
|
||||
|
||||
def test_batch_ids(self):
|
||||
ev = RolloutWorker(
|
||||
@@ -170,6 +171,7 @@ class TestRolloutWorker(unittest.TestCase):
|
||||
self.assertEqual(len(set(batch2["unroll_id"])), 1)
|
||||
self.assertEqual(
|
||||
len(set(SampleBatch.concat(batch1, batch2)["unroll_id"])), 2)
|
||||
ev.stop()
|
||||
|
||||
def test_global_vars_update(self):
|
||||
# Allow for Unittest run.
|
||||
@@ -202,10 +204,9 @@ class TestRolloutWorker(unittest.TestCase):
|
||||
break
|
||||
self.assertLess(
|
||||
result["info"]["learner"]["default_policy"]["cur_lr"], 0.07)
|
||||
agent.stop()
|
||||
|
||||
def test_no_step_on_init(self):
|
||||
# Allow for Unittest run.
|
||||
ray.init(num_cpus=5, ignore_reinit_error=True)
|
||||
register_env("fail", lambda _: FailOnStepEnv())
|
||||
for fw in framework_iterator(frameworks=()):
|
||||
pg = PGTrainer(
|
||||
@@ -214,6 +215,7 @@ class TestRolloutWorker(unittest.TestCase):
|
||||
"framework": fw,
|
||||
})
|
||||
self.assertRaises(Exception, lambda: pg.train())
|
||||
pg.stop()
|
||||
|
||||
def test_callbacks(self):
|
||||
for fw in framework_iterator(frameworks=("torch", "tf")):
|
||||
@@ -240,10 +242,9 @@ class TestRolloutWorker(unittest.TestCase):
|
||||
self.assertGreater(counts["start"], 0)
|
||||
self.assertGreater(counts["end"], 0)
|
||||
self.assertGreater(counts["step"], 0)
|
||||
pg.stop()
|
||||
|
||||
def test_query_evaluators(self):
|
||||
# Allow for Unittest run.
|
||||
ray.init(num_cpus=5, ignore_reinit_error=True)
|
||||
register_env("test", lambda _: gym.make("CartPole-v0"))
|
||||
for fw in framework_iterator(frameworks=("torch", "tf")):
|
||||
pg = PGTrainer(
|
||||
@@ -263,6 +264,7 @@ class TestRolloutWorker(unittest.TestCase):
|
||||
self.assertEqual(results, [10, 10, 10])
|
||||
self.assertEqual(results2, [(0, 10), (1, 10), (2, 10)])
|
||||
self.assertEqual(results3, [[1, 1], [1, 1], [1, 1]])
|
||||
pg.stop()
|
||||
|
||||
def test_reward_clipping(self):
|
||||
# clipping on
|
||||
@@ -274,6 +276,7 @@ class TestRolloutWorker(unittest.TestCase):
|
||||
self.assertEqual(max(ev.sample()["rewards"]), 1)
|
||||
result = collect_metrics(ev, [])
|
||||
self.assertEqual(result["episode_reward_mean"], 1000)
|
||||
ev.stop()
|
||||
|
||||
# clipping off
|
||||
ev2 = RolloutWorker(
|
||||
@@ -284,6 +287,7 @@ class TestRolloutWorker(unittest.TestCase):
|
||||
self.assertEqual(max(ev2.sample()["rewards"]), 100)
|
||||
result2 = collect_metrics(ev2, [])
|
||||
self.assertEqual(result2["episode_reward_mean"], 1000)
|
||||
ev2.stop()
|
||||
|
||||
def test_hard_horizon(self):
|
||||
ev = RolloutWorker(
|
||||
@@ -302,6 +306,7 @@ class TestRolloutWorker(unittest.TestCase):
|
||||
self.assertEqual(np.argmax(samples["obs"][4]), 0)
|
||||
# 3 done values.
|
||||
self.assertEqual(sum(samples["dones"]), 3)
|
||||
ev.stop()
|
||||
|
||||
# A gym env's max_episode_steps is smaller than Trainer's horizon.
|
||||
ev = RolloutWorker(
|
||||
@@ -322,6 +327,7 @@ class TestRolloutWorker(unittest.TestCase):
|
||||
False, False, False, False, False, True, False, False, False,
|
||||
False, False, True
|
||||
])
|
||||
ev.stop()
|
||||
|
||||
def test_soft_horizon(self):
|
||||
ev = RolloutWorker(
|
||||
@@ -336,10 +342,9 @@ class TestRolloutWorker(unittest.TestCase):
|
||||
self.assertEqual(len(set(samples["eps_id"])), 3)
|
||||
# only 1 hard done value
|
||||
self.assertEqual(sum(samples["dones"]), 1)
|
||||
ev.stop()
|
||||
|
||||
def test_metrics(self):
|
||||
# Allow for Unittest run.
|
||||
ray.init(num_cpus=5, ignore_reinit_error=True)
|
||||
ev = RolloutWorker(
|
||||
env_creator=lambda _: MockEnv(episode_length=10),
|
||||
policy=MockPolicy,
|
||||
@@ -353,6 +358,7 @@ class TestRolloutWorker(unittest.TestCase):
|
||||
result = collect_metrics(ev, [remote_ev])
|
||||
self.assertEqual(result["episodes_this_iter"], 20)
|
||||
self.assertEqual(result["episode_reward_mean"], 10)
|
||||
ev.stop()
|
||||
|
||||
def test_async(self):
|
||||
ev = RolloutWorker(
|
||||
@@ -363,6 +369,7 @@ class TestRolloutWorker(unittest.TestCase):
|
||||
for key in ["obs", "actions", "rewards", "dones", "advantages"]:
|
||||
self.assertIn(key, batch)
|
||||
self.assertGreater(batch["advantages"][0], 1)
|
||||
ev.stop()
|
||||
|
||||
def test_auto_vectorization(self):
|
||||
ev = RolloutWorker(
|
||||
@@ -386,6 +393,7 @@ class TestRolloutWorker(unittest.TestCase):
|
||||
self.assertEqual(env.unwrapped.config.worker_index, 0)
|
||||
indices.append(env.unwrapped.config.vector_index)
|
||||
self.assertEqual(indices, [0, 1, 2, 3, 4, 5, 6, 7])
|
||||
ev.stop()
|
||||
|
||||
def test_batches_larger_when_vectorized(self):
|
||||
ev = RolloutWorker(
|
||||
@@ -401,6 +409,7 @@ class TestRolloutWorker(unittest.TestCase):
|
||||
batch = ev.sample()
|
||||
result = collect_metrics(ev, [])
|
||||
self.assertEqual(result["episodes_this_iter"], 4)
|
||||
ev.stop()
|
||||
|
||||
def test_vector_env_support(self):
|
||||
ev = RolloutWorker(
|
||||
@@ -418,6 +427,7 @@ class TestRolloutWorker(unittest.TestCase):
|
||||
self.assertEqual(batch.count, 10)
|
||||
result = collect_metrics(ev, [])
|
||||
self.assertEqual(result["episodes_this_iter"], 8)
|
||||
ev.stop()
|
||||
|
||||
def test_truncate_episodes(self):
|
||||
ev = RolloutWorker(
|
||||
@@ -427,6 +437,7 @@ class TestRolloutWorker(unittest.TestCase):
|
||||
batch_mode="truncate_episodes")
|
||||
batch = ev.sample()
|
||||
self.assertEqual(batch.count, 15)
|
||||
ev.stop()
|
||||
|
||||
def test_complete_episodes(self):
|
||||
ev = RolloutWorker(
|
||||
@@ -436,6 +447,7 @@ class TestRolloutWorker(unittest.TestCase):
|
||||
batch_mode="complete_episodes")
|
||||
batch = ev.sample()
|
||||
self.assertEqual(batch.count, 10)
|
||||
ev.stop()
|
||||
|
||||
def test_complete_episodes_packing(self):
|
||||
ev = RolloutWorker(
|
||||
@@ -448,6 +460,7 @@ class TestRolloutWorker(unittest.TestCase):
|
||||
self.assertEqual(
|
||||
batch["t"].tolist(),
|
||||
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
|
||||
ev.stop()
|
||||
|
||||
def test_filter_sync(self):
|
||||
ev = RolloutWorker(
|
||||
@@ -461,6 +474,7 @@ class TestRolloutWorker(unittest.TestCase):
|
||||
obs_f = filters[DEFAULT_POLICY_ID]
|
||||
self.assertNotEqual(obs_f.rs.n, 0)
|
||||
self.assertNotEqual(obs_f.buffer.n, 0)
|
||||
ev.stop()
|
||||
|
||||
def test_get_filters(self):
|
||||
ev = RolloutWorker(
|
||||
@@ -476,6 +490,7 @@ class TestRolloutWorker(unittest.TestCase):
|
||||
obs_f2 = filters2[DEFAULT_POLICY_ID]
|
||||
self.assertGreaterEqual(obs_f2.rs.n, obs_f.rs.n)
|
||||
self.assertGreaterEqual(obs_f2.buffer.n, obs_f.buffer.n)
|
||||
ev.stop()
|
||||
|
||||
def test_sync_filter(self):
|
||||
ev = RolloutWorker(
|
||||
@@ -498,6 +513,23 @@ class TestRolloutWorker(unittest.TestCase):
|
||||
obs_f = filters[DEFAULT_POLICY_ID]
|
||||
self.assertGreaterEqual(obs_f.rs.n, 100)
|
||||
self.assertLessEqual(obs_f.buffer.n, 20)
|
||||
ev.stop()
|
||||
|
||||
def test_extra_python_envs(self):
|
||||
extra_envs = {"env_key_1": "env_value_1", "env_key_2": "env_value_2"}
|
||||
self.assertFalse("env_key_1" in os.environ)
|
||||
self.assertFalse("env_key_2" in os.environ)
|
||||
ev = RolloutWorker(
|
||||
env_creator=lambda _: MockEnv(10),
|
||||
policy=MockPolicy,
|
||||
extra_python_environs=extra_envs)
|
||||
self.assertTrue("env_key_1" in os.environ)
|
||||
self.assertTrue("env_key_2" in os.environ)
|
||||
ev.stop()
|
||||
|
||||
# reset to original
|
||||
del os.environ["env_key_1"]
|
||||
del os.environ["env_key_2"]
|
||||
|
||||
def sample_and_flush(self, ev):
|
||||
time.sleep(2)
|
||||
@@ -508,21 +540,6 @@ class TestRolloutWorker(unittest.TestCase):
|
||||
self.assertNotEqual(obs_f.buffer.n, 0)
|
||||
return obs_f
|
||||
|
||||
def test_extra_python_envs(self):
|
||||
extra_envs = {"env_key_1": "env_value_1", "env_key_2": "env_value_2"}
|
||||
self.assertFalse("env_key_1" in os.environ)
|
||||
self.assertFalse("env_key_2" in os.environ)
|
||||
RolloutWorker(
|
||||
env_creator=lambda _: MockEnv(10),
|
||||
policy=MockPolicy,
|
||||
extra_python_environs=extra_envs)
|
||||
self.assertTrue("env_key_1" in os.environ)
|
||||
self.assertTrue("env_key_2" in os.environ)
|
||||
|
||||
# reset to original
|
||||
del os.environ["env_key_1"]
|
||||
del os.environ["env_key_2"]
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import pytest
|
||||
|
||||
@@ -305,7 +305,7 @@ class ParameterNoise(Exploration):
|
||||
added_noises.append(
|
||||
tf.assign(
|
||||
noise,
|
||||
tf.random_normal(
|
||||
tf.random.normal(
|
||||
shape=noise.shape,
|
||||
stddev=self.stddev,
|
||||
dtype=tf.float32)))
|
||||
|
||||
@@ -13,7 +13,7 @@ def huber_loss(x, delta=1.0):
|
||||
"""Reference: https://en.wikipedia.org/wiki/Huber_loss"""
|
||||
return tf.where(
|
||||
tf.abs(x) < delta,
|
||||
tf.square(x) * 0.5, delta * (tf.abs(x) - 0.5 * delta))
|
||||
tf.math.square(x) * 0.5, delta * (tf.abs(x) - 0.5 * delta))
|
||||
|
||||
|
||||
def reduce_mean_ignore_inf(x, axis):
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from typing import Any, Dict, Union, Tuple
|
||||
from typing import Any, Dict, List, Tuple, Union
|
||||
import gym
|
||||
|
||||
# Represents a fully filled out config of a Trainer class.
|
||||
@@ -77,3 +77,6 @@ TensorType = Any
|
||||
|
||||
# Either a plain tensor, or a dict or tuple of tensors (or StructTensors).
|
||||
TensorStructType = Union[TensorType, dict, tuple]
|
||||
|
||||
# A shape of a tensor.
|
||||
TensorShape = Union[Tuple[int], List[int]]
|
||||
|
||||
Reference in New Issue
Block a user