[RLlib] Minor cleanup in preparation for tf2.x support. (#9130)

* WIP.

* Fixes.

* LINT.

* Fixes.

* Fixes and LINT.

* WIP.
This commit is contained in:
Sven Mika
2020-06-25 19:01:32 +02:00
committed by GitHub
parent aa3fd62cac
commit 4fd8977eaf
37 changed files with 347 additions and 176 deletions
+2 -2
View File
@@ -42,8 +42,8 @@ class TensorFlowVariables:
Args:
output (tf.Operation, List[tf.Operation]): The tensorflow
operation to extract all variables from.
sess (tf.Session): Session used for running the get and set
methods.
sess (Optional[tf.Session]): Optional tf.Session used for running
the get and set methods in tf graph mode.
input_variables (List[tf.Variables]): Variables to include in the
list.
"""
+11 -10
View File
@@ -496,7 +496,7 @@ py_test(
"agents/ppo/tests/test.py"] # TODO(sven): Move down once PR 6889 merged
)
# DDPPO
# PPO: DDPPO
py_test(
name = "test_ddppo",
tags = ["agents_dir"],
@@ -504,7 +504,7 @@ py_test(
srcs = ["agents/ppo/tests/test_ddppo.py"]
)
# APPO
# PPO: APPO
py_test(
name = "test_appo",
tags = ["agents_dir"],
@@ -512,7 +512,15 @@ py_test(
srcs = ["agents/ppo/tests/test_appo.py"]
)
# SAC
# QMixTrainer
py_test(
name = "test_qmix",
tags = ["agents_dir"],
size = "medium",
srcs = ["agents/qmix/tests/test_qmix.py"]
)
# SACTrainer
py_test(
name = "test_sac",
tags = ["agents_dir"],
@@ -1103,13 +1111,6 @@ py_test(
srcs = ["tests/test_attention_net_learning.py"]
)
py_test(
name = "tests/test_avail_actions_qmix",
tags = ["tests_dir", "tests_dir_A"],
size = "medium",
srcs = ["tests/test_avail_actions_qmix.py"]
)
py_test(
name = "tests/test_catalog",
tags = ["tests_dir", "tests_dir_C"],
+4 -3
View File
@@ -27,7 +27,7 @@ class A3CLoss:
self.pi_loss = -tf.reduce_sum(log_prob * advantages)
delta = vf - v_target
self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta))
self.vf_loss = 0.5 * tf.reduce_sum(tf.math.square(delta))
self.entropy = tf.reduce_sum(action_dist.entropy())
self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff -
self.entropy * entropy_coeff)
@@ -90,14 +90,15 @@ def stats(policy, train_batch):
"cur_lr": tf.cast(policy.cur_lr, tf.float64),
"policy_loss": policy.loss.pi_loss,
"policy_entropy": policy.loss.entropy,
"var_gnorm": tf.global_norm(list(policy.model.trainable_variables())),
"var_gnorm": tf.linalg.global_norm(
list(policy.model.trainable_variables())),
"vf_loss": policy.loss.vf_loss,
}
def grad_stats(policy, train_batch, grads):
return {
"grad_gnorm": tf.global_norm(grads),
"grad_gnorm": tf.linalg.global_norm(grads),
"vf_explained_var": explained_variance(
train_batch[Postprocessing.VALUE_TARGETS],
policy.model.value_function()),
+7 -2
View File
@@ -157,6 +157,12 @@ DEFAULT_CONFIG = with_common_config({
def validate_config(config):
if config["model"]["custom_model"]:
logger.warning(
"Setting use_state_preprocessor=True since a custom model "
"was specified.")
config["use_state_preprocessor"] = True
# TODO(sven): Remove at some point.
# Backward compatibility of noise-based exploration config.
schedule_max_timesteps = None
@@ -191,8 +197,7 @@ def validate_config(config):
if config.get("parameter_noise", DEPRECATED_VALUE) != DEPRECATED_VALUE:
deprecation_warning("parameter_noise", "exploration_config={"
"type=ParameterNoise"
"}")
"type=ParameterNoise}")
if config["exploration_config"]["type"] == "ParameterNoise":
if config["batch_mode"] != "complete_episodes":
+21 -21
View File
@@ -15,9 +15,9 @@ from ray.rllib.models import ModelCatalog
from ray.rllib.models.tf.tf_action_dist import Deterministic
from ray.rllib.models.torch.torch_action_dist import TorchDeterministic
from ray.rllib.utils.annotations import override
from ray.rllib.utils.error import UnsupportedSpaceException
from ray.rllib.policy.tf_policy import TFPolicy
from ray.rllib.policy.tf_policy_template import build_tf_policy
from ray.rllib.utils.error import UnsupportedSpaceException
from ray.rllib.utils.framework import try_import_tf
from ray.rllib.utils.tf_ops import huber_loss, minimize_and_clip, \
make_tf_callable
@@ -36,22 +36,6 @@ TWIN_Q_TARGET_SCOPE = "twin_target_critic"
def build_ddpg_models(policy, observation_space, action_space, config):
if config["model"]["custom_model"]:
logger.warning(
"Setting use_state_preprocessor=True since a custom model "
"was specified.")
config["use_state_preprocessor"] = True
if not isinstance(action_space, Box):
raise UnsupportedSpaceException(
"Action space {} is not supported for DDPG.".format(action_space))
elif len(action_space.shape) > 1:
raise UnsupportedSpaceException(
"Action space has multiple dimensions "
"{}. ".format(action_space.shape) +
"Consider reshaping this into a single dimension, "
"using a Tuple action space, or the multi-agent API.")
if policy.config["use_state_preprocessor"]:
default_model = None # catalog decides
num_outputs = 256 # arbitrary
@@ -157,7 +141,7 @@ def ddpg_actor_critic_loss(policy, model, _, train_batch):
if policy.config["smooth_target_policy"]:
target_noise_clip = policy.config["target_noise_clip"]
clipped_normal_sample = tf.clip_by_value(
tf.random_normal(
tf.random.normal(
tf.shape(policy_tp1),
stddev=policy.config["target_noise"]), -target_noise_clip,
target_noise_clip)
@@ -219,15 +203,17 @@ def ddpg_actor_critic_loss(policy, model, _, train_batch):
errors = huber_loss(td_error, huber_threshold) + \
huber_loss(twin_td_error, huber_threshold)
else:
errors = 0.5 * tf.square(td_error) + 0.5 * tf.square(twin_td_error)
errors = 0.5 * tf.math.square(td_error) + \
0.5 * tf.math.square(twin_td_error)
else:
td_error = q_t_selected - q_t_selected_target
if use_huber:
errors = huber_loss(td_error, huber_threshold)
else:
errors = 0.5 * tf.square(td_error)
errors = 0.5 * tf.math.square(td_error)
critic_loss = tf.reduce_mean(train_batch[PRIO_WEIGHTS] * errors)
critic_loss = tf.reduce_mean(
tf.cast(train_batch[PRIO_WEIGHTS], tf.float32) * errors)
actor_loss = -tf.reduce_mean(q_t_det_policy)
# Add l2-regularization if required.
@@ -417,6 +403,19 @@ def setup_late_mixins(policy, obs_space, action_space, config):
TargetNetworkMixin.__init__(policy, config)
def validate_spaces(pid, observation_space, action_space, config):
    """Raise if `action_space` cannot be used with DDPG.

    Only `Box` action spaces whose shape has at most one dimension pass.

    Args:
        pid: Interpolated into the error messages to identify the policy.
        observation_space: Unused by this check.
        action_space: The action space to validate.
        config: Unused by this check.

    Raises:
        UnsupportedSpaceException: If `action_space` is not a `Box`, or is a
            `Box` whose shape has more than one dimension.
    """
    # Guard 1: DDPG only handles Box action spaces.
    if not isinstance(action_space, Box):
        unsupported_msg = (
            "Action space ({}) of {} is not supported for "
            "DDPG.".format(action_space, pid))
        raise UnsupportedSpaceException(unsupported_msg)

    # Guard 2: the Box must not have more than one dimension.
    box_shape = action_space.shape
    if len(box_shape) > 1:
        multi_dim_msg = (
            "Action space ({}) of {} has multiple dimensions "
            "{}. ".format(action_space, pid, box_shape) +
            "Consider reshaping this into a single dimension, "
            "using a Tuple action space, or the multi-agent API.")
        raise UnsupportedSpaceException(multi_dim_msg)
DDPGTFPolicy = build_tf_policy(
name="DDPGTFPolicy",
get_default_config=lambda: ray.rllib.agents.ddpg.ddpg.DEFAULT_CONFIG,
@@ -429,6 +428,7 @@ DDPGTFPolicy = build_tf_policy(
gradients_fn=gradients_fn,
apply_gradients_fn=build_apply_op,
extra_learn_fetches_fn=lambda policy: {"td_error": policy.td_error},
validate_spaces=validate_spaces,
before_init=before_init_fn,
before_loss_init=setup_mid_mixins,
after_init=setup_late_mixins,
+2 -1
View File
@@ -2,7 +2,7 @@ import logging
import ray
from ray.rllib.agents.ddpg.ddpg_tf_policy import build_ddpg_models, \
get_distribution_inputs_and_class
get_distribution_inputs_and_class, validate_spaces
from ray.rllib.agents.dqn.dqn_tf_policy import postprocess_nstep_and_prio, \
PRIO_WEIGHTS
from ray.rllib.models.torch.torch_action_dist import TorchDeterministic
@@ -269,6 +269,7 @@ DDPGTorchPolicy = build_torch_policy(
postprocess_fn=postprocess_nstep_and_prio,
extra_grad_process_fn=gradients_fn,
optimizer_fn=make_ddpg_optimizers,
validate_spaces=validate_spaces,
before_init=before_init_fn,
after_init=setup_late_mixins,
action_distribution_fn=get_distribution_inputs_and_class,
@@ -234,8 +234,8 @@ class DistributionalQTFModel(TFModelV2):
"""
in_size = int(action_in.shape[1])
epsilon_in = tf.random_normal(shape=[in_size])
epsilon_out = tf.random_normal(shape=[out_size])
epsilon_in = tf.random.normal(shape=[in_size])
epsilon_out = tf.random.normal(shape=[out_size])
epsilon_in = self._f_epsilon(epsilon_in)
epsilon_out = self._f_epsilon(epsilon_out)
epsilon_w = tf.matmul(
@@ -279,4 +279,4 @@ class DistributionalQTFModel(TFModelV2):
return tf.nn.relu(action_activation)
def _f_epsilon(self, x):
return tf.sign(x) * tf.sqrt(tf.abs(x))
return tf.math.sign(x) * tf.math.sqrt(tf.math.abs(x))
+2 -2
View File
@@ -54,11 +54,11 @@ class QLoss:
r_tau = tf.clip_by_value(r_tau, v_min, v_max)
b = (r_tau - v_min) / ((v_max - v_min) / float(num_atoms - 1))
lb = tf.floor(b)
ub = tf.ceil(b)
ub = tf.math.ceil(b)
# indispensable judgement which is missed in most implementations
# when b happens to be an integer, lb == ub, so pr_j(s', a*) will
# be discarded because (ub-b) == (b-lb) == 0
floor_equal_ceil = tf.to_float(tf.less(ub - lb, 0.5))
floor_equal_ceil = tf.cast(tf.less(ub - lb, 0.5), tf.float32)
l_project = tf.one_hot(
tf.cast(lb, dtype=tf.int32),
+1 -1
View File
@@ -53,7 +53,7 @@ def build_q_losses(policy, model, dist_class, train_batch):
is_training=True)
# q scores for actions which we know were selected in the given state.
one_hot_selection = F.one_hot(train_batch[SampleBatch.ACTIONS],
one_hot_selection = F.one_hot(train_batch[SampleBatch.ACTIONS].long(),
policy.action_space.n)
q_t_selected = torch.sum(q_t * one_hot_selection, 1)
+1 -1
View File
@@ -50,7 +50,7 @@ class DYNATorchModel(TorchModelV2, nn.Module):
# One-hot the actions.
actions_flat = nn.functional.one_hot(
actions, num_classes=self.action_space.n).float()
actions.long(), num_classes=self.action_space.n).float()
# Push through our underlying Model.
next_obs, _ = self.forward({
"obs_flat": torch.cat([observations, actions_flat], -1)
+4 -4
View File
@@ -80,7 +80,7 @@ class VTraceLoss:
behaviour_policy_logits=behaviour_logits,
target_policy_logits=target_logits,
actions=tf.unstack(actions, axis=2),
discounts=tf.to_float(~dones) * discount,
discounts=tf.cast(~dones, tf.float32) * discount,
rewards=rewards,
values=values,
bootstrap_value=bootstrap_value,
@@ -98,7 +98,7 @@ class VTraceLoss:
# The baseline loss.
delta = tf.boolean_mask(values - self.vtrace_returns.vs, valid_mask)
self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta))
self.vf_loss = 0.5 * tf.reduce_sum(tf.math.square(delta))
# The entropy loss.
self.entropy = tf.reduce_sum(
@@ -228,7 +228,7 @@ def stats(policy, train_batch):
"policy_loss": policy.loss.pi_loss,
"entropy": policy.loss.entropy,
"entropy_coeff": tf.cast(policy.entropy_coeff, tf.float64),
"var_gnorm": tf.global_norm(policy.model.trainable_variables()),
"var_gnorm": tf.linalg.global_norm(policy.model.trainable_variables()),
"vf_loss": policy.loss.vf_loss,
"vf_explained_var": explained_variance(
tf.reshape(policy.loss.value_targets, [-1]),
@@ -238,7 +238,7 @@ def stats(policy, train_batch):
def grad_stats(policy, train_batch, grads):
return {
"grad_gnorm": tf.global_norm(grads),
"grad_gnorm": tf.linalg.global_norm(grads),
}
+5 -5
View File
@@ -28,7 +28,7 @@ class ValueNetworkMixin:
class ValueLoss:
def __init__(self, state_values, cumulative_rewards):
self.loss = 0.5 * tf.reduce_mean(
tf.square(state_values - cumulative_rewards))
tf.math.square(state_values - cumulative_rewards))
class ReweightedImitationLoss:
@@ -39,13 +39,13 @@ class ReweightedImitationLoss:
# update averaged advantage norm
update_adv_norm = tf.assign_add(
ref=policy._ma_adv_norm,
value=1e-6 *
(tf.reduce_mean(tf.square(adv)) - policy._ma_adv_norm))
value=1e-6 * (
tf.reduce_mean(tf.math.square(adv)) - policy._ma_adv_norm))
# exponentially weighted advantages
with tf.control_dependencies([update_adv_norm]):
exp_advs = tf.exp(
beta * tf.divide(adv, 1e-8 + tf.sqrt(policy._ma_adv_norm)))
exp_advs = tf.math.exp(beta * tf.math.divide(
adv, 1e-8 + tf.math.sqrt(policy._ma_adv_norm)))
# log\pi_\theta(a|s)
logprobs = action_dist.logp(actions)
+4 -4
View File
@@ -78,7 +78,7 @@ class PPOSurrogateLoss:
# The baseline loss
delta = values - value_targets
self.value_targets = value_targets
self.vf_loss = 0.5 * reduce_mean_valid(tf.square(delta))
self.vf_loss = 0.5 * reduce_mean_valid(tf.math.square(delta))
# The entropy loss
self.entropy = reduce_mean_valid(actions_entropy)
@@ -159,7 +159,7 @@ class VTraceSurrogateLoss:
behaviour_policy_logits=behaviour_logits,
target_policy_logits=old_policy_behaviour_logits,
actions=tf.unstack(actions, axis=2),
discounts=tf.to_float(~dones) * discount,
discounts=tf.cast(~dones, tf.float32) * discount,
rewards=rewards,
values=values,
bootstrap_value=bootstrap_value,
@@ -185,7 +185,7 @@ class VTraceSurrogateLoss:
# The baseline loss
delta = values - self.vtrace_returns.vs
self.value_targets = self.vtrace_returns.vs
self.vf_loss = 0.5 * reduce_mean_valid(tf.square(delta))
self.vf_loss = 0.5 * reduce_mean_valid(tf.math.square(delta))
# The entropy loss
self.entropy = reduce_mean_valid(actions_entropy)
@@ -350,7 +350,7 @@ def stats(policy, train_batch):
"cur_lr": tf.cast(policy.cur_lr, tf.float64),
"policy_loss": policy.loss.pi_loss,
"entropy": policy.loss.entropy,
"var_gnorm": tf.global_norm(policy.model.trainable_variables()),
"var_gnorm": tf.linalg.global_norm(policy.model.trainable_variables()),
"vf_loss": policy.loss.vf_loss,
"vf_explained_var": explained_variance(
tf.reshape(policy.loss.value_targets, [-1]),
+2 -2
View File
@@ -89,10 +89,10 @@ class PPOLoss:
self.mean_policy_loss = reduce_mean_valid(-surrogate_loss)
if use_gae:
vf_loss1 = tf.square(value_fn - value_targets)
vf_loss1 = tf.math.square(value_fn - value_targets)
vf_clipped = vf_preds + tf.clip_by_value(
value_fn - vf_preds, -vf_clip_param, vf_clip_param)
vf_loss2 = tf.square(vf_clipped - value_targets)
vf_loss2 = tf.math.square(vf_clipped - value_targets)
vf_loss = tf.maximum(vf_loss1, vf_loss2)
self.mean_vf_loss = reduce_mean_valid(vf_loss)
loss = reduce_mean_valid(
+11 -1
View File
@@ -1,8 +1,12 @@
import logging
from ray.rllib.agents.trainer import with_common_config
from ray.rllib.agents.dqn.dqn import GenericOffPolicyTrainer
from ray.rllib.agents.sac.sac_tf_policy import SACTFPolicy
from ray.rllib.utils.deprecation import deprecation_warning, DEPRECATED_VALUE
logger = logging.getLogger(__name__)
OPTIMIZER_SHARED_CONFIGS = [
"buffer_size", "prioritized_replay", "prioritized_replay_alpha",
"prioritized_replay_beta", "prioritized_replay_eps",
@@ -131,6 +135,12 @@ def get_policy_class(config):
def validate_config(config):
if config["model"].get("custom_model"):
logger.warning(
"Setting use_state_preprocessor=True since a custom model "
"was specified.")
config["use_state_preprocessor"] = True
if config.get("grad_norm_clipping", DEPRECATED_VALUE) != DEPRECATED_VALUE:
deprecation_warning("grad_norm_clipping", "grad_clip")
config["grad_clip"] = config.pop("grad_norm_clipping")
@@ -154,7 +164,7 @@ def validate_config(config):
SACTrainer = GenericOffPolicyTrainer.with_updates(
name="SAC",
default_config=DEFAULT_CONFIG,
validate_config=validate_config,
default_policy=SACTFPolicy,
get_policy_class=get_policy_class,
validate_config=validate_config,
)
+14 -15
View File
@@ -24,21 +24,6 @@ logger = logging.getLogger(__name__)
def build_sac_model(policy, obs_space, action_space, config):
if config["model"].get("custom_model"):
logger.warning(
"Setting use_state_preprocessor=True since a custom model "
"was specified.")
config["use_state_preprocessor"] = True
if not isinstance(action_space, (Box, Discrete)):
raise UnsupportedSpaceException(
"Action space {} is not supported for SAC.".format(action_space))
if isinstance(action_space, Box) and len(action_space.shape) > 1:
raise UnsupportedSpaceException(
"Action space has multiple dimensions "
"{}. ".format(action_space.shape) +
"Consider reshaping this into a single dimension, "
"using a Tuple action space, or the multi-agent API.")
# 2 cases:
# 1) with separate state-preprocessor (before obs+action concat).
# 2) no separate state-preprocessor: concat obs+actions right away.
@@ -425,6 +410,19 @@ def setup_late_mixins(policy, obs_space, action_space, config):
TargetNetworkMixin.__init__(policy, config)
def validate_spaces(pid, observation_space, action_space, config):
    """Raise if `action_space` cannot be used with SAC.

    `Discrete` spaces pass, as do `Box` spaces whose shape has at most one
    dimension.

    Args:
        pid: Interpolated into the error messages to identify the policy.
        observation_space: Unused by this check.
        action_space: The action space to validate.
        config: Unused by this check.

    Raises:
        UnsupportedSpaceException: If `action_space` is neither a `Box` nor
            a `Discrete`, or is a `Box` whose shape has more than one
            dimension.
    """
    if isinstance(action_space, Box):
        # Box spaces are fine as long as they are not multi-dimensional.
        box_shape = action_space.shape
        if len(box_shape) > 1:
            multi_dim_msg = (
                "Action space ({}) of {} has multiple dimensions "
                "{}. ".format(action_space, pid, box_shape) +
                "Consider reshaping this into a single dimension, "
                "using a Tuple action space, or the multi-agent API.")
            raise UnsupportedSpaceException(multi_dim_msg)
    elif not isinstance(action_space, Discrete):
        # Neither Box nor Discrete: SAC has no support for this space.
        unsupported_msg = (
            "Action space ({}) of {} is not supported for "
            "SAC.".format(action_space, pid))
        raise UnsupportedSpaceException(unsupported_msg)
SACTFPolicy = build_tf_policy(
name="SACTFPolicy",
get_default_config=lambda: ray.rllib.agents.sac.sac.DEFAULT_CONFIG,
@@ -439,6 +437,7 @@ SACTFPolicy = build_tf_policy(
mixins=[
TargetNetworkMixin, ActorCriticOptimizerMixin, ComputeTDErrorMixin
],
validate_spaces=validate_spaces,
before_init=setup_early_mixins,
before_loss_init=setup_mid_mixins,
after_init=setup_late_mixins,
+2 -1
View File
@@ -5,7 +5,7 @@ import ray
import ray.experimental.tf_utils
from ray.rllib.agents.a3c.a3c_torch_policy import apply_grad_clipping
from ray.rllib.agents.sac.sac_tf_policy import build_sac_model, \
postprocess_trajectory
postprocess_trajectory, validate_spaces
from ray.rllib.agents.dqn.dqn_tf_policy import PRIO_WEIGHTS
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.policy.torch_policy_template import build_torch_policy
@@ -336,6 +336,7 @@ SACTorchPolicy = build_torch_policy(
postprocess_fn=postprocess_trajectory,
extra_grad_process_fn=apply_grad_clipping,
optimizer_fn=optimizer_fn,
validate_spaces=validate_spaces,
after_init=setup_late_mixins,
make_model_and_action_dist=build_sac_model_and_action_dist,
mixins=[TargetNetworkMixin, ComputeTDErrorMixin],
+3 -2
View File
@@ -68,6 +68,7 @@ class TestSAC(unittest.TestCase):
results = trainer.train()
print(results)
check_compute_single_action(trainer)
trainer.stop()
def test_sac_loss_function(self):
"""Tests SAC loss function results across all frameworks."""
@@ -164,7 +165,7 @@ class TestSAC(unittest.TestCase):
# Set all weights (of all nets) to fixed values.
if weights_dict is None:
assert fw == "tf" # Start with the tf vars-dict.
assert fw in ["tf", "tfe"] # Start with the tf vars-dict.
weights_dict = policy.get_weights()
else:
assert fw == "torch" # Then transfer that to torch Model.
@@ -176,7 +177,7 @@ class TestSAC(unittest.TestCase):
if fw == "tf":
log_alpha = weights_dict["default_policy/log_alpha"]
elif fw == "torch":
# Actually convert to torch tensors.
# Actually convert to torch tensors (by accessing everything).
input_ = policy._lazy_tensor_dict(input_)
input_ = {k: input_[k] for k in input_.keys()}
log_alpha = policy.model.log_alpha.detach().numpy()[0]
+11 -13
View File
@@ -7,7 +7,7 @@ import os
import pickle
import time
import tempfile
from typing import Callable, List, Dict, Union, Any
from typing import Callable, List, Dict, Union
import ray
from ray.exceptions import RayError
@@ -701,9 +701,6 @@ class Trainer(Trainable):
config (dict): The Trainer's config.
num_workers (int): Number of remote rollout workers to create.
0 for local only.
remote_config_updates (Optional[List[dict]]): A list of config
dicts to update `config` with for each Worker (len must be
same as `num_workers`).
Returns:
WorkerSet: The created WorkerSet.
@@ -778,9 +775,9 @@ class Trainer(Trainable):
@PublicAPI
def compute_action(self,
observation: TensorStructType,
state: List[Any] = None,
state: List[TensorStructType] = None,
prev_action: TensorStructType = None,
prev_reward: int = None,
prev_reward: float = None,
info: EnvInfoDict = None,
policy_id: PolicyID = DEFAULT_POLICY_ID,
full_fetch: bool = False,
@@ -791,16 +788,17 @@ class Trainer(Trainable):
self.get_policy(policy_id) and call compute_actions() on it directly.
Arguments:
observation (obj): observation from the environment.
state (list): RNN hidden state, if any. If state is not None,
then all of compute_single_action(...) is returned
observation (TensorStructType): observation from the environment.
state (List[TensorStructType]): RNN hidden state, if any. If state
is not None, then all of compute_single_action(...) is returned
(computed action, rnn state(s), logits dictionary).
Otherwise compute_single_action(...)[0] is returned
(computed action).
prev_action (obj): previous action value, if any
prev_reward (int): previous reward, if any
info (dict): info object, if any
policy_id (str): Policy to query (only applies to multi-agent).
prev_action (TensorStructType): Previous action value, if any.
prev_reward (float): Previous reward, if any.
info (EnvInfoDict): info object, if any
policy_id (PolicyID): Policy to query (only applies to
multi-agent).
full_fetch (bool): Whether to return extra action fetch results.
This is always set to True if RNN state is specified.
explore (bool): Whether to pick an exploitation or exploration
+28 -26
View File
@@ -1,6 +1,6 @@
from typing import Callable, Optional, List, Iterable
import logging
import time
from typing import Callable, Optional, List, Iterable
from ray.rllib.agents.trainer import Trainer, COMMON_CONFIG
from ray.rllib.evaluation.worker_set import WorkerSet
@@ -34,20 +34,21 @@ def default_execution_plan(workers: WorkerSet, config: TrainerConfigDict):
@DeveloperAPI
def build_trainer(
name: str,
default_policy: Optional[Policy],
default_config: TrainerConfigDict = None,
validate_config: Callable[[TrainerConfigDict], None] = None,
get_initial_state=None, # DEPRECATED
get_policy_class: Callable[[TrainerConfigDict], Policy] = None,
before_init: Callable[[Trainer], None] = None,
make_workers=None, # DEPRECATED
make_policy_optimizer=None, # DEPRECATED
after_init: Callable[[Trainer], None] = None,
before_train_step=None, # DEPRECATED
after_optimizer_step=None, # DEPRECATED
after_train_result=None, # DEPRECATED
collect_metrics_fn=None, # DEPRECATED
name: str,
default_policy: Optional[Policy],
*,
default_config: TrainerConfigDict = None,
validate_config: Callable[[TrainerConfigDict], None] = None,
get_initial_state=None, # DEPRECATED
get_policy_class: Callable[[TrainerConfigDict], Policy] = None,
before_init: Callable[[Trainer], None] = None,
make_workers=None, # DEPRECATED
make_policy_optimizer=None, # DEPRECATED
after_init: Callable[[Trainer], None] = None,
before_train_step=None, # DEPRECATED
after_optimizer_step=None, # DEPRECATED
after_train_result=None, # DEPRECATED
collect_metrics_fn=None, # DEPRECATED
before_evaluate_fn: Callable[[Trainer], None] = None,
mixins: List[type] = None,
execution_plan: Callable[[WorkerSet, TrainerConfigDict], Iterable[
@@ -64,19 +65,20 @@ def build_trainer(
default_policy (cls): the default Policy class to use
default_config (dict): The default config dict of the algorithm,
otherwise uses the Trainer default config.
validate_config (func): optional callback that checks a given config
for correctness. It may mutate the config as needed.
get_policy_class (func): optional callback that takes a config and
returns the policy class to override the default with
before_init (func): optional function to run at the start of trainer
init that takes the trainer instance as argument
after_init (func): optional function to run at the end of trainer init
that takes the trainer instance as argument
before_evaluate_fn (func): callback to run before evaluation. This
takes the trainer instance as argument.
validate_config (Optional[callable]): Optional callable that takes the
config to check for correctness. It may mutate the config as
needed.
get_policy_class (Optional[callable]): Optional callable that takes a
config and returns the policy class to override the default with.
before_init (Optional[callable]): Optional callable to run at the start
of trainer init that takes the trainer instance as argument.
after_init (Optional[callable]): Optional callable to run at the end of
trainer init that takes the trainer instance as argument.
before_evaluate_fn (Optional[callable]): callback to run before
evaluation. This takes the trainer instance as argument.
mixins (list): list of any class mixins for the returned trainer class.
These mixins will be applied in order and will have higher
precedence than the Trainer class
precedence than the Trainer class.
execution_plan (func): Setup the distributed execution workflow.
Returns:
@@ -11,29 +11,29 @@ class BinaryAutoregressiveDistribution(ActionDistribution):
"""Action distribution P(a1, a2) = P(a1) * P(a2 | a1)"""
def deterministic_sample(self):
# first, sample a1
# First, sample a1.
a1_dist = self._a1_distribution()
a1 = a1_dist.deterministic_sample()
# sample a2 conditioned on a1
# Sample a2 conditioned on a1.
a2_dist = self._a2_distribution(a1)
a2 = a2_dist.deterministic_sample()
self._action_logp = a1_dist.logp(a1) + a2_dist.logp(a2)
# return the action tuple
# Return the action tuple.
return (a1, a2)
def sample(self):
# first, sample a1
# First, sample a1.
a1_dist = self._a1_distribution()
a1 = a1_dist.sample()
# sample a2 conditioned on a1
# Sample a2 conditioned on a1.
a2_dist = self._a2_distribution(a1)
a2 = a2_dist.sample()
self._action_logp = a1_dist.logp(a1) + a2_dist.logp(a2)
# return the action tuple
# Return the action tuple.
return (a1, a2)
def logp(self, actions):
@@ -81,29 +81,29 @@ class TorchBinaryAutoregressiveDistribution(TorchDistributionWrapper):
"""Action distribution P(a1, a2) = P(a1) * P(a2 | a1)"""
def deterministic_sample(self):
# first, sample a1
# First, sample a1.
a1_dist = self._a1_distribution()
a1 = a1_dist.deterministic_sample()
# sample a2 conditioned on a1
# Sample a2 conditioned on a1.
a2_dist = self._a2_distribution(a1)
a2 = a2_dist.deterministic_sample()
self._action_logp = a1_dist.logp(a1) + a2_dist.logp(a2)
# return the action tuple
# Return the action tuple.
return (a1, a2)
def sample(self):
# first, sample a1
# First, sample a1.
a1_dist = self._a1_distribution()
a1 = a1_dist.sample()
# sample a2 conditioned on a1
# Sample a2 conditioned on a1.
a2_dist = self._a2_distribution(a1)
a2 = a2_dist.sample()
self._action_logp = a1_dist.logp(a1) + a2_dist.logp(a2)
# return the action tuple
# Return the action tuple.
return (a1, a2)
def logp(self, actions):
@@ -56,7 +56,7 @@ class ParametricActionsModel(DistributionalQTFModel):
action_logits = tf.reduce_sum(avail_actions * intent_vector, axis=2)
# Mask out invalid actions (use tf.float32.min for stability)
inf_mask = tf.maximum(tf.log(action_mask), tf.float32.min)
inf_mask = tf.maximum(tf.math.log(action_mask), tf.float32.min)
return action_logits + inf_mask, state
def value_function(self):
+2 -1
View File
@@ -416,7 +416,8 @@ class ModelCatalog:
name, **model_kwargs)
else:
raise NotImplementedError(
"Framework must be 'tf' or 'torch': {}".format(framework))
"`framework` must be 'tf|tfe|torch', but is "
"{}!".format(framework))
@staticmethod
@DeveloperAPI
+1 -2
View File
@@ -1,7 +1,6 @@
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.models.tf.fcnet import FullyConnectedNetwork
from ray.rllib.models.tf.recurrent_net import \
RecurrentNetwork
from ray.rllib.models.tf.recurrent_net import RecurrentNetwork
from ray.rllib.models.tf.visionnet import VisionNetwork
__all__ = [
+6 -2
View File
@@ -1,10 +1,14 @@
from ray.rllib.models.tf.layers.gru_gate import GRUGate
from ray.rllib.models.tf.layers.noisy_layer import NoisyLayer
from ray.rllib.models.tf.layers.relative_multi_head_attention import \
RelativeMultiHeadAttention
from ray.rllib.models.tf.layers.skip_connection import SkipConnection
from ray.rllib.models.tf.layers.multi_head_attention import MultiHeadAttention
__all__ = [
"GRUGate", "RelativeMultiHeadAttention", "SkipConnection",
"MultiHeadAttention"
"GRUGate",
"MultiHeadAttention",
"NoisyLayer",
"RelativeMultiHeadAttention",
"SkipConnection"
]
+105
View File
@@ -0,0 +1,105 @@
import numpy as np
from ray.rllib.utils.framework import get_activation_fn, get_variable, \
try_import_tf
tf = try_import_tf()
class NoisyLayer(tf.keras.layers.Layer):
    r"""A Layer that adds learnable Noise

    a common dense layer: y = w^{T}x + b
    a noisy layer: y = (w + \epsilon_w*\sigma_w)^{T}x +
        (b+\epsilon_b*\sigma_b)
    where \epsilon are random variables sampled from factorized normal
    distributions and \sigma are trainable variables which are expected to
    vanish along the training procedure
    """

    def __init__(self,
                 prefix,
                 out_size,
                 sigma0,
                 activation="relu"):
        """Initializes a NoisyLayer object.

        Args:
            prefix (str): Prefix prepended to the names of the four
                variables created in `build` ("<prefix>_sigma_w",
                "<prefix>_sigma_b", "<prefix>_fc_w", "<prefix>_fc_b").
            out_size (int): Size of the layer's output (second dimension of
                the weight matrix, length of the bias vector).
            sigma0 (float): Scale used to initialize `sigma_b`, which is set
                to `sigma0 / sqrt(in_size)`.
            activation: Activation specifier that is resolved through
                `get_activation_fn(..., framework="tf")` in `call`. If that
                lookup returns None, no activation is applied.
        """
        super().__init__()
        self.prefix = prefix
        self.out_size = out_size
        # TF noise generation can be unreliable on GPU
        # If generating the noise on the CPU,
        # lowering sigma0 to 0.1 may be helpful
        self.sigma0 = sigma0  # 0.5~GPU, 0.1~CPU
        self.activation = activation
        # Variables (created lazily in `build`, once the input size is
        # known).
        self.w = None  # Weight matrix.
        self.b = None  # Biases.
        self.sigma_w = None  # Noise for weight matrix
        self.sigma_b = None  # Noise for biases.

    def build(self, input_shape):
        """Creates the weight, bias and noise-scale variables.

        Args:
            input_shape: Shape of the layer input; `input_shape[1]` is taken
                as the input size.
        """
        in_size = int(input_shape[1])

        # Noise scale for the weights: uniform in +-1/sqrt(in_size).
        self.sigma_w = get_variable(
            value=tf.keras.initializers.RandomUniform(
                minval=-1.0 / np.sqrt(float(in_size)),
                maxval=1.0 / np.sqrt(float(in_size))),
            trainable=True,
            tf_name=self.prefix + "_sigma_w",
            shape=[in_size, self.out_size],
            dtype=tf.float32
        )

        # Noise scale for the biases: constant sigma0/sqrt(in_size).
        self.sigma_b = get_variable(
            value=tf.keras.initializers.Constant(
                self.sigma0 / np.sqrt(float(in_size))),
            trainable=True,
            tf_name=self.prefix + "_sigma_b",
            shape=[self.out_size],
            dtype=tf.float32,
        )

        # Plain dense-layer weight matrix.
        self.w = get_variable(
            value=tf.keras.initializers.GlorotUniform(),
            tf_name=self.prefix + "_fc_w",
            trainable=True,
            shape=[in_size, self.out_size],
            dtype=tf.float32,
        )

        # Plain dense-layer biases.
        self.b = get_variable(
            value=tf.keras.initializers.Zeros(),
            tf_name=self.prefix + "_fc_b",
            trainable=True,
            shape=[self.out_size],
            dtype=tf.float32,
        )

    def call(self, inputs):
        """Applies the noisy dense transformation to `inputs`.

        Args:
            inputs: Input tensor; `inputs.shape[1]` is taken as the input
                size.

        Returns:
            `inputs @ (w + sigma_w * epsilon_w) + b + sigma_b * epsilon_b`,
            passed through the configured activation (if any).
        """
        in_size = int(inputs.shape[1])

        # Factorized noise: one noise vector per input unit and one per
        # output unit, each transformed by `_f_epsilon`.
        epsilon_in = tf.random.normal(shape=[in_size])
        epsilon_out = tf.random.normal(shape=[self.out_size])
        epsilon_in = self._f_epsilon(epsilon_in)
        epsilon_out = self._f_epsilon(epsilon_out)
        # Outer product of the two vectors -> [in_size, out_size] weight
        # noise; the output-side vector doubles as the bias noise.
        epsilon_w = tf.matmul(
            a=tf.expand_dims(epsilon_in, -1), b=tf.expand_dims(epsilon_out, 0))
        epsilon_b = epsilon_out

        action_activation = tf.matmul(
            inputs,
            self.w + self.sigma_w * epsilon_w) + \
            self.b + self.sigma_b * epsilon_b

        fn = get_activation_fn(self.activation, framework="tf")
        if fn is not None:
            action_activation = fn(action_activation)
        return action_activation

    def _f_epsilon(self, x):
        """Returns sign(x) * sqrt(|x|), applied element-wise."""
        return tf.math.sign(x) * tf.math.sqrt(tf.math.abs(x))
+16 -13
View File
@@ -65,22 +65,23 @@ class Categorical(TFActionDistribution):
@override(ActionDistribution)
def entropy(self):
a0 = self.inputs - tf.reduce_max(self.inputs, axis=1, keep_dims=True)
a0 = self.inputs - tf.reduce_max(self.inputs, axis=1, keepdims=True)
ea0 = tf.exp(a0)
z0 = tf.reduce_sum(ea0, axis=1, keep_dims=True)
z0 = tf.reduce_sum(ea0, axis=1, keepdims=True)
p0 = ea0 / z0
return tf.reduce_sum(p0 * (tf.log(z0) - a0), axis=1)
return tf.reduce_sum(p0 * (tf.math.log(z0) - a0), axis=1)
@override(ActionDistribution)
def kl(self, other):
a0 = self.inputs - tf.reduce_max(self.inputs, axis=1, keep_dims=True)
a1 = other.inputs - tf.reduce_max(other.inputs, axis=1, keep_dims=True)
a0 = self.inputs - tf.reduce_max(self.inputs, axis=1, keepdims=True)
a1 = other.inputs - tf.reduce_max(other.inputs, axis=1, keepdims=True)
ea0 = tf.exp(a0)
ea1 = tf.exp(a1)
z0 = tf.reduce_sum(ea0, axis=1, keep_dims=True)
z1 = tf.reduce_sum(ea1, axis=1, keep_dims=True)
z0 = tf.reduce_sum(ea0, axis=1, keepdims=True)
z1 = tf.reduce_sum(ea1, axis=1, keepdims=True)
p0 = ea0 / z0
return tf.reduce_sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), axis=1)
return tf.reduce_sum(
p0 * (a0 - tf.math.log(z0) - a1 + tf.math.log(z1)), axis=1)
@override(TFActionDistribution)
def _build_sample_op(self):
@@ -230,8 +231,9 @@ class DiagGaussian(TFActionDistribution):
@override(ActionDistribution)
def logp(self, x):
return -0.5 * tf.reduce_sum(
tf.square((tf.to_float(x) - self.mean) / self.std), axis=1) - \
0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[1]) - \
tf.math.square((tf.cast(x, tf.float32) - self.mean) / self.std),
axis=1
) - 0.5 * np.log(2.0 * np.pi) * tf.cast(tf.shape(x)[1], tf.float32) - \
tf.reduce_sum(self.log_std, axis=1)
@override(ActionDistribution)
@@ -239,8 +241,9 @@ class DiagGaussian(TFActionDistribution):
assert isinstance(other, DiagGaussian)
return tf.reduce_sum(
other.log_std - self.log_std +
(tf.square(self.std) + tf.square(self.mean - other.mean)) /
(2.0 * tf.square(other.std)) - 0.5,
(tf.math.square(self.std) +
tf.math.square(self.mean - other.mean)) /
(2.0 * tf.math.square(other.std)) - 0.5,
axis=1)
@override(ActionDistribution)
@@ -250,7 +253,7 @@ class DiagGaussian(TFActionDistribution):
@override(TFActionDistribution)
def _build_sample_op(self):
    """Build the sampling op: `mean + std * noise`.

    Returns:
        Tensor with the shape of `self.mean`, where `noise` is drawn from
        `tf.random.normal` with its default parameters.
    """
    # `tf.random.normal` is the current name of the deprecated
    # `tf.random_normal` alias.
    return self.mean + self.std * tf.random.normal(tf.shape(self.mean))
@staticmethod
@override(ActionDistribution)
+4
View File
@@ -174,6 +174,7 @@ def build_eager_tf_policy(name,
grad_stats_fn=None,
extra_learn_fetches_fn=None,
extra_action_fetches_fn=None,
validate_spaces=None,
before_init=None,
before_loss_init=None,
after_init=None,
@@ -208,6 +209,9 @@ def build_eager_tf_policy(name,
if get_default_config:
config = dict(get_default_config(), **config)
if validate_spaces:
validate_spaces(self, observation_space, action_space, config)
if before_init:
before_init(self, observation_space, action_space, config)
+7
View File
@@ -22,6 +22,7 @@ def build_tf_policy(name,
grad_stats_fn=None,
extra_action_fetches_fn=None,
extra_learn_fetches_fn=None,
validate_spaces=None,
before_init=None,
before_loss_init=None,
after_init=None,
@@ -73,6 +74,9 @@ def build_tf_policy(name,
a dict of TF fetches given the policy object
extra_learn_fetches_fn (func): optional function that returns a dict of
extra values to fetch and return when learning on a batch
validate_spaces (Optional[callable]): Optional callable that takes the
Policy, observation_space, action_space, and config to check for
correctness.
before_init (func): optional function to run at the beginning of
policy init that takes the same arguments as the policy constructor
before_loss_init (func): optional function to run prior to loss
@@ -113,6 +117,9 @@ def build_tf_policy(name,
if get_default_config:
config = dict(get_default_config(), **config)
if validate_spaces:
validate_spaces(self, obs_space, action_space, config)
if before_init:
before_init(self, obs_space, action_space, config)
+8 -1
View File
@@ -20,6 +20,7 @@ def build_torch_policy(name,
extra_action_out_fn=None,
extra_grad_process_fn=None,
optimizer_fn=None,
validate_spaces=None,
before_init=None,
after_init=None,
action_sampler_fn=None,
@@ -48,6 +49,9 @@ def build_torch_policy(name,
called after gradients are computed and returns processing info.
optimizer_fn (Optional[callable]): Optional callable that returns a
torch optimizer given the policy and config.
validate_spaces (Optional[callable]): Optional callable that takes the
Policy, observation_space, action_space, and config to check for
correctness.
before_init (Optional[callable]): Optional callable to run at the
beginning of `Policy.__init__` that takes the same arguments as
the Policy constructor.
@@ -94,8 +98,11 @@ def build_torch_policy(name,
config = dict(get_default_config(), **config)
self.config = config
if validate_spaces:
validate_spaces(self, obs_space, action_space, self.config)
if before_init:
before_init(self, obs_space, action_space, config)
before_init(self, obs_space, action_space, self.config)
# Model is customized (use default action dist class).
if make_model:
+2
View File
@@ -23,3 +23,5 @@ if __name__ == "__main__":
# Clean up.
del os.environ["RLLIB_TEST_NO_TF_IMPORT"]
print("ok")
+1 -1
View File
@@ -1,5 +1,4 @@
import numpy as np
import pytest
import time
import gym
import queue
@@ -252,5 +251,6 @@ def test_store_to_replay_actor(ray_start_regular_shared):
if __name__ == "__main__":
    # Imported lazily: only needed when this file is run as a script.
    import pytest
    import sys

    # Run this file's tests verbosely and use pytest's result as the
    # process exit status.
    exit_code = pytest.main(["-v", __file__])
    sys.exit(exit_code)
+38 -21
View File
@@ -158,6 +158,7 @@ class TestRolloutWorker(unittest.TestCase):
self.assertEqual(batch["prev_actions"].tolist(),
to_prev(batch["actions"]))
self.assertGreater(batch["advantages"][0], 1)
ev.stop()
def test_batch_ids(self):
ev = RolloutWorker(
@@ -170,6 +171,7 @@ class TestRolloutWorker(unittest.TestCase):
self.assertEqual(len(set(batch2["unroll_id"])), 1)
self.assertEqual(
len(set(SampleBatch.concat(batch1, batch2)["unroll_id"])), 2)
ev.stop()
def test_global_vars_update(self):
# Allow for Unittest run.
@@ -202,10 +204,9 @@ class TestRolloutWorker(unittest.TestCase):
break
self.assertLess(
result["info"]["learner"]["default_policy"]["cur_lr"], 0.07)
agent.stop()
def test_no_step_on_init(self):
# Allow for Unittest run.
ray.init(num_cpus=5, ignore_reinit_error=True)
register_env("fail", lambda _: FailOnStepEnv())
for fw in framework_iterator(frameworks=()):
pg = PGTrainer(
@@ -214,6 +215,7 @@ class TestRolloutWorker(unittest.TestCase):
"framework": fw,
})
self.assertRaises(Exception, lambda: pg.train())
pg.stop()
def test_callbacks(self):
for fw in framework_iterator(frameworks=("torch", "tf")):
@@ -240,10 +242,9 @@ class TestRolloutWorker(unittest.TestCase):
self.assertGreater(counts["start"], 0)
self.assertGreater(counts["end"], 0)
self.assertGreater(counts["step"], 0)
pg.stop()
def test_query_evaluators(self):
# Allow for Unittest run.
ray.init(num_cpus=5, ignore_reinit_error=True)
register_env("test", lambda _: gym.make("CartPole-v0"))
for fw in framework_iterator(frameworks=("torch", "tf")):
pg = PGTrainer(
@@ -263,6 +264,7 @@ class TestRolloutWorker(unittest.TestCase):
self.assertEqual(results, [10, 10, 10])
self.assertEqual(results2, [(0, 10), (1, 10), (2, 10)])
self.assertEqual(results3, [[1, 1], [1, 1], [1, 1]])
pg.stop()
def test_reward_clipping(self):
# clipping on
@@ -274,6 +276,7 @@ class TestRolloutWorker(unittest.TestCase):
self.assertEqual(max(ev.sample()["rewards"]), 1)
result = collect_metrics(ev, [])
self.assertEqual(result["episode_reward_mean"], 1000)
ev.stop()
# clipping off
ev2 = RolloutWorker(
@@ -284,6 +287,7 @@ class TestRolloutWorker(unittest.TestCase):
self.assertEqual(max(ev2.sample()["rewards"]), 100)
result2 = collect_metrics(ev2, [])
self.assertEqual(result2["episode_reward_mean"], 1000)
ev2.stop()
def test_hard_horizon(self):
ev = RolloutWorker(
@@ -302,6 +306,7 @@ class TestRolloutWorker(unittest.TestCase):
self.assertEqual(np.argmax(samples["obs"][4]), 0)
# 3 done values.
self.assertEqual(sum(samples["dones"]), 3)
ev.stop()
# A gym env's max_episode_steps is smaller than Trainer's horizon.
ev = RolloutWorker(
@@ -322,6 +327,7 @@ class TestRolloutWorker(unittest.TestCase):
False, False, False, False, False, True, False, False, False,
False, False, True
])
ev.stop()
def test_soft_horizon(self):
ev = RolloutWorker(
@@ -336,10 +342,9 @@ class TestRolloutWorker(unittest.TestCase):
self.assertEqual(len(set(samples["eps_id"])), 3)
# only 1 hard done value
self.assertEqual(sum(samples["dones"]), 1)
ev.stop()
def test_metrics(self):
# Allow for Unittest run.
ray.init(num_cpus=5, ignore_reinit_error=True)
ev = RolloutWorker(
env_creator=lambda _: MockEnv(episode_length=10),
policy=MockPolicy,
@@ -353,6 +358,7 @@ class TestRolloutWorker(unittest.TestCase):
result = collect_metrics(ev, [remote_ev])
self.assertEqual(result["episodes_this_iter"], 20)
self.assertEqual(result["episode_reward_mean"], 10)
ev.stop()
def test_async(self):
ev = RolloutWorker(
@@ -363,6 +369,7 @@ class TestRolloutWorker(unittest.TestCase):
for key in ["obs", "actions", "rewards", "dones", "advantages"]:
self.assertIn(key, batch)
self.assertGreater(batch["advantages"][0], 1)
ev.stop()
def test_auto_vectorization(self):
ev = RolloutWorker(
@@ -386,6 +393,7 @@ class TestRolloutWorker(unittest.TestCase):
self.assertEqual(env.unwrapped.config.worker_index, 0)
indices.append(env.unwrapped.config.vector_index)
self.assertEqual(indices, [0, 1, 2, 3, 4, 5, 6, 7])
ev.stop()
def test_batches_larger_when_vectorized(self):
ev = RolloutWorker(
@@ -401,6 +409,7 @@ class TestRolloutWorker(unittest.TestCase):
batch = ev.sample()
result = collect_metrics(ev, [])
self.assertEqual(result["episodes_this_iter"], 4)
ev.stop()
def test_vector_env_support(self):
ev = RolloutWorker(
@@ -418,6 +427,7 @@ class TestRolloutWorker(unittest.TestCase):
self.assertEqual(batch.count, 10)
result = collect_metrics(ev, [])
self.assertEqual(result["episodes_this_iter"], 8)
ev.stop()
def test_truncate_episodes(self):
ev = RolloutWorker(
@@ -427,6 +437,7 @@ class TestRolloutWorker(unittest.TestCase):
batch_mode="truncate_episodes")
batch = ev.sample()
self.assertEqual(batch.count, 15)
ev.stop()
def test_complete_episodes(self):
ev = RolloutWorker(
@@ -436,6 +447,7 @@ class TestRolloutWorker(unittest.TestCase):
batch_mode="complete_episodes")
batch = ev.sample()
self.assertEqual(batch.count, 10)
ev.stop()
def test_complete_episodes_packing(self):
ev = RolloutWorker(
@@ -448,6 +460,7 @@ class TestRolloutWorker(unittest.TestCase):
self.assertEqual(
batch["t"].tolist(),
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
ev.stop()
def test_filter_sync(self):
ev = RolloutWorker(
@@ -461,6 +474,7 @@ class TestRolloutWorker(unittest.TestCase):
obs_f = filters[DEFAULT_POLICY_ID]
self.assertNotEqual(obs_f.rs.n, 0)
self.assertNotEqual(obs_f.buffer.n, 0)
ev.stop()
def test_get_filters(self):
ev = RolloutWorker(
@@ -476,6 +490,7 @@ class TestRolloutWorker(unittest.TestCase):
obs_f2 = filters2[DEFAULT_POLICY_ID]
self.assertGreaterEqual(obs_f2.rs.n, obs_f.rs.n)
self.assertGreaterEqual(obs_f2.buffer.n, obs_f.buffer.n)
ev.stop()
def test_sync_filter(self):
ev = RolloutWorker(
@@ -498,6 +513,23 @@ class TestRolloutWorker(unittest.TestCase):
obs_f = filters[DEFAULT_POLICY_ID]
self.assertGreaterEqual(obs_f.rs.n, 100)
self.assertLessEqual(obs_f.buffer.n, 20)
ev.stop()
def test_extra_python_envs(self):
    """Check that `extra_python_environs` end up in `os.environ`.

    The keys must be absent before the RolloutWorker is constructed and
    present afterwards.
    """
    extra_envs = {"env_key_1": "env_value_1", "env_key_2": "env_value_2"}
    for key in extra_envs:
        self.assertNotIn(key, os.environ)

    ev = None
    try:
        ev = RolloutWorker(
            env_creator=lambda _: MockEnv(10),
            policy=MockPolicy,
            extra_python_environs=extra_envs)
        for key in extra_envs:
            self.assertIn(key, os.environ)
    finally:
        # Clean up even when an assertion above fails, so that neither
        # the worker nor the env vars leak into subsequent tests.
        if ev is not None:
            ev.stop()
        # Reset to original; `pop` tolerates keys that were never set.
        for key in extra_envs:
            os.environ.pop(key, None)
def sample_and_flush(self, ev):
time.sleep(2)
@@ -508,21 +540,6 @@ class TestRolloutWorker(unittest.TestCase):
self.assertNotEqual(obs_f.buffer.n, 0)
return obs_f
def test_extra_python_envs(self):
    """Check that `extra_python_environs` end up in `os.environ`.

    The keys must be absent before the RolloutWorker is constructed and
    present afterwards.
    """
    extra_envs = {"env_key_1": "env_value_1", "env_key_2": "env_value_2"}
    for key in extra_envs:
        self.assertNotIn(key, os.environ)

    # Keep a handle on the worker so that it can be stopped afterwards.
    ev = None
    try:
        ev = RolloutWorker(
            env_creator=lambda _: MockEnv(10),
            policy=MockPolicy,
            extra_python_environs=extra_envs)
        for key in extra_envs:
            self.assertIn(key, os.environ)
    finally:
        # Clean up even when an assertion above fails, so that neither
        # the worker nor the env vars leak into subsequent tests.
        if ev is not None:
            ev.stop()
        # Reset to original; `pop` tolerates keys that were never set.
        for key in extra_envs:
            os.environ.pop(key, None)
if __name__ == "__main__":
import pytest
+1 -1
View File
@@ -305,7 +305,7 @@ class ParameterNoise(Exploration):
added_noises.append(
tf.assign(
noise,
tf.random_normal(
tf.random.normal(
shape=noise.shape,
stddev=self.stddev,
dtype=tf.float32)))
+1 -1
View File
@@ -13,7 +13,7 @@ def huber_loss(x, delta=1.0):
"""Reference: https://en.wikipedia.org/wiki/Huber_loss"""
return tf.where(
tf.abs(x) < delta,
tf.square(x) * 0.5, delta * (tf.abs(x) - 0.5 * delta))
tf.math.square(x) * 0.5, delta * (tf.abs(x) - 0.5 * delta))
def reduce_mean_ignore_inf(x, axis):
+4 -1
View File
@@ -1,4 +1,4 @@
from typing import Any, Dict, Union, Tuple
from typing import Any, Dict, List, Tuple, Union
import gym
# Represents a fully filled out config of a Trainer class.
@@ -77,3 +77,6 @@ TensorType = Any
# Either a plain tensor, or a dict or tuple of tensors (or StructTensors).
TensorStructType = Union[TensorType, dict, tuple]
# A shape of a tensor: a tuple or list of ints of any length.
# NOTE: `Tuple[int, ...]` is the variable-length form; a bare `Tuple[int]`
# only describes tuples with exactly one element.
TensorShape = Union[Tuple[int, ...], List[int]]