diff --git a/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py b/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py index 20f19c6fd..738c4e9ac 100644 --- a/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py +++ b/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py @@ -89,8 +89,10 @@ class ActionNetwork(object): exploration_value = tf.assign_add( exploration_sample, theta * (.0 - exploration_sample) + sigma * normal_sample) - stochastic_actions = deterministic_actions + eps * ( - high_action - low_action) * exploration_value + stochastic_actions = tf.clip_by_value( + deterministic_actions + + eps * (high_action - low_action) * exploration_value, + low_action, high_action) self.actions = tf.cond(stochastic, lambda: stochastic_actions, lambda: deterministic_actions) diff --git a/python/ray/rllib/agents/pg/pg_policy_graph.py b/python/ray/rllib/agents/pg/pg_policy_graph.py index 178cf29e6..8cbb3a588 100644 --- a/python/ray/rllib/agents/pg/pg_policy_graph.py +++ b/python/ray/rllib/agents/pg/pg_policy_graph.py @@ -78,7 +78,7 @@ class PGPolicyGraph(TFPolicyGraph): sample_batch, other_agent_batches=None, episode=None): - # This ads the "advantages" column to the sample batch + # This adds the "advantages" column to the sample batch return compute_advantages( sample_batch, 0.0, self.config["gamma"], use_gae=False) diff --git a/python/ray/rllib/models/action_dist.py b/python/ray/rllib/models/action_dist.py index 91b8d2fce..75a43deeb 100644 --- a/python/ray/rllib/models/action_dist.py +++ b/python/ray/rllib/models/action_dist.py @@ -102,9 +102,9 @@ class DiagGaussian(ActionDistribution): self.low = low self.high = high - # Squash to range if specified. - # TODO(ekl) might make sense to use a beta distribution instead: - # http://proceedings.mlr.press/v70/chou17a/chou17a.pdf + # Squash to range if specified. We use a sigmoid here to avoid the + # mean drifting too far past the bounds and causing nan outputs. 
+ # https://github.com/ray-project/ray/issues/1862 if low is not None: self.mean = low + tf.sigmoid(self.mean) * (high - low) diff --git a/python/ray/rllib/test/test_supported_spaces.py b/python/ray/rllib/test/test_supported_spaces.py index 9f9575200..b98a006bc 100644 --- a/python/ray/rllib/test/test_supported_spaces.py +++ b/python/ray/rllib/test/test_supported_spaces.py @@ -112,7 +112,13 @@ class ModelSupportedSpaces(unittest.TestCase): def testAll(self): stats = {} check_support("IMPALA", {"num_gpus": 0}, stats) - check_support("DDPG", {"timesteps_per_iteration": 1}, stats) + check_support( + "DDPG", { + "noise_scale": 100.0, + "timesteps_per_iteration": 1 + }, + stats, + check_bounds=True) check_support("DQN", {"timesteps_per_iteration": 1}, stats) check_support("A3C", { "num_workers": 1,