[RLlib] Fix erroneous use of LinearSchedule (in DDPG's exploration annealing). (#7125)

* Fix erroneous use of LinearSchedule (in DDPG's exploration annealing).
Erase schedules_obsoleted.py.

* Trigger re-test.

* Re-test.
This commit is contained in:
Sven Mika
2020-02-13 08:46:49 +01:00
committed by GitHub
parent 9fc3e2e50f
commit 5518a738b3
3 changed files with 7 additions and 112 deletions
+1 -1
View File
@@ -380,7 +380,6 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
/ray/ci/suppress_output python /ray/rllib/tests/multiagent_pendulum.py
docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
/ray/ci/suppress_output python /ray/rllib/examples/multiagent_cartpole.py --num-iters=2
@@ -493,3 +492,4 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
/ray/ci/suppress_output python /ray/rllib/examples/random_env.py
+6 -6
View File
@@ -1,7 +1,7 @@
from ray.rllib.agents.trainer import with_common_config
from ray.rllib.agents.dqn.dqn import GenericOffPolicyTrainer
from ray.rllib.agents.ddpg.ddpg_policy import DDPGTFPolicy
from ray.rllib.utils.schedules import ConstantSchedule, LinearSchedule
from ray.rllib.utils.schedules import ConstantSchedule, PiecewiseSchedule
# yapf: disable
# __sphinx_doc_begin__
@@ -176,11 +176,11 @@ def make_exploration_schedule(config, worker_index):
# run properly
return ConstantSchedule(0.0)
elif config["exploration_should_anneal"]:
return LinearSchedule(
schedule_timesteps=int(config["exploration_fraction"] *
config["schedule_max_timesteps"]),
initial_p=1.0,
final_p=config["exploration_final_scale"])
return PiecewiseSchedule(
endpoints=[(0, 1.0), (int(config["exploration_fraction"] *
config["schedule_max_timesteps"]),
config["exploration_final_scale"])],
outside_value=config["exploration_final_scale"])
else:
# *always* add exploration noise
return ConstantSchedule(1.0)
-105
View File
@@ -1,105 +0,0 @@
"""This file is used for specifying various schedules that evolve over
time throughout the execution of the algorithm, such as:
- learning rate for the optimizer
- exploration epsilon for the epsilon greedy exploration strategy
- beta parameter for prioritized replay
Each schedule has a function `value(t)` which returns the current value
of the parameter given the timestep t of the optimization procedure.
"""
class Schedule:
    """Interface for a parameter that changes with the timestep."""

    def value(self, t):
        """Return the value of the schedule at timestep `t`.

        This base implementation always raises NotImplementedError.
        """
        raise NotImplementedError
class ConstantSchedule:
    """Schedule whose output is the same at every timestep."""

    def __init__(self, value):
        """Store the fixed output of the schedule.

        Parameters
        ----------
        value: float
            The value returned by `value(t)` for every `t`.
        """
        self._v = value

    def value(self, t):
        """Return the stored constant; the timestep `t` is ignored."""
        return self._v
def linear_interpolation(l, r, alpha):
    """Blend `l` and `r` linearly.

    Returns `l` moved towards `r` by the fraction `alpha` of their
    difference, i.e. `l + alpha * (r - l)`.
    """
    span = r - l
    return l + alpha * span
class PiecewiseSchedule:
    """Schedule that interpolates between consecutive `(time, value)` pairs."""

    def __init__(self,
                 endpoints,
                 interpolation=linear_interpolation,
                 outside_value=None):
        """Piecewise schedule.

        Parameters
        ----------
        endpoints: [(int, int)]
            list of pairs `(time, value)` meaning that the schedule should
            output `value` when `t == time`. The times must be sorted in
            increasing order. When t is between two times, e.g.
            `(time_a, value_a)` and `(time_b, value_b)`, such that
            `time_a <= t < time_b`, then `value(t)` outputs
            `interpolation(value_a, value_b, alpha)` where alpha is the
            fraction of time passed between `time_a` and `time_b` at `t`.
            The intervals are half-open, so a `t` equal to the time of the
            last endpoint is treated as lying outside of all intervals.
        interpolation: lambda float, float, float: float
            a function that takes the value to the left and to the right of
            t according to `endpoints`, plus alpha, the fraction of the
            distance from the left endpoint to the right endpoint that t
            has covered. See linear_interpolation for an example.
        outside_value: float
            if the value is requested outside of all the intervals specified
            in `endpoints` this value is returned. If None then
            AssertionError is raised when an outside value is requested.

        Raises
        ------
        AssertionError
            If the endpoint times are not sorted.
        """
        # Snapshot the pairs so that any iterable is accepted and later
        # mutation of the caller's list cannot alter the schedule.
        endpoints = list(endpoints)
        times = [point[0] for point in endpoints]
        # Raised explicitly rather than via `assert` so that the check is
        # still performed when Python runs with -O.
        if times != sorted(times):
            raise AssertionError(
                "endpoint times must be sorted in increasing order")
        self._interpolation = interpolation
        self._outside_value = outside_value
        self._endpoints = endpoints

    def value(self, t):
        """Return the interpolated value at timestep `t`.

        Falls back to `outside_value` when `t` is not inside any interval;
        raises AssertionError in that case if no `outside_value` was given.
        """
        segments = zip(self._endpoints[:-1], self._endpoints[1:])
        for (l_t, left_value), (r_t, right_value) in segments:
            if l_t <= t < r_t:
                alpha = float(t - l_t) / (r_t - l_t)
                return self._interpolation(left_value, right_value, alpha)
        # t does not belong to any of the pieces. Keep the documented
        # AssertionError, but raise it explicitly so it survives -O.
        if self._outside_value is None:
            raise AssertionError(
                "t is outside of all intervals and no outside_value is set")
        return self._outside_value
class LinearSchedule:
    """Schedule that moves linearly from `initial_p` to `final_p`."""

    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        """Set up a linear anneal from `initial_p` to `final_p`.

        Once `schedule_timesteps` timesteps have passed, `final_p` is
        returned.

        Parameters
        ----------
        schedule_timesteps: int
            Number of timesteps over which initial_p is linearly annealed
            to final_p
        final_p: float
            final output value
        initial_p: float
            initial output value
        """
        self.initial_p = initial_p
        self.final_p = final_p
        self.schedule_timesteps = schedule_timesteps

    def value(self, t):
        """Return the annealed value at timestep `t`."""
        # A horizon below 1 is treated as 1, which also avoids dividing
        # by zero.
        horizon = max(1, self.schedule_timesteps)
        progress = float(t) / horizon
        # Cap the progress so the output stays at final_p after the horizon.
        if progress > 1.0:
            progress = 1.0
        return self.initial_p + progress * (self.final_p - self.initial_p)