[rllib] Use SGD optimizer for ARS (#2916)

This commit is contained in:
eugenevinitsky
2018-09-26 22:32:26 -07:00
committed by Eric Liang
parent 1d9652abf1
commit 1943ae44da
6 changed files with 48 additions and 36 deletions
+20 -16
View File
@@ -28,13 +28,14 @@ Result = namedtuple("Result", [
DEFAULT_CONFIG = with_common_config({
'noise_stdev': 0.02, # std deviation of parameter noise
'num_deltas': 32, # number of perturbations to try
'deltas_used': 32, # number of perturbations to keep in gradient estimate
'num_rollouts': 32, # number of perturbations to try
'rollouts_used': 32, # number of perturbations to keep in gradient estimate
'num_workers': 2,
'stepsize': 0.01, # sgd step-size
'sgd_stepsize': 0.01, # sgd step-size
'observation_filter': "MeanStdFilter",
'noise_size': 250000000,
'eval_prob': 0.03, # probability of evaluating the parameter rewards
'report_length': 10, # how many of the last rewards we average over
'env_config': {},
'offset': 0,
'policy_type': "LinearPolicy", # ["LinearPolicy", "MLPPolicy"]
@@ -180,10 +181,12 @@ class ARSAgent(Agent):
self.sess, env.action_space, preprocessor,
self.config["observation_filter"],
self.config["fcnet_hiddens"], **policy_params)
self.optimizer = optimizers.Adam(self.policy, self.config["stepsize"])
self.optimizer = optimizers.SGD(self.policy,
self.config["sgd_stepsize"])
self.deltas_used = self.config["deltas_used"]
self.num_deltas = self.config["num_deltas"]
self.rollouts_used = self.config["rollouts_used"]
self.num_rollouts = self.config["num_rollouts"]
self.report_length = self.config["report_length"]
# Create the shared noise table.
print("Creating shared noise table.")
@@ -199,6 +202,7 @@ class ARSAgent(Agent):
self.episodes_so_far = 0
self.timesteps_so_far = 0
self.reward_list = []
self.tstart = time.time()
def _collect_results(self, theta_id, min_episodes):
@@ -233,7 +237,7 @@ class ARSAgent(Agent):
# Use the actors to do rollouts. Note that we pass in the ID of the
# policy weights.
results, num_episodes, num_timesteps = self._collect_results(
theta_id, config["num_deltas"])
theta_id, config["num_rollouts"])
all_noise_indices = []
all_training_returns = []
@@ -265,12 +269,12 @@ class ARSAgent(Agent):
noisy_lengths = np.array(all_training_lengths)
# keep only the best returns
# select top performing directions if deltas_used < num_deltas
# select top performing directions if rollouts_used < num_rollouts
max_rewards = np.max(noisy_returns, axis=1)
if self.deltas_used > self.num_deltas:
self.deltas_used = self.num_deltas
if self.rollouts_used > self.num_rollouts:
self.rollouts_used = self.num_rollouts
percentile = 100 * (1 - (self.deltas_used / self.num_deltas))
percentile = 100 * (1 - (self.rollouts_used / self.num_rollouts))
idx = np.arange(max_rewards.size)[
max_rewards >= np.percentile(max_rewards, percentile)]
noise_idx = noise_indices[idx]
@@ -293,11 +297,11 @@ class ARSAgent(Agent):
theta, update_ratio = self.optimizer.update(-g)
# Set the new weights in the local copy of the policy.
self.policy.set_weights(theta)
# update the reward list
if len(all_eval_returns) > 0:
self.reward_list.append(eval_returns.mean())
step_tend = time.time()
tlogger.record_tabular("EvalEpRewMean", eval_returns.mean())
tlogger.record_tabular("EvalEpRewStd", eval_returns.std())
tlogger.record_tabular("EvalEpLenMean", eval_lengths.mean())
tlogger.record_tabular("NoisyEpRewMean", noisy_returns.mean())
tlogger.record_tabular("NoisyEpRewStd", noisy_returns.std())
@@ -319,9 +323,9 @@ class ARSAgent(Agent):
"time_elapsed_this_iter": step_tend - step_tstart,
"time_elapsed": step_tend - self.tstart
}
result = dict(
episode_reward_mean=eval_returns.mean(),
episode_reward_mean=np.mean(
self.reward_list[-self.report_length:]),
episode_len_mean=eval_lengths.mean(),
timesteps_this_iter=noisy_lengths.sum(),
info=info)
+9 -8
View File
@@ -9,15 +9,15 @@ import numpy as np
class Optimizer(object):
def __init__(self, pi):
self.pi = pi
self.dim = pi.num_params
def __init__(self, policy):
self.policy = policy
self.dim = policy.num_params
self.t = 0
def update(self, globalg):
self.t += 1
step = self._compute_step(globalg)
theta = self.pi.get_weights()
theta = self.policy.get_weights()
ratio = np.linalg.norm(step) / np.linalg.norm(theta)
return theta + step, ratio
@@ -26,8 +26,8 @@ class Optimizer(object):
class SGD(Optimizer):
def __init__(self, pi, stepsize, momentum=0.9):
Optimizer.__init__(self, pi)
def __init__(self, policy, stepsize, momentum=0.0):
Optimizer.__init__(self, policy)
self.v = np.zeros(self.dim, dtype=np.float32)
self.stepsize, self.momentum = stepsize, momentum
@@ -38,8 +38,9 @@ class SGD(Optimizer):
class Adam(Optimizer):
def __init__(self, pi, stepsize, beta1=0.9, beta2=0.999, epsilon=1e-08):
Optimizer.__init__(self, pi)
def __init__(self, policy, stepsize, beta1=0.9, beta2=0.999,
epsilon=1e-08):
Optimizer.__init__(self, policy)
self.stepsize = stepsize
self.beta1 = beta1
self.beta2 = beta2
+8 -2
View File
@@ -37,6 +37,7 @@ DEFAULT_CONFIG = {
"stepsize": 0.01,
"observation_filter": "MeanStdFilter",
"noise_size": 250000000,
"report_length": 10,
"env": None,
"env_config": {},
}
@@ -164,6 +165,7 @@ class ESAgent(Agent):
self.sess, env.action_space, preprocessor,
self.config["observation_filter"], **policy_params)
self.optimizer = optimizers.Adam(self.policy, self.config["stepsize"])
self.report_length = self.config["report_length"]
# Create the shared noise table.
print("Creating shared noise table.")
@@ -179,6 +181,7 @@ class ESAgent(Agent):
self.episodes_so_far = 0
self.timesteps_so_far = 0
self.reward_list = []
self.tstart = time.time()
def _collect_results(self, theta_id, min_episodes, min_timesteps):
@@ -264,9 +267,11 @@ class ESAgent(Agent):
config["l2_coeff"] * theta)
# Set the new weights in the local copy of the policy.
self.policy.set_weights(theta)
# Store the rewards
if len(all_eval_returns) > 0:
self.reward_list.append(np.mean(eval_returns))
step_tend = time.time()
tlogger.record_tabular("EvalEpRewMean", eval_returns.mean())
tlogger.record_tabular("EvalEpRewStd", eval_returns.std())
tlogger.record_tabular("EvalEpLenMean", eval_lengths.mean())
@@ -299,8 +304,9 @@ class ESAgent(Agent):
"time_elapsed": step_tend - self.tstart
}
reward_mean = np.mean(self.reward_list[-self.report_length:])
result = dict(
episode_reward_mean=eval_returns.mean(),
episode_reward_mean=reward_mean,
episode_len_mean=eval_lengths.mean(),
timesteps_this_iter=noisy_lengths.sum(),
info=info)
@@ -122,8 +122,8 @@ class ModelSupportedSpaces(unittest.TestCase):
"ARS", {
"num_workers": 1,
"noise_size": 10000000,
"num_deltas": 1,
"deltas_used": 1
"num_rollouts": 1,
"rollouts_used": 1
}, stats)
check_support("PG", {"num_workers": 1, "optimizer": {}}, stats)
num_unexpected_errors = 0
@@ -6,10 +6,10 @@ cartpole-ars:
time_total_s: 600
config:
noise_stdev: 0.02
num_deltas: 50
deltas_used: 25
num_rollouts: 50
rollouts_used: 25
num_workers: 2
stepsize: 0.01
sgd_stepsize: 0.01
noise_size: 250000000
eval_prob: 0.5
policy_type: MLPPolicy
@@ -1,15 +1,16 @@
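# can expect improvement to -140 reward in ~300-500k timesteps
# NOTE(review): this target appears to predate the rename from pendulum-ars
# to swimmer-ars; confirm it still applies to Swimmer-v2.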
pendulum-ars:
swimmer-ars:
env: Swimmer-v2
run: ARS
config:
noise_stdev: 0.01
num_deltas: 2
deltas_used: 1
num_rollouts: 1
rollouts_used: 1
num_workers: 1
stepsize: 0.02
sgd_stepsize: 0.02
noise_size: 250000000
fcnet_hiddens: [32,32]
policy_type: LinearPolicy
eval_prob: 0.2
offset: 0
observation_filter: NoFilter
report_length: 3