mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 01:46:10 +08:00
[rllib] Use SGD optimizer for ARS (#2916)
This commit is contained in:
committed by
Eric Liang
parent
1d9652abf1
commit
1943ae44da
@@ -28,13 +28,14 @@ Result = namedtuple("Result", [
|
||||
|
||||
DEFAULT_CONFIG = with_common_config({
|
||||
'noise_stdev': 0.02, # std deviation of parameter noise
|
||||
'num_deltas': 32, # number of perturbations to try
|
||||
'deltas_used': 32, # number of perturbations to keep in gradient estimate
|
||||
'num_rollouts': 32, # number of perturbs to try
|
||||
'rollouts_used': 32, # number of perturbs to keep in gradient estimate
|
||||
'num_workers': 2,
|
||||
'stepsize': 0.01, # sgd step-size
|
||||
'sgd_stepsize': 0.01, # sgd step-size
|
||||
'observation_filter': "MeanStdFilter",
|
||||
'noise_size': 250000000,
|
||||
'eval_prob': 0.03, # probability of evaluating the parameter rewards
|
||||
'report_length': 10, # how many of the last rewards we average over
|
||||
'env_config': {},
|
||||
'offset': 0,
|
||||
'policy_type': "LinearPolicy", # ["LinearPolicy", "MLPPolicy"]
|
||||
@@ -180,10 +181,12 @@ class ARSAgent(Agent):
|
||||
self.sess, env.action_space, preprocessor,
|
||||
self.config["observation_filter"],
|
||||
self.config["fcnet_hiddens"], **policy_params)
|
||||
self.optimizer = optimizers.Adam(self.policy, self.config["stepsize"])
|
||||
self.optimizer = optimizers.SGD(self.policy,
|
||||
self.config["sgd_stepsize"])
|
||||
|
||||
self.deltas_used = self.config["deltas_used"]
|
||||
self.num_deltas = self.config["num_deltas"]
|
||||
self.rollouts_used = self.config["rollouts_used"]
|
||||
self.num_rollouts = self.config["num_rollouts"]
|
||||
self.report_length = self.config["report_length"]
|
||||
|
||||
# Create the shared noise table.
|
||||
print("Creating shared noise table.")
|
||||
@@ -199,6 +202,7 @@ class ARSAgent(Agent):
|
||||
|
||||
self.episodes_so_far = 0
|
||||
self.timesteps_so_far = 0
|
||||
self.reward_list = []
|
||||
self.tstart = time.time()
|
||||
|
||||
def _collect_results(self, theta_id, min_episodes):
|
||||
@@ -233,7 +237,7 @@ class ARSAgent(Agent):
|
||||
# Use the actors to do rollouts, note that we pass in the ID of the
|
||||
# policy weights.
|
||||
results, num_episodes, num_timesteps = self._collect_results(
|
||||
theta_id, config["num_deltas"])
|
||||
theta_id, config["num_rollouts"])
|
||||
|
||||
all_noise_indices = []
|
||||
all_training_returns = []
|
||||
@@ -265,12 +269,12 @@ class ARSAgent(Agent):
|
||||
noisy_lengths = np.array(all_training_lengths)
|
||||
|
||||
# keep only the best returns
|
||||
# select top performing directions if deltas_used < num_deltas
|
||||
# select top performing directions if rollouts_used < num_rollouts
|
||||
max_rewards = np.max(noisy_returns, axis=1)
|
||||
if self.deltas_used > self.num_deltas:
|
||||
self.deltas_used = self.num_deltas
|
||||
if self.rollouts_used > self.num_rollouts:
|
||||
self.rollouts_used = self.num_rollouts
|
||||
|
||||
percentile = 100 * (1 - (self.deltas_used / self.num_deltas))
|
||||
percentile = 100 * (1 - (self.rollouts_used / self.num_rollouts))
|
||||
idx = np.arange(max_rewards.size)[
|
||||
max_rewards >= np.percentile(max_rewards, percentile)]
|
||||
noise_idx = noise_indices[idx]
|
||||
@@ -293,11 +297,11 @@ class ARSAgent(Agent):
|
||||
theta, update_ratio = self.optimizer.update(-g)
|
||||
# Set the new weights in the local copy of the policy.
|
||||
self.policy.set_weights(theta)
|
||||
# update the reward list
|
||||
if len(all_eval_returns) > 0:
|
||||
self.reward_list.append(eval_returns.mean())
|
||||
|
||||
step_tend = time.time()
|
||||
tlogger.record_tabular("EvalEpRewMean", eval_returns.mean())
|
||||
tlogger.record_tabular("EvalEpRewStd", eval_returns.std())
|
||||
tlogger.record_tabular("EvalEpLenMean", eval_lengths.mean())
|
||||
|
||||
tlogger.record_tabular("NoisyEpRewMean", noisy_returns.mean())
|
||||
tlogger.record_tabular("NoisyEpRewStd", noisy_returns.std())
|
||||
@@ -319,9 +323,9 @@ class ARSAgent(Agent):
|
||||
"time_elapsed_this_iter": step_tend - step_tstart,
|
||||
"time_elapsed": step_tend - self.tstart
|
||||
}
|
||||
|
||||
result = dict(
|
||||
episode_reward_mean=eval_returns.mean(),
|
||||
episode_reward_mean=np.mean(
|
||||
self.reward_list[-self.report_length:]),
|
||||
episode_len_mean=eval_lengths.mean(),
|
||||
timesteps_this_iter=noisy_lengths.sum(),
|
||||
info=info)
|
||||
|
||||
@@ -9,15 +9,15 @@ import numpy as np
|
||||
|
||||
|
||||
class Optimizer(object):
|
||||
def __init__(self, pi):
|
||||
self.pi = pi
|
||||
self.dim = pi.num_params
|
||||
def __init__(self, policy):
|
||||
self.policy = policy
|
||||
self.dim = policy.num_params
|
||||
self.t = 0
|
||||
|
||||
def update(self, globalg):
|
||||
self.t += 1
|
||||
step = self._compute_step(globalg)
|
||||
theta = self.pi.get_weights()
|
||||
theta = self.policy.get_weights()
|
||||
ratio = np.linalg.norm(step) / np.linalg.norm(theta)
|
||||
return theta + step, ratio
|
||||
|
||||
@@ -26,8 +26,8 @@ class Optimizer(object):
|
||||
|
||||
|
||||
class SGD(Optimizer):
|
||||
def __init__(self, pi, stepsize, momentum=0.9):
|
||||
Optimizer.__init__(self, pi)
|
||||
def __init__(self, policy, stepsize, momentum=0.0):
|
||||
Optimizer.__init__(self, policy)
|
||||
self.v = np.zeros(self.dim, dtype=np.float32)
|
||||
self.stepsize, self.momentum = stepsize, momentum
|
||||
|
||||
@@ -38,8 +38,9 @@ class SGD(Optimizer):
|
||||
|
||||
|
||||
class Adam(Optimizer):
|
||||
def __init__(self, pi, stepsize, beta1=0.9, beta2=0.999, epsilon=1e-08):
|
||||
Optimizer.__init__(self, pi)
|
||||
def __init__(self, policy, stepsize, beta1=0.9, beta2=0.999,
|
||||
epsilon=1e-08):
|
||||
Optimizer.__init__(self, policy)
|
||||
self.stepsize = stepsize
|
||||
self.beta1 = beta1
|
||||
self.beta2 = beta2
|
||||
|
||||
@@ -37,6 +37,7 @@ DEFAULT_CONFIG = {
|
||||
"stepsize": 0.01,
|
||||
"observation_filter": "MeanStdFilter",
|
||||
"noise_size": 250000000,
|
||||
"report_length": 10,
|
||||
"env": None,
|
||||
"env_config": {},
|
||||
}
|
||||
@@ -164,6 +165,7 @@ class ESAgent(Agent):
|
||||
self.sess, env.action_space, preprocessor,
|
||||
self.config["observation_filter"], **policy_params)
|
||||
self.optimizer = optimizers.Adam(self.policy, self.config["stepsize"])
|
||||
self.report_length = self.config["report_length"]
|
||||
|
||||
# Create the shared noise table.
|
||||
print("Creating shared noise table.")
|
||||
@@ -179,6 +181,7 @@ class ESAgent(Agent):
|
||||
|
||||
self.episodes_so_far = 0
|
||||
self.timesteps_so_far = 0
|
||||
self.reward_list = []
|
||||
self.tstart = time.time()
|
||||
|
||||
def _collect_results(self, theta_id, min_episodes, min_timesteps):
|
||||
@@ -264,9 +267,11 @@ class ESAgent(Agent):
|
||||
config["l2_coeff"] * theta)
|
||||
# Set the new weights in the local copy of the policy.
|
||||
self.policy.set_weights(theta)
|
||||
# Store the rewards
|
||||
if len(all_eval_returns) > 0:
|
||||
self.reward_list.append(np.mean(eval_returns))
|
||||
|
||||
step_tend = time.time()
|
||||
tlogger.record_tabular("EvalEpRewMean", eval_returns.mean())
|
||||
tlogger.record_tabular("EvalEpRewStd", eval_returns.std())
|
||||
tlogger.record_tabular("EvalEpLenMean", eval_lengths.mean())
|
||||
|
||||
@@ -299,8 +304,9 @@ class ESAgent(Agent):
|
||||
"time_elapsed": step_tend - self.tstart
|
||||
}
|
||||
|
||||
reward_mean = np.mean(self.reward_list[-self.report_length:])
|
||||
result = dict(
|
||||
episode_reward_mean=eval_returns.mean(),
|
||||
episode_reward_mean=reward_mean,
|
||||
episode_len_mean=eval_lengths.mean(),
|
||||
timesteps_this_iter=noisy_lengths.sum(),
|
||||
info=info)
|
||||
|
||||
@@ -122,8 +122,8 @@ class ModelSupportedSpaces(unittest.TestCase):
|
||||
"ARS", {
|
||||
"num_workers": 1,
|
||||
"noise_size": 10000000,
|
||||
"num_deltas": 1,
|
||||
"deltas_used": 1
|
||||
"num_rollouts": 1,
|
||||
"rollouts_used": 1
|
||||
}, stats)
|
||||
check_support("PG", {"num_workers": 1, "optimizer": {}}, stats)
|
||||
num_unexpected_errors = 0
|
||||
|
||||
@@ -6,10 +6,10 @@ cartpole-ars:
|
||||
time_total_s: 600
|
||||
config:
|
||||
noise_stdev: 0.02
|
||||
num_deltas: 50
|
||||
deltas_used: 25
|
||||
num_rollouts: 50
|
||||
rollouts_used: 25
|
||||
num_workers: 2
|
||||
stepsize: 0.01
|
||||
sgd_stepsize: 0.01
|
||||
noise_size: 250000000
|
||||
eval_prob: 0.5
|
||||
policy_type: MLPPolicy
|
||||
|
||||
@@ -1,15 +1,16 @@
|
||||
# can expect improvement to -140 reward in ~300-500k timesteps
|
||||
pendulum-ars:
|
||||
swimmer-ars:
|
||||
env: Swimmer-v2
|
||||
run: ARS
|
||||
config:
|
||||
noise_stdev: 0.01
|
||||
num_deltas: 2
|
||||
deltas_used: 1
|
||||
num_rollouts: 1
|
||||
rollouts_used: 1
|
||||
num_workers: 1
|
||||
stepsize: 0.02
|
||||
sgd_stepsize: 0.02
|
||||
noise_size: 250000000
|
||||
fcnet_hiddens: [32,32]
|
||||
policy_type: LinearPolicy
|
||||
eval_prob: 0.2
|
||||
offset: 0
|
||||
observation_filter: NoFilter
|
||||
report_length: 3
|
||||
|
||||
Reference in New Issue
Block a user