diff --git a/python/ray/rllib/agents/ars/ars.py b/python/ray/rllib/agents/ars/ars.py index 614620d89..69b85eeec 100644 --- a/python/ray/rllib/agents/ars/ars.py +++ b/python/ray/rllib/agents/ars/ars.py @@ -28,13 +28,14 @@ Result = namedtuple("Result", [ DEFAULT_CONFIG = with_common_config({ 'noise_stdev': 0.02, # std deviation of parameter noise - 'num_deltas': 32, # number of perturbations to try - 'deltas_used': 32, # number of perturbations to keep in gradient estimate + 'num_rollouts': 32, # number of perturbs to try + 'rollouts_used': 32, # number of perturbs to keep in gradient estimate 'num_workers': 2, - 'stepsize': 0.01, # sgd step-size + 'sgd_stepsize': 0.01, # sgd step-size 'observation_filter': "MeanStdFilter", 'noise_size': 250000000, 'eval_prob': 0.03, # probability of evaluating the parameter rewards + 'report_length': 10, # how many of the last rewards we average over 'env_config': {}, 'offset': 0, 'policy_type': "LinearPolicy", # ["LinearPolicy", "MLPPolicy"] @@ -180,10 +181,12 @@ class ARSAgent(Agent): self.sess, env.action_space, preprocessor, self.config["observation_filter"], self.config["fcnet_hiddens"], **policy_params) - self.optimizer = optimizers.Adam(self.policy, self.config["stepsize"]) + self.optimizer = optimizers.SGD(self.policy, + self.config["sgd_stepsize"]) - self.deltas_used = self.config["deltas_used"] - self.num_deltas = self.config["num_deltas"] + self.rollouts_used = self.config["rollouts_used"] + self.num_rollouts = self.config["num_rollouts"] + self.report_length = self.config["report_length"] # Create the shared noise table. print("Creating shared noise table.") @@ -199,6 +202,7 @@ class ARSAgent(Agent): self.episodes_so_far = 0 self.timesteps_so_far = 0 + self.reward_list = [] self.tstart = time.time() def _collect_results(self, theta_id, min_episodes): @@ -233,7 +237,7 @@ class ARSAgent(Agent): # Use the actors to do rollouts, note that we pass in the ID of the # policy weights. results, num_episodes, num_timesteps = self._collect_results( - theta_id, config["num_deltas"]) + theta_id, config["num_rollouts"]) all_noise_indices = [] all_training_returns = [] @@ -265,12 +269,12 @@ class ARSAgent(Agent): noisy_lengths = np.array(all_training_lengths) # keep only the best returns - # select top performing directions if deltas_used < num_deltas + # select top performing directions if rollouts_used < num_rollouts max_rewards = np.max(noisy_returns, axis=1) - if self.deltas_used > self.num_deltas: - self.deltas_used = self.num_deltas + if self.rollouts_used > self.num_rollouts: + self.rollouts_used = self.num_rollouts - percentile = 100 * (1 - (self.deltas_used / self.num_deltas)) + percentile = 100 * (1 - (self.rollouts_used / self.num_rollouts)) idx = np.arange(max_rewards.size)[ max_rewards >= np.percentile(max_rewards, percentile)] noise_idx = noise_indices[idx] @@ -293,11 +297,11 @@ class ARSAgent(Agent): theta, update_ratio = self.optimizer.update(-g) # Set the new weights in the local copy of the policy. self.policy.set_weights(theta) + # update the reward list + if len(all_eval_returns) > 0: + self.reward_list.append(eval_returns.mean()) step_tend = time.time() - tlogger.record_tabular("EvalEpRewMean", eval_returns.mean()) - tlogger.record_tabular("EvalEpRewStd", eval_returns.std()) - tlogger.record_tabular("EvalEpLenMean", eval_lengths.mean()) tlogger.record_tabular("NoisyEpRewMean", noisy_returns.mean()) tlogger.record_tabular("NoisyEpRewStd", noisy_returns.std()) @@ -319,9 +323,9 @@ class ARSAgent(Agent): "time_elapsed_this_iter": step_tend - step_tstart, "time_elapsed": step_tend - self.tstart } - result = dict( - episode_reward_mean=eval_returns.mean(), + episode_reward_mean=np.mean( + self.reward_list[-self.report_length:]), episode_len_mean=eval_lengths.mean(), timesteps_this_iter=noisy_lengths.sum(), info=info) diff --git a/python/ray/rllib/agents/ars/optimizers.py b/python/ray/rllib/agents/ars/optimizers.py index 3b48f7393..0e6420bb2 100644 --- a/python/ray/rllib/agents/ars/optimizers.py +++ b/python/ray/rllib/agents/ars/optimizers.py @@ -9,15 +9,15 @@ import numpy as np class Optimizer(object): - def __init__(self, pi): - self.pi = pi - self.dim = pi.num_params + def __init__(self, policy): + self.policy = policy + self.dim = policy.num_params self.t = 0 def update(self, globalg): self.t += 1 step = self._compute_step(globalg) - theta = self.pi.get_weights() + theta = self.policy.get_weights() ratio = np.linalg.norm(step) / np.linalg.norm(theta) return theta + step, ratio @@ -26,8 +26,8 @@ class Optimizer(object): class SGD(Optimizer): - def __init__(self, pi, stepsize, momentum=0.9): - Optimizer.__init__(self, pi) + def __init__(self, policy, stepsize, momentum=0.0): + Optimizer.__init__(self, policy) self.v = np.zeros(self.dim, dtype=np.float32) self.stepsize, self.momentum = stepsize, momentum @@ -38,8 +38,9 @@ class SGD(Optimizer): class Adam(Optimizer): - def __init__(self, pi, stepsize, beta1=0.9, beta2=0.999, epsilon=1e-08): - Optimizer.__init__(self, pi) + def __init__(self, policy, stepsize, beta1=0.9, beta2=0.999, + epsilon=1e-08): + Optimizer.__init__(self, policy) self.stepsize = stepsize self.beta1 = beta1 self.beta2 = beta2 diff --git a/python/ray/rllib/agents/es/es.py b/python/ray/rllib/agents/es/es.py index de59137b6..452918f58 100644 --- a/python/ray/rllib/agents/es/es.py +++ b/python/ray/rllib/agents/es/es.py @@ -37,6 +37,7 @@ DEFAULT_CONFIG = { "stepsize": 0.01, "observation_filter": "MeanStdFilter", "noise_size": 250000000, + "report_length": 10, "env": None, "env_config": {}, } @@ -164,6 +165,7 @@ class ESAgent(Agent): self.sess, env.action_space, preprocessor, self.config["observation_filter"], **policy_params) self.optimizer = optimizers.Adam(self.policy, self.config["stepsize"]) + self.report_length = self.config["report_length"] # Create the shared noise table. print("Creating shared noise table.") @@ -179,6 +181,7 @@ class ESAgent(Agent): self.episodes_so_far = 0 self.timesteps_so_far = 0 + self.reward_list = [] self.tstart = time.time() def _collect_results(self, theta_id, min_episodes, min_timesteps): @@ -264,9 +267,11 @@ class ESAgent(Agent): config["l2_coeff"] * theta) # Set the new weights in the local copy of the policy. self.policy.set_weights(theta) + # Store the rewards + if len(all_eval_returns) > 0: + self.reward_list.append(np.mean(eval_returns)) step_tend = time.time() - tlogger.record_tabular("EvalEpRewMean", eval_returns.mean()) tlogger.record_tabular("EvalEpRewStd", eval_returns.std()) tlogger.record_tabular("EvalEpLenMean", eval_lengths.mean()) @@ -299,8 +304,9 @@ class ESAgent(Agent): "time_elapsed": step_tend - self.tstart } + reward_mean = np.mean(self.reward_list[-self.report_length:]) result = dict( - episode_reward_mean=eval_returns.mean(), + episode_reward_mean=reward_mean, episode_len_mean=eval_lengths.mean(), timesteps_this_iter=noisy_lengths.sum(), info=info) diff --git a/python/ray/rllib/test/test_supported_spaces.py b/python/ray/rllib/test/test_supported_spaces.py index dd81c1853..2ced3402a 100644 --- a/python/ray/rllib/test/test_supported_spaces.py +++ b/python/ray/rllib/test/test_supported_spaces.py @@ -122,8 +122,8 @@ class ModelSupportedSpaces(unittest.TestCase): "ARS", { "num_workers": 1, "noise_size": 10000000, - "num_deltas": 1, - "deltas_used": 1 + "num_rollouts": 1, + "rollouts_used": 1 }, stats) check_support("PG", {"num_workers": 1, "optimizer": {}}, stats) num_unexpected_errors = 0 diff --git a/python/ray/rllib/tuned_examples/regression_tests/cartpole-ars.yaml b/python/ray/rllib/tuned_examples/regression_tests/cartpole-ars.yaml index 95050aac8..550170c2e 100644 --- a/python/ray/rllib/tuned_examples/regression_tests/cartpole-ars.yaml +++ b/python/ray/rllib/tuned_examples/regression_tests/cartpole-ars.yaml @@ -6,10 +6,10 @@ cartpole-ars: time_total_s: 600 config: noise_stdev: 0.02 - num_deltas: 50 - deltas_used: 25 + num_rollouts: 50 + rollouts_used: 25 num_workers: 2 - stepsize: 0.01 + sgd_stepsize: 0.01 noise_size: 250000000 eval_prob: 0.5 policy_type: MLPPolicy diff --git a/python/ray/rllib/tuned_examples/swimmer-ars.yaml b/python/ray/rllib/tuned_examples/swimmer-ars.yaml index db34c46fe..338c8a12c 100644 --- a/python/ray/rllib/tuned_examples/swimmer-ars.yaml +++ b/python/ray/rllib/tuned_examples/swimmer-ars.yaml @@ -1,15 +1,16 @@ # can expect improvement to -140 reward in ~300-500k timesteps -pendulum-ars: +swimmer-ars: env: Swimmer-v2 run: ARS config: noise_stdev: 0.01 - num_deltas: 2 - deltas_used: 1 + num_rollouts: 1 + rollouts_used: 1 num_workers: 1 - stepsize: 0.02 + sgd_stepsize: 0.02 noise_size: 250000000 - fcnet_hiddens: [32,32] policy_type: LinearPolicy eval_prob: 0.2 offset: 0 + observation_filter: NoFilter + report_length: 3