diff --git a/python/ray/rllib/evolution_strategies/evolution_strategies.py b/python/ray/rllib/evolution_strategies/evolution_strategies.py index f56944091..21062dc19 100644 --- a/python/ray/rllib/evolution_strategies/evolution_strategies.py +++ b/python/ray/rllib/evolution_strategies/evolution_strategies.py @@ -30,8 +30,8 @@ Result = namedtuple("Result", [ DEFAULT_CONFIG = dict( l2coeff=0.005, noise_stdev=0.02, - episodes_per_batch=10000, - timesteps_per_batch=100000, + episodes_per_batch=1000, + timesteps_per_batch=10000, calc_obstat_prob=0.01, eval_prob=0, snapshot_freq=0, @@ -188,6 +188,25 @@ class EvolutionStrategies(Algorithm): self.tstart = time.time() self.iteration = 0 + def _collect_results(self, theta_id, min_eps, min_timesteps): + num_eps, num_timesteps = 0, 0 + results = [] + while num_eps < min_eps or num_timesteps < min_timesteps: + print( + "Collected {} episodes {} timesteps so far this iter".format( + num_eps, num_timesteps)) + rollout_ids = [worker.do_rollouts.remote( + theta_id, + self.ob_stat.mean if self.policy.needs_ob_stat else None, + self.ob_stat.std if self.policy.needs_ob_stat else None) + for worker in self.workers] + # Get the results of the rollouts. + for result in ray.get(rollout_ids): + results.append(result) + num_eps += result.lengths_n2.size + num_timesteps += result.lengths_n2.sum() + return results + def train(self): config = self.config @@ -199,14 +218,10 @@ class EvolutionStrategies(Algorithm): theta_id = ray.put(theta) # Use the actors to do rollouts, note that we pass in the ID of the # policy weights. - rollout_ids = [worker.do_rollouts.remote( + results = self._collect_results( theta_id, - self.ob_stat.mean if self.policy.needs_ob_stat else None, - self.ob_stat.std if self.policy.needs_ob_stat else None) - for worker in self.workers] - - # Get the results of the rollouts. - results = ray.get(rollout_ids) + config["episodes_per_batch"], + config["timesteps_per_batch"]) curr_task_results = [] ob_count_this_batch = 0