diff --git a/rllib/agents/sac/sac.py b/rllib/agents/sac/sac.py index 91dfbfe4d..ed7f8d89b 100644 --- a/rllib/agents/sac/sac.py +++ b/rllib/agents/sac/sac.py @@ -29,26 +29,17 @@ DEFAULT_CONFIG = with_common_config({ "normalize_actions": True, # === Learning === - # Update the target by \tau * policy + (1-\tau) * target_policy + # Update the target by \tau * policy + (1-\tau) * target_policy. "tau": 5e-3, # Target entropy lower bound. This is the inverse of reward scale, # and will be optimized automatically. "target_entropy": "auto", # Disable setting done=True at end of episode. "no_done_at_end": True, - # N-step target updates + # N-step target updates. "n_step": 1, - # === Evaluation === - # The evaluation stats will be reported under the "evaluation" metric key. - "evaluation_interval": 1, - # Number of episodes to run per evaluation period. - "evaluation_num_episodes": 1, - # Extra configuration that disables exploration. - "evaluation_config": { - "explore": False, - }, - # Number of env steps to optimize for before returning + # Number of env steps to optimize for before returning. 
"timesteps_per_iteration": 100, # === Replay buffer === diff --git a/rllib/evaluation/sample_batch_builder.py b/rllib/evaluation/sample_batch_builder.py index 5dad3fb8c..073b61f0e 100644 --- a/rllib/evaluation/sample_batch_builder.py +++ b/rllib/evaluation/sample_batch_builder.py @@ -95,9 +95,9 @@ class MultiAgentSampleBatchBuilder: def total(self): """Returns summed number of steps across all agent buffers.""" - return sum(p.count for p in self.policy_builders.values()) + return sum(a.count for a in self.agent_builders.values()) - def has_pending_data(self): + def has_pending_agent_data(self): """Returns whether there is pending unprocessed data.""" return len(self.agent_builders) > 0 diff --git a/rllib/evaluation/sampler.py b/rllib/evaluation/sampler.py index c4eb38f1b..0910a5ec8 100644 --- a/rllib/evaluation/sampler.py +++ b/rllib/evaluation/sampler.py @@ -266,7 +266,7 @@ def _env_runner(base_env, extra_batch_callback, policies, policy_mapping_fn, if not horizon: horizon = (base_env.get_unwrapped()[0].spec.max_episode_steps) except Exception: - logger.debug("no episode horizon specified, assuming inf") + logger.debug("No episode horizon specified, assuming inf.") if not horizon: horizon = float("inf") @@ -354,6 +354,8 @@ def _process_observations(base_env, policies, batch_builder_pool, active_envs = set() to_eval = defaultdict(list) outputs = [] + large_batch_threshold = max(1000, unroll_length * 10) if \ + unroll_length != float("inf") else 5000 # For each environment for env_id, agent_obs in unfiltered_obs.items(): @@ -364,18 +366,21 @@ def _process_observations(base_env, policies, batch_builder_pool, episode.batch_builder.count += 1 episode._add_agent_rewards(rewards[env_id]) - if (episode.batch_builder.total() > max(1000, unroll_length * 10) + if (episode.batch_builder.total() > large_batch_threshold and log_once("large_batch_warning")): logger.warning( "More than {} observations for {} env steps ".format( episode.batch_builder.total(), 
episode.batch_builder.count) + "are buffered in " "the sampler. If this is more than you expected, check that " - "that you set a horizon on your environment correctly. Note " - "that in multi-agent environments, `sample_batch_size` sets " + "you set a horizon on your environment correctly and that" + " it terminates at some point. " + "Note: In multi-agent environments, `sample_batch_size` sets " "the batch size based on environment steps, not the steps of " "individual agents, which can result in unexpectedly large " - "batches.") + "batches. Also, you may be in evaluation waiting for your Env " + "to terminate (batch_mode=`complete_episodes`). Make sure it " + "does at some point.") # Check episode termination conditions if dones[env_id]["__all__"] or episode.length >= horizon: @@ -398,7 +403,7 @@ def _process_observations(base_env, policies, batch_builder_pool, all_done = False active_envs.add(env_id) - # For each agent in the environment + # For each agent in the environment. for agent_id, raw_obs in agent_obs.items(): policy_id = episode.policy_for(agent_id) prep_obs = _get_or_raise(preprocessors, @@ -451,7 +456,7 @@ def _process_observations(base_env, policies, batch_builder_pool, # Cut the batch if we're not packing multiple episodes into one, # or if we've exceeded the requested batch size. 
- if episode.batch_builder.has_pending_data(): + if episode.batch_builder.has_pending_agent_data(): if dones[env_id]["__all__"] and not no_done_at_end: episode.batch_builder.check_missing_dones() if (all_done and not pack) or \ diff --git a/rllib/tuned_examples/regression_tests/pendulum-sac.yaml b/rllib/tuned_examples/regression_tests/pendulum-sac.yaml index 47404fb6e..94dee065a 100644 --- a/rllib/tuned_examples/regression_tests/pendulum-sac.yaml +++ b/rllib/tuned_examples/regression_tests/pendulum-sac.yaml @@ -5,7 +5,6 @@ pendulum-sac: episode_reward_mean: -300 # note that evaluation perf is higher timesteps_total: 10000 config: - evaluation_interval: 1 # logged under evaluation/* metric keys soft_horizon: True clip_actions: False normalize_actions: True