diff --git a/doc/source/rllib-toc.rst b/doc/source/rllib-toc.rst index b52c7e13c..354118325 100644 --- a/doc/source/rllib-toc.rst +++ b/doc/source/rllib-toc.rst @@ -33,7 +33,7 @@ Training APIs - `Callbacks and Custom Metrics `__ - - `Customized Exploration Behavior (Training and Evaluation) `__ + - `Customizing Exploration Behavior `__ - `Customized Evaluation During Training `__ diff --git a/doc/source/rllib-training.rst b/doc/source/rllib-training.rst index a90e7fd82..a4d4f1e63 100644 --- a/doc/source/rllib-training.rst +++ b/doc/source/rllib-training.rst @@ -520,8 +520,8 @@ Custom metrics can be accessed and visualized like any other training result: .. image:: custom_metric.png -Customized Exploration Behavior (Training and Evaluation) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Customizing Exploration Behavior +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ RLlib offers a unified top-level API to configure and customize an agent’s exploration behavior, including the decisions (how and whether) to sample @@ -665,9 +665,9 @@ Customized Evaluation During Training RLlib will report online training rewards, however in some cases you may want to compute rewards with different settings (e.g., with exploration turned off, or on a specific set -of environment configurations). You can evaluate policies during training by setting one -or more of the ``evaluation_interval``, ``evaluation_num_episodes``, ``evaluation_config``, -``evaluation_num_workers``, and ``custom_eval_function`` configs +of environment configurations). You can evaluate policies during training by setting +the ``evaluation_interval`` config, and optionally also ``evaluation_num_episodes``, +``evaluation_config``, ``evaluation_num_workers``, and ``custom_eval_function`` (see `trainer.py `__ for further documentation). By default, exploration is left as-is within ``evaluation_config``. @@ -682,9 +682,11 @@ via: "explore": False } -**IMPORTANT NOTE**: Policy gradient algorithms are able to find the optimal -policy, even if this is a stochastic one. Setting "explore=False" above -will result in the evaluation workers not using this optimal policy. +.. note:: + + Policy gradient algorithms are able to find the optimal + policy, even if this is a stochastic one. Setting "explore=False" above + will result in the evaluation workers not using this stochastic policy. There is an end to end example of how to set up custom online evaluation in `custom_eval.py `__. Note that if you only want to eval your policy at the end of training, you can set ``evaluation_interval: N``, where ``N`` is the number of training iterations before stopping. diff --git a/rllib/agents/sac/sac.py b/rllib/agents/sac/sac.py index d4f54c5d8..70d5f4f96 100644 --- a/rllib/agents/sac/sac.py +++ b/rllib/agents/sac/sac.py @@ -29,6 +29,9 @@ DEFAULT_CONFIG = with_common_config({ "normalize_actions": True, # === Learning === + # Disable setting done=True at end of episode. This should be set to True + # for infinite-horizon MDPs (e.g., many continuous control problems). + "no_done_at_end": False, # Update the target by \tau * policy + (1-\tau) * target_policy. "tau": 5e-3, # Initial value to use for the entropy weight alpha. @@ -37,8 +40,6 @@ DEFAULT_CONFIG = with_common_config({ # Discrete(2), -3.0 for Box(shape=(3,))). # This is the inverse of reward scale, and will be optimized automatically. "target_entropy": "auto", - # Disable setting done=True at end of episode. - "no_done_at_end": True, # N-step target updates. "n_step": 1, diff --git a/rllib/tuned_examples/regression_tests/pendulum-sac.yaml b/rllib/tuned_examples/regression_tests/pendulum-sac.yaml index 94dee065a..8f2a17050 100644 --- a/rllib/tuned_examples/regression_tests/pendulum-sac.yaml +++ b/rllib/tuned_examples/regression_tests/pendulum-sac.yaml @@ -9,3 +9,4 @@ pendulum-sac: clip_actions: False normalize_actions: True metrics_smoothing_episodes: 5 + no_done_at_end: True