diff --git a/doc/source/rllib-algorithms.rst b/doc/source/rllib-algorithms.rst index 8ce7e66ff..575900776 100644 --- a/doc/source/rllib-algorithms.rst +++ b/doc/source/rllib-algorithms.rst @@ -274,9 +274,17 @@ Soft Actor Critic (SAC) SAC architecture (same as DQN) -RLlib's soft-actor critic implementation is ported from the `official SAC repo `__ to better integrate with RLlib APIs. Note that SAC has two fields to configure for custom models: ``policy_model`` and ``Q_model``, and currently has no support for non-continuous action distributions. It is also currently *experimental*. +RLlib's soft actor-critic implementation is ported from the `official SAC repo `__ to better integrate with RLlib APIs. Note that SAC has two fields to configure for custom models: ``policy_model`` and ``Q_model``, and currently has no support for non-continuous action distributions. -Tuned examples: `Pendulum-v0 `__ +Tuned examples: `Pendulum-v0 `__, `HalfCheetah-v3 `__ + +**MuJoCo results @500k steps:** `more details `__ + +============= ========== =================== +MuJoCo env RLlib SAC Haarnoja et al. SAC +============= ========== =================== +HalfCheetah 8752 ~9000 +============= ========== =================== **SAC-specific configs** (see also `common configs `__): diff --git a/rllib/agents/sac/sac_model.py b/rllib/agents/sac/sac_model.py index d72a6063f..93787b535 100644 --- a/rllib/agents/sac/sac_model.py +++ b/rllib/agents/sac/sac_model.py @@ -81,7 +81,6 @@ class SACModel(TFModelV2): shape=(num_outputs, ), name="model_out") self.actions = tf.keras.layers.Input( shape=(self.action_dim, ), name="actions") - shift_and_log_scale_diag = tf.keras.Sequential([ tf.keras.layers.Dense( units=hidden, @@ -90,10 +89,7 @@ class SACModel(TFModelV2): for i, hidden in enumerate(actor_hiddens) ] + [ tf.keras.layers.Dense( - units=tfp.layers.MultivariateNormalTriL.params_size( - self.action_dim), - activation=None, - name="action_out") + units=2 * self.action_dim, activation=None, 
name="action_out") ])(self.model_out) shift, log_scale_diag = tf.keras.layers.Lambda( diff --git a/rllib/tuned_examples/halfcheetah-sac.yaml b/rllib/tuned_examples/halfcheetah-sac.yaml new file mode 100644 index 000000000..4669d51bc --- /dev/null +++ b/rllib/tuned_examples/halfcheetah-sac.yaml @@ -0,0 +1,37 @@ +# Our implementation of SAC can reach 9k reward in 400k timesteps +halfcheetah_sac: + env: HalfCheetah-v3 + run: SAC + stop: + episode_reward_mean: 9000 + config: + horizon: 1000 + soft_horizon: False + Q_model: + hidden_activation: relu + hidden_layer_sizes: [256, 256] + policy_model: + hidden_activation: relu + hidden_layer_sizes: [256, 256] + tau: 0.005 + target_entropy: auto + no_done_at_end: True + n_step: 1 + sample_batch_size: 1 + prioritized_replay: False + train_batch_size: 256 + target_network_update_freq: 1 + timesteps_per_iteration: 1000 + learning_starts: 10000 + exploration_enabled: True + optimization: + actor_learning_rate: 0.0003 + critic_learning_rate: 0.0003 + entropy_learning_rate: 0.0003 + num_workers: 0 + num_gpus: 0 + clip_actions: False + normalize_actions: True + evaluation_interval: 1 + metrics_smoothing_episodes: 5 +