From 663e92ab3f010491ef7420480eb6be7f57a0485c Mon Sep 17 00:00:00 2001 From: Sam Toyer Date: Fri, 26 Apr 2019 17:49:53 -0700 Subject: [PATCH] [rllib] TD3/DDPG improvements and MuJoCo benchmarks (#4694) * [rllib] Separate optimisers for DDPG actor & crit. * [rllib] Better names for DDPG variables & options Config changes: - noise_scale -> exploration_ou_noise_scale - exploration_theta -> exploration_ou_theta - exploration_sigma -> exploration_ou_sigma - act_noise -> exploration_gaussian_sigma - noise_clip -> target_noise_clip * [rllib] Make DDPG less class-y Used functions to replace three classes with only an __init__ method & a handful of unrelated attributes. * [rllib] Refactor DDPG noise * [rllib] Unify DDPG exploration annealing Added option "exploration_should_anneal" to enable linear annealing of exploration noise. By default this is off, for consistency with DDPG & TD3 papers. Also renamed "exploration_final_eps" to "exploration_final_scale" (that name seems to have been carried over from DQN, and doesn't really make sense here). Finally, tried to rename "eps" to "noise_scale" wherever possible. 
--- doc/source/rllib-algorithms.rst | 2 +- python/ray/rllib/agents/ddpg/__init__.py | 3 +- python/ray/rllib/agents/ddpg/ddpg.py | 126 ++-- .../rllib/agents/ddpg/ddpg_policy_graph.py | 584 +++++++++--------- python/ray/rllib/agents/ddpg/td3.py | 57 ++ python/ray/rllib/agents/registry.py | 6 + .../rllib/tests/test_checkpoint_restore.py | 3 +- .../ray/rllib/tests/test_supported_spaces.py | 11 +- .../tuned_examples/halfcheetah-ddpg.yaml | 18 +- .../tuned_examples/invertedpendulum-td3.yaml | 22 + .../mountaincarcontinuous-apex-ddpg.yaml | 4 +- .../mountaincarcontinuous-ddpg.yaml | 18 +- .../ray/rllib/tuned_examples/mujoco-td3.yaml | 24 + .../tuned_examples/pendulum-apex-ddpg.yaml | 2 + .../rllib/tuned_examples/pendulum-ddpg.yaml | 18 +- .../rllib/tuned_examples/pendulum-td3.yaml | 57 +- 16 files changed, 557 insertions(+), 398 deletions(-) create mode 100644 python/ray/rllib/agents/ddpg/td3.py create mode 100644 python/ray/rllib/tuned_examples/invertedpendulum-td3.yaml create mode 100644 python/ray/rllib/tuned_examples/mujoco-td3.yaml diff --git a/doc/source/rllib-algorithms.rst b/doc/source/rllib-algorithms.rst index fd07bdc1b..9ee610847 100644 --- a/doc/source/rllib-algorithms.rst +++ b/doc/source/rllib-algorithms.rst @@ -142,7 +142,7 @@ Deep Deterministic Policy Gradients (DDPG, TD3) `[paper] `__ `[implementation] `__ DDPG is implemented similarly to DQN (below). The algorithm can be scaled by increasing the number of workers, switching to AsyncGradientsOptimizer, or using Ape-X. The improvements from `TD3 `__ are available though not enabled by default. -Tuned examples: `Pendulum-v0 `__, `TD3 configuration `__, `MountainCarContinuous-v0 `__, `HalfCheetah-v2 `__ +Tuned examples: `Pendulum-v0 `__, `MountainCarContinuous-v0 `__, `HalfCheetah-v2 `__, `TD3 Pendulum-v0 `__, `TD3 InvertedPendulum-v2 `__, `TD3 Mujoco suite (Ant-v2, HalfCheetah-v2, Hopper-v2, Walker2d-v2) `__. 
**DDPG-specific configs** (see also `common configs `__): diff --git a/python/ray/rllib/agents/ddpg/__init__.py b/python/ray/rllib/agents/ddpg/__init__.py index 5d2099187..9b90ca842 100644 --- a/python/ray/rllib/agents/ddpg/__init__.py +++ b/python/ray/rllib/agents/ddpg/__init__.py @@ -4,6 +4,7 @@ from __future__ import print_function from ray.rllib.agents.ddpg.apex import ApexDDPGTrainer from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, DEFAULT_CONFIG +from ray.rllib.agents.ddpg.td3 import TD3Trainer from ray.rllib.utils import renamed_class ApexDDPGAgent = renamed_class(ApexDDPGTrainer) @@ -11,5 +12,5 @@ DDPGAgent = renamed_class(DDPGTrainer) __all__ = [ "DDPGAgent", "ApexDDPGAgent", "DDPGTrainer", "ApexDDPGTrainer", - "DEFAULT_CONFIG" + "TD3Trainer", "DEFAULT_CONFIG" ] diff --git a/python/ray/rllib/agents/ddpg/ddpg.py b/python/ray/rllib/agents/ddpg/ddpg.py index e5031ca11..7a140beee 100644 --- a/python/ray/rllib/agents/ddpg/ddpg.py +++ b/python/ray/rllib/agents/ddpg/ddpg.py @@ -13,19 +13,21 @@ from ray.rllib.utils.schedules import ConstantSchedule, LinearSchedule DEFAULT_CONFIG = with_common_config({ # === Twin Delayed DDPG (TD3) and Soft Actor-Critic (SAC) tricks === # TD3: https://spinningup.openai.com/en/latest/algorithms/td3.html + # In addition to settings below, you can use "exploration_noise_type" and + # "exploration_gaussian_sigma" to get IID Gaussian exploration noise + # instead of OU exploration noise. 
# twin Q-net "twin_q": False, # delayed policy update "policy_delay": 1, # target policy smoothing - # this also forces the use of gaussian instead of OU noise for exploration + # (this also replaces OU exploration noise with IID Gaussian exploration + # noise, for now) "smooth_target_policy": False, - # gaussian stddev of act noise - "act_noise": 0.1, - # gaussian stddev of target noise + # gaussian stddev of target action noise for smoothing "target_noise": 0.2, # target noise limit (bound) - "noise_clip": 0.5, + "target_noise_clip": 0.5, # === Evaluation === # Evaluate with epsilon=0 every `evaluation_interval` training iterations. @@ -37,42 +39,64 @@ DEFAULT_CONFIG = with_common_config({ "evaluation_num_episodes": 10, # === Model === - # Postprocess the policy network model output with these hidden layers - "actor_hiddens": [64, 64], - # Hidden layers activation of the policy network + # Apply a state preprocessor with spec given by the "model" config option + # (like other RL algorithms). This is mostly useful if you have a weird + # observation shape, like an image. Disabled by default. + "use_state_preprocessor": False, + # Postprocess the policy network model output with these hidden layers. If + # use_state_preprocessor is False, then these will be the *only* hidden + # layers in the network. + "actor_hiddens": [400, 300], + # Hidden layers activation of the postprocessing stage of the policy + # network "actor_hidden_activation": "relu", - # Postprocess the critic network model output with these hidden layers - "critic_hiddens": [64, 64], - # Hidden layers activation of the critic network + # Postprocess the critic network model output with these hidden layers; + # again, if use_state_preprocessor is True, then the state will be + # preprocessed by the model specified with the "model" config option first. + "critic_hiddens": [400, 300], + # Hidden layers activation of the postprocessing stage of the critic. 
"critic_hidden_activation": "relu", # N-step Q learning "n_step": 1, # === Exploration === - # Max num timesteps for annealing schedules. Exploration is annealed from - # 1.0 to exploration_fraction over this number of timesteps scaled by - # exploration_fraction + # Turns on annealing schedule for exploration noise. Exploration is + # annealed from 1.0 to exploration_final_scale over schedule_max_timesteps + # scaled by exploration_fraction. Original DDPG and TD3 papers do not + # anneal noise, so this is False by default. + "exploration_should_anneal": False, + # Max num timesteps for annealing schedules. "schedule_max_timesteps": 100000, # Number of env steps to optimize for before returning "timesteps_per_iteration": 1000, # Fraction of entire training period over which the exploration rate is # annealed "exploration_fraction": 0.1, - # Final value of random action probability - "exploration_final_eps": 0.02, - # OU-noise scale - "noise_scale": 0.1, - # theta - "exploration_theta": 0.15, - # sigma - "exploration_sigma": 0.2, - # Update the target network every `target_network_update_freq` steps. 
- "target_network_update_freq": 0, - # Update the target by \tau * policy + (1-\tau) * target_policy - "tau": 0.002, + # Final scaling multiplier for action noise (initial is 1.0) + "exploration_final_scale": 0.02, + # valid values: "ou" (time-correlated, like original DDPG paper), + # "gaussian" (IID, like TD3 paper) + "exploration_noise_type": "ou", + # OU-noise scale; this can be used to scale down magnitude of OU noise + # before adding to actions (requires "exploration_noise_type" to be "ou") + "exploration_ou_noise_scale": 0.1, + # theta for OU + "exploration_ou_theta": 0.15, + # sigma for OU + "exploration_ou_sigma": 0.2, + # gaussian stddev of act noise for exploration (requires + # "exploration_noise_type" to be "gaussian") + "exploration_gaussian_sigma": 0.1, # If True parameter space noise will be used for exploration # See https://blog.openai.com/better-exploration-with-parameter-noise/ "parameter_noise": False, + # Until this many timesteps have elapsed, the agent's policy will be + # ignored & it will instead take uniform random actions. Can be used in + # conjunction with learning_starts (which controls when the first + # optimization step happens) to decrease dependence of exploration & + # optimization on initial policy parameters. Note that this will be + # disabled when the action noise scale is set to 0 (e.g during evaluation). + "pure_exploration_steps": 1000, # === Replay buffer === # Size of the replay buffer. Note that if async_updates is set, then @@ -90,11 +114,14 @@ DEFAULT_CONFIG = with_common_config({ "compress_observations": False, # === Optimization === - # Learning rate for adam optimizer. - # Instead of using two optimizers, we use two different loss coefficients - "lr": 1e-3, - "actor_loss_coeff": 0.1, - "critic_loss_coeff": 1.0, + # Learning rate for the critic (Q-function) optimizer. + "critic_lr": 1e-3, + # Learning rate for the actor (policy) optimizer. 
+ "actor_lr": 1e-3, + # Update the target network every `target_network_update_freq` steps. + "target_network_update_freq": 0, + # Update the target by \tau * policy + (1-\tau) * target_policy + "tau": 0.002, # If True, use huber loss instead of squared loss for critic network # Conventionally, no need to clip gradients if using a huber loss "use_huber": False, @@ -117,7 +144,7 @@ DEFAULT_CONFIG = with_common_config({ # === Parallelism === # Number of workers for collecting samples with. This only makes sense # to increase if your environment is particularly slow to sample, or if - # you"re using the Async or Ape-X optimizers. + # you're using the Async or Ape-X optimizers. "num_workers": 0, # Optimizer class to use. "optimizer_class": "SyncReplayOptimizer", @@ -138,26 +165,41 @@ class DDPGTrainer(DQNTrainer): _default_config = DEFAULT_CONFIG _policy_graph = DDPGPolicyGraph + @override(DQNTrainer) + def _train(self): + pure_expl_steps = self.config["pure_exploration_steps"] + if pure_expl_steps: + # tell workers whether they should do pure exploration + only_explore = self.global_timestep < pure_expl_steps + self.local_evaluator.foreach_trainable_policy( + lambda p, _: p.set_pure_exploration_phase(only_explore)) + for e in self.remote_evaluators: + e.foreach_trainable_policy.remote( + lambda p, _: p.set_pure_exploration_phase(only_explore)) + return super(DDPGTrainer, self)._train() + @override(DQNTrainer) def _make_exploration_schedule(self, worker_index): - # Override DQN's schedule to take into account `noise_scale` + # Override DQN's schedule to take into account + # `exploration_ou_noise_scale` if self.config["per_worker_exploration"]: assert self.config["num_workers"] > 1, \ "This requires multiple workers" if worker_index >= 0: - exponent = ( - 1 + - worker_index / float(self.config["num_workers"] - 1) * 7) - return ConstantSchedule( - self.config["noise_scale"] * 0.4**exponent) + # FIXME: what do magic constants mean? 
(0.4, 7) + max_index = float(self.config["num_workers"] - 1) + exponent = 1 + worker_index / max_index * 7 + return ConstantSchedule(0.4**exponent) else: # local ev should have zero exploration so that eval rollouts # run properly return ConstantSchedule(0.0) - else: + elif self.config["exploration_should_anneal"]: return LinearSchedule( schedule_timesteps=int(self.config["exploration_fraction"] * self.config["schedule_max_timesteps"]), - initial_p=self.config["noise_scale"] * 1.0, - final_p=self.config["noise_scale"] * - self.config["exploration_final_eps"]) + initial_p=1.0, + final_p=self.config["exploration_final_scale"]) + else: + # *always* add exploration noise + return ConstantSchedule(1.0) diff --git a/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py b/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py index ad4f879e7..9304cbe0b 100644 --- a/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py +++ b/python/ray/rllib/agents/ddpg/ddpg_policy_graph.py @@ -19,80 +19,18 @@ from ray.rllib.utils.error import UnsupportedSpaceException from ray.rllib.evaluation.policy_graph import PolicyGraph from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph -ACTION_SCOPE = "a_func" -POLICY_SCOPE = "p_func" -POLICY_TARGET_SCOPE = "target_p_func" -Q_SCOPE = "q_func" -Q_TARGET_SCOPE = "target_q_func" -TWIN_Q_SCOPE = "twin_q_func" -TWIN_Q_TARGET_SCOPE = "twin_target_q_func" +ACTION_SCOPE = "action" +POLICY_SCOPE = "policy" +POLICY_TARGET_SCOPE = "target_policy" +Q_SCOPE = "critic" +Q_TARGET_SCOPE = "target_critic" +TWIN_Q_SCOPE = "twin_critic" +TWIN_Q_TARGET_SCOPE = "twin_target_critic" # Importance sampling weights for prioritized replay PRIO_WEIGHTS = "weights" -class ActorCriticLoss(object): - def __init__(self, - q_t, - q_tp1, - q_tp0, - importance_weights, - rewards, - done_mask, - twin_q_t, - twin_q_tp1, - actor_loss_coeff=0.1, - critic_loss_coeff=1.0, - gamma=0.99, - n_step=1, - use_huber=False, - huber_threshold=1.0, - twin_q=False, - policy_delay=1): - - q_t_selected 
= tf.squeeze(q_t, axis=len(q_t.shape) - 1) - if twin_q: - twin_q_t_selected = tf.squeeze(twin_q_t, axis=len(q_t.shape) - 1) - q_tp1 = tf.minimum(q_tp1, twin_q_tp1) - - q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1) - q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best - - # compute RHS of bellman equation - q_t_selected_target = rewards + gamma**n_step * q_tp1_best_masked - - # compute the error (potentially clipped) - if twin_q: - td_error = q_t_selected - tf.stop_gradient(q_t_selected_target) - twin_td_error = twin_q_t_selected - tf.stop_gradient( - q_t_selected_target) - self.td_error = td_error + twin_td_error - if use_huber: - errors = _huber_loss(td_error, huber_threshold) + _huber_loss( - twin_td_error, huber_threshold) - else: - errors = 0.5 * tf.square(td_error) + 0.5 * tf.square( - twin_td_error) - else: - self.td_error = ( - q_t_selected - tf.stop_gradient(q_t_selected_target)) - if use_huber: - errors = _huber_loss(self.td_error, huber_threshold) - else: - errors = 0.5 * tf.square(self.td_error) - - self.critic_loss = critic_loss_coeff * tf.reduce_mean( - importance_weights * errors) - - # for policy gradient, update policy net one time v.s. 
- # update critic net `policy_delay` time(s) - global_step = tf.train.get_or_create_global_step() - policy_delay_mask = tf.to_float( - tf.equal(tf.mod(global_step, policy_delay), 0)) - self.actor_loss = (-1.0 * actor_loss_coeff * policy_delay_mask * - tf.reduce_mean(q_tp0)) - - class DDPGPostprocessing(object): """Implements n-step learning and param noise adjustments.""" @@ -113,12 +51,13 @@ class DDPGPostprocessing(object): feed_dict={ self.cur_observations: states, self.stochastic: False, - self.eps: .0 + self.noise_scale: .0, + self.pure_exploration_phase: False, }) distance_in_action_space = np.sqrt( np.mean(np.square(clean_actions - noisy_actions))) self.pi_distance = distance_in_action_space - if distance_in_action_space < self.config["exploration_sigma"]: + if distance_in_action_space < self.config["exploration_ou_sigma"]: self.parameter_noise_sigma_val *= 1.01 else: self.parameter_noise_sigma_val /= 1.01 @@ -128,107 +67,6 @@ class DDPGPostprocessing(object): return _postprocess_dqn(self, sample_batch) -class PolicyNetwork(object): - """Maps an observations (i.e., state) to an action where each entry takes - value from (0, 1) due to the sigmoid function.""" - - def __init__(self, - model, - dim_actions, - hiddens=[64, 64], - activation="relu", - parameter_noise=False): - action_out = model.last_layer - activation = tf.nn.__dict__[activation] - for hidden in hiddens: - action_out = layers.fully_connected( - action_out, - num_outputs=hidden, - activation_fn=activation, - normalizer_fn=layers.layer_norm if parameter_noise else None) - # Use sigmoid layer to bound values within (0, 1) - # shape of action_scores is [batch_size, dim_actions] - self.action_scores = layers.fully_connected( - action_out, num_outputs=dim_actions, activation_fn=tf.nn.sigmoid) - self.model = model - - -class ActionNetwork(object): - """Acts as a stochastic policy for inference, but a deterministic policy - for training, thus ignoring the batch_size issue when constructing a - 
stochastic action.""" - - def __init__(self, - p_values, - low_action, - high_action, - stochastic, - eps, - theta=0.15, - sigma=0.2, - use_gaussian_noise=False, - act_noise=0.1, - is_target=False, - target_noise=0.2, - noise_clip=0.5, - parameter_noise=False): - - # shape is [None, dim_action] - deterministic_actions = ( - (high_action - low_action) * p_values + low_action) - - if use_gaussian_noise: - if is_target: - normal_sample = tf.random_normal( - tf.shape(deterministic_actions), stddev=target_noise) - normal_sample = tf.clip_by_value(normal_sample, -noise_clip, - noise_clip) - stochastic_actions = tf.clip_by_value( - deterministic_actions + normal_sample, low_action, - high_action) - else: - normal_sample = tf.random_normal( - tf.shape(deterministic_actions), stddev=act_noise) - stochastic_actions = tf.clip_by_value( - deterministic_actions + normal_sample, low_action, - high_action) - else: - exploration_sample = tf.get_variable( - name="ornstein_uhlenbeck", - dtype=tf.float32, - initializer=low_action.size * [.0], - trainable=False) - normal_sample = tf.random_normal( - shape=[low_action.size], mean=0.0, stddev=1.0) - exploration_value = tf.assign_add( - exploration_sample, - theta * (.0 - exploration_sample) + sigma * normal_sample) - stochastic_actions = tf.clip_by_value( - deterministic_actions + - eps * (high_action - low_action) * exploration_value, - low_action, high_action) - - self.actions = tf.cond( - tf.logical_and(stochastic, not parameter_noise), - lambda: stochastic_actions, lambda: deterministic_actions) - - -class QNetwork(object): - def __init__(self, - model, - action_inputs, - hiddens=[64, 64], - activation="relu"): - q_out = tf.concat([model.last_layer, action_inputs], axis=1) - activation = tf.nn.__dict__[activation] - for hidden in hiddens: - q_out = layers.fully_connected( - q_out, num_outputs=hidden, activation_fn=activation) - self.value = layers.fully_connected( - q_out, num_outputs=1, activation_fn=None) - self.model = model - - 
class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph): def __init__(self, observation_space, action_space, config): config = dict(ray.rllib.agents.ddpg.ddpg.DEFAULT_CONFIG, **config) @@ -238,7 +76,8 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph): action_space)) self.config = config - self.cur_epsilon = 1.0 + self.cur_noise_scale = 1.0 + self.cur_pure_exploration_phase = False self.dim_actions = action_space.shape[0] self.low_action = action_space.low self.high_action = action_space.high @@ -246,30 +85,38 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph): # create global step for counting the number of update operations self.global_step = tf.train.get_or_create_global_step() + # use separate optimizers for actor & critic + self._actor_optimizer = tf.train.AdamOptimizer( + learning_rate=self.config["actor_lr"]) + self._critic_optimizer = tf.train.AdamOptimizer( + learning_rate=self.config["critic_lr"]) + # Action inputs self.stochastic = tf.placeholder(tf.bool, (), name="stochastic") - self.eps = tf.placeholder(tf.float32, (), name="eps") + self.noise_scale = tf.placeholder(tf.float32, (), name="noise_scale") + self.pure_exploration_phase = tf.placeholder( + tf.bool, (), name="pure_exploration_phase") self.cur_observations = tf.placeholder( tf.float32, shape=(None, ) + observation_space.shape, name="cur_obs") - # Actor: P (policy) network with tf.variable_scope(POLICY_SCOPE) as scope: - p_values, self.p_model = self._build_p_network( + policy_out, self.policy_model = self._build_policy_network( self.cur_observations, observation_space, action_space) - self.p_func_vars = _scope_vars(scope.name) + self.policy_vars = _scope_vars(scope.name) # Noise vars for P network except for layer normalization vars if self.config["parameter_noise"]: self._build_parameter_noise([ - var for var in self.p_func_vars if "LayerNorm" not in var.name + var for var in self.policy_vars if "LayerNorm" not in var.name ]) # Action outputs with 
tf.variable_scope(ACTION_SCOPE): - self.output_actions = self._build_action_network( - p_values, self.stochastic, self.eps) + self.output_actions = self._add_exploration_noise( + policy_out, self.stochastic, self.noise_scale, + self.pure_exploration_phase, action_space) if self.config["smooth_target_policy"]: self.reset_noise_op = tf.no_op() @@ -293,37 +140,41 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph): self.importance_weights = tf.placeholder( tf.float32, [None], name="weight") - # p network evaluation + # policy network evaluation with tf.variable_scope(POLICY_SCOPE, reuse=True) as scope: prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - self.p_t, _ = self._build_p_network(self.obs_t, observation_space, - action_space) - p_batchnorm_update_ops = list( + self.policy_t, _ = self._build_policy_network( + self.obs_t, observation_space, action_space) + policy_batchnorm_update_ops = list( set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops) - # target p network evaluation + # target policy network evaluation with tf.variable_scope(POLICY_TARGET_SCOPE) as scope: - p_tp1, _ = self._build_p_network(self.obs_tp1, observation_space, - action_space) - target_p_func_vars = _scope_vars(scope.name) + policy_tp1, _ = self._build_policy_network( + self.obs_tp1, observation_space, action_space) + target_policy_vars = _scope_vars(scope.name) # Action outputs with tf.variable_scope(ACTION_SCOPE, reuse=True): - output_actions = self._build_action_network( - self.p_t, - stochastic=tf.constant(value=False, dtype=tf.bool), - eps=.0) - output_actions_estimated = self._build_action_network( - p_tp1, - stochastic=tf.constant( - value=self.config["smooth_target_policy"], dtype=tf.bool), - eps=.0, - is_target=True) + if config["smooth_target_policy"]: + target_noise_clip = self.config["target_noise_clip"] + clipped_normal_sample = tf.clip_by_value( + tf.random_normal( + tf.shape(policy_tp1), + stddev=self.config["target_noise"]), + 
-target_noise_clip, target_noise_clip) + policy_tp1_smoothed = tf.clip_by_value( + policy_tp1 + clipped_normal_sample, action_space.low, + action_space.high) + else: + # no smoothing, just use deterministic actions + policy_tp1_smoothed = policy_tp1 # q network evaluation prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) with tf.variable_scope(Q_SCOPE) as scope: + # Q-values for the given actions & observations q_t, self.q_model = self._build_q_network( self.obs_t, observation_space, action_space, self.act_t) self.q_func_vars = _scope_vars(scope.name) @@ -333,8 +184,9 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph): "min_q": tf.reduce_min(q_t), } with tf.variable_scope(Q_SCOPE, reuse=True): - q_tp0, _ = self._build_q_network(self.obs_t, observation_space, - action_space, output_actions) + # Q-values for current policy (no noise) in given current state + q_t_det_policy, _ = self._build_q_network( + self.obs_t, observation_space, action_space, self.policy_t) if self.config["twin_q"]: with tf.variable_scope(TWIN_Q_SCOPE) as scope: twin_q_t, self.twin_q_model = self._build_q_network( @@ -343,38 +195,41 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph): q_batchnorm_update_ops = list( set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops) - # target q network evalution + # target q network evaluation with tf.variable_scope(Q_TARGET_SCOPE) as scope: q_tp1, _ = self._build_q_network(self.obs_tp1, observation_space, - action_space, - output_actions_estimated) + action_space, policy_tp1_smoothed) target_q_func_vars = _scope_vars(scope.name) if self.config["twin_q"]: with tf.variable_scope(TWIN_Q_TARGET_SCOPE) as scope: twin_q_tp1, _ = self._build_q_network( self.obs_tp1, observation_space, action_space, - output_actions_estimated) + policy_tp1_smoothed) twin_target_q_func_vars = _scope_vars(scope.name) if self.config["twin_q"]: - self.loss = self._build_actor_critic_loss( - q_t, q_tp1, q_tp0, twin_q_t=twin_q_t, 
twin_q_tp1=twin_q_tp1) + self.critic_loss, self.actor_loss, self.td_error \ + = self._build_actor_critic_loss( + q_t, q_tp1, q_t_det_policy, twin_q_t=twin_q_t, + twin_q_tp1=twin_q_tp1) else: - self.loss = self._build_actor_critic_loss(q_t, q_tp1, q_tp0) + self.critic_loss, self.actor_loss, self.td_error \ + = self._build_actor_critic_loss( + q_t, q_tp1, q_t_det_policy) if config["l2_reg"] is not None: - for var in self.p_func_vars: + for var in self.policy_vars: if "bias" not in var.name: - self.loss.actor_loss += ( + self.actor_loss += ( config["l2_reg"] * 0.5 * tf.nn.l2_loss(var)) for var in self.q_func_vars: if "bias" not in var.name: - self.loss.critic_loss += ( + self.critic_loss += ( config["l2_reg"] * 0.5 * tf.nn.l2_loss(var)) if self.config["twin_q"]: for var in self.twin_q_func_vars: if "bias" not in var.name: - self.loss.critic_loss += ( + self.critic_loss += ( config["l2_reg"] * 0.5 * tf.nn.l2_loss(var)) # update_target_fn will be called periodically to copy Q network to @@ -396,8 +251,8 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph): var_target.assign(self.tau * var + (1.0 - self.tau) * var_target)) for var, var_target in zip( - sorted(self.p_func_vars, key=lambda v: v.name), - sorted(target_p_func_vars, key=lambda v: v.name)): + sorted(self.policy_vars, key=lambda v: v.name), + sorted(target_policy_vars, key=lambda v: v.name)): update_target_expr.append( var_target.assign(self.tau * var + (1.0 - self.tau) * var_target)) @@ -414,14 +269,15 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph): ] input_dict = dict(self.loss_inputs) - # Model self-supervised losses - self.loss.actor_loss = self.p_model.custom_loss( - self.loss.actor_loss, input_dict) - self.loss.critic_loss = self.q_model.custom_loss( - self.loss.critic_loss, input_dict) - if self.config["twin_q"]: - self.loss.critic_loss = self.twin_q_model.custom_loss( - self.loss.critic_loss, input_dict) + if self.config["use_state_preprocessor"]: + # Model self-supervised losses + 
self.actor_loss = self.policy_model.custom_loss( + self.actor_loss, input_dict) + self.critic_loss = self.q_model.custom_loss( + self.critic_loss, input_dict) + if self.config["twin_q"]: + self.critic_loss = self.twin_q_model.custom_loss( + self.critic_loss, input_dict) TFPolicyGraph.__init__( self, @@ -430,62 +286,92 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph): self.sess, obs_input=self.cur_observations, action_sampler=self.output_actions, - loss=self.loss.actor_loss + self.loss.critic_loss, + loss=self.actor_loss + self.critic_loss, loss_inputs=self.loss_inputs, - update_ops=q_batchnorm_update_ops + p_batchnorm_update_ops) + update_ops=q_batchnorm_update_ops + policy_batchnorm_update_ops) self.sess.run(tf.global_variables_initializer()) # Note that this encompasses both the policy and Q-value networks and # their corresponding target networks self.variables = ray.experimental.tf_utils.TensorFlowVariables( - tf.group(q_tp0, q_tp1), self.sess) + tf.group(q_t_det_policy, q_tp1), self.sess) # Hard initial update self.update_target(tau=1.0) @override(TFPolicyGraph) def optimizer(self): - return tf.train.AdamOptimizer(learning_rate=self.config["lr"]) + # we don't use this because we have two separate optimisers + return None + + @override(TFPolicyGraph) + def build_apply_op(self, optimizer, grads_and_vars): + # for policy gradient, update policy net one time v.s. 
+ # update critic net `policy_delay` time(s) + should_apply_actor_opt = tf.equal( + tf.mod(self.global_step, self.config["policy_delay"]), 0) + + def make_apply_op(): + return self._actor_optimizer.apply_gradients( + self._actor_grads_and_vars) + + actor_op = tf.cond( + should_apply_actor_opt, + true_fn=make_apply_op, + false_fn=lambda: tf.no_op()) + critic_op = self._critic_optimizer.apply_gradients( + self._critic_grads_and_vars) + # increment global step & apply ops + with tf.control_dependencies([tf.assign_add(self.global_step, 1)]): + return tf.group(actor_op, critic_op) @override(TFPolicyGraph) def gradients(self, optimizer, loss): if self.config["grad_norm_clipping"] is not None: actor_grads_and_vars = _minimize_and_clip( - optimizer, - self.loss.actor_loss, - var_list=self.p_func_vars, + self._actor_optimizer, + self.actor_loss, + var_list=self.policy_vars, clip_val=self.config["grad_norm_clipping"]) critic_grads_and_vars = _minimize_and_clip( - optimizer, - self.loss.critic_loss, + self._critic_optimizer, + self.critic_loss, var_list=self.q_func_vars + self.twin_q_func_vars if self.config["twin_q"] else self.q_func_vars, clip_val=self.config["grad_norm_clipping"]) else: - actor_grads_and_vars = optimizer.compute_gradients( - self.loss.actor_loss, var_list=self.p_func_vars) - critic_grads_and_vars = optimizer.compute_gradients( - self.loss.critic_loss, - var_list=self.q_func_vars + self.twin_q_func_vars - if self.config["twin_q"] else self.q_func_vars) - actor_grads_and_vars = [(g, v) for (g, v) in actor_grads_and_vars - if g is not None] - critic_grads_and_vars = [(g, v) for (g, v) in critic_grads_and_vars - if g is not None] - grads_and_vars = actor_grads_and_vars + critic_grads_and_vars + actor_grads_and_vars = self._actor_optimizer.compute_gradients( + self.actor_loss, var_list=self.policy_vars) + if self.config["twin_q"]: + critic_vars = self.q_func_vars + self.twin_q_func_vars + else: + critic_vars = self.q_func_vars + critic_grads_and_vars = 
self._critic_optimizer.compute_gradients( + self.critic_loss, var_list=critic_vars) + # save these for later use in build_apply_op + self._actor_grads_and_vars = [(g, v) for (g, v) in actor_grads_and_vars + if g is not None] + self._critic_grads_and_vars = [(g, v) + for (g, v) in critic_grads_and_vars + if g is not None] + grads_and_vars = self._actor_grads_and_vars \ + + self._critic_grads_and_vars return grads_and_vars @override(TFPolicyGraph) def extra_compute_action_feed_dict(self): return { + # FIXME: what about turning off exploration? Isn't that a good + # idea? self.stochastic: True, - self.eps: self.cur_epsilon, + self.noise_scale: self.cur_noise_scale, + self.pure_exploration_phase: self.cur_pure_exploration_phase, } @override(TFPolicyGraph) def extra_compute_grad_fetches(self): return { - "td_error": self.loss.td_error, + "td_error": self.td_error, LEARNER_STATS_KEY: self.stats, } @@ -499,59 +385,192 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph): @override(PolicyGraph) def get_state(self): - return [TFPolicyGraph.get_state(self), self.cur_epsilon] + return [ + TFPolicyGraph.get_state(self), self.cur_noise_scale, + self.cur_pure_exploration_phase + ] @override(PolicyGraph) def set_state(self, state): TFPolicyGraph.set_state(self, state[0]) self.set_epsilon(state[1]) + self.set_pure_exploration_phase(state[2]) def _build_q_network(self, obs, obs_space, action_space, actions): - q_net = QNetwork( - ModelCatalog.get_model({ + if self.config["use_state_preprocessor"]: + q_model = ModelCatalog.get_model({ "obs": obs, "is_training": self._get_is_training_placeholder(), - }, obs_space, action_space, 1, self.config["model"]), actions, - self.config["critic_hiddens"], - self.config["critic_hidden_activation"]) - return q_net.value, q_net.model + }, obs_space, action_space, 1, self.config["model"]) + q_out = tf.concat([q_model.last_layer, actions], axis=1) + else: + q_model = None + q_out = tf.concat([obs, actions], axis=1) - def 
_build_p_network(self, obs, obs_space, action_space): - policy_net = PolicyNetwork( - ModelCatalog.get_model({ + activation = getattr(tf.nn, self.config["critic_hidden_activation"]) + for hidden in self.config["critic_hiddens"]: + q_out = layers.fully_connected( + q_out, num_outputs=hidden, activation_fn=activation) + q_values = layers.fully_connected( + q_out, num_outputs=1, activation_fn=None) + + return q_values, q_model + + def _build_policy_network(self, obs, obs_space, action_space): + if self.config["use_state_preprocessor"]: + model = ModelCatalog.get_model({ "obs": obs, "is_training": self._get_is_training_placeholder(), - }, obs_space, action_space, 1, self.config["model"]), - self.dim_actions, self.config["actor_hiddens"], - self.config["actor_hidden_activation"], - self.config["parameter_noise"]) - return policy_net.action_scores, policy_net.model + }, obs_space, action_space, 1, self.config["model"]) + action_out = model.last_layer + else: + model = None + action_out = obs - def _build_action_network(self, p_values, stochastic, eps, - is_target=False): - return ActionNetwork( - p_values, self.low_action, self.high_action, stochastic, eps, - self.config["exploration_theta"], self.config["exploration_sigma"], - self.config["smooth_target_policy"], self.config["act_noise"], - is_target, self.config["target_noise"], - self.config["noise_clip"]).actions + activation = getattr(tf.nn, self.config["actor_hidden_activation"]) + normalizer_fn = layers.layer_norm if self.config["parameter_noise"] \ + else None + for hidden in self.config["actor_hiddens"]: + action_out = layers.fully_connected( + action_out, + num_outputs=hidden, + activation_fn=activation, + normalizer_fn=normalizer_fn) + action_out = layers.fully_connected( + action_out, num_outputs=self.dim_actions, activation_fn=None) + + # Use sigmoid to scale to [0,1], but also double magnitude of input to + # emulate behaviour of tanh activation used in DDPG and TD3 papers. 
+ sigmoid_out = tf.nn.sigmoid(2 * action_out) + # Rescale to actual env policy scale + # (shape of sigmoid_out is [batch_size, dim_actions], so we reshape to + # get same dims) + action_range = (action_space.high - action_space.low)[None] + low_action = action_space.low[None] + actions = action_range * sigmoid_out + low_action + + return actions, model + + def _add_exploration_noise(self, deterministic_actions, + should_be_stochastic, noise_scale, + enable_pure_exploration, action_space): + noise_type = self.config["exploration_noise_type"] + action_low = action_space.low + action_high = action_space.high + action_range = action_space.high - action_low + + def compute_stochastic_actions(): + def make_noisy_actions(): + # shape of deterministic_actions is [None, dim_action] + if noise_type == "gaussian": + # add IID Gaussian noise for exploration, TD3-style + normal_sample = noise_scale * tf.random_normal( + tf.shape(deterministic_actions), + stddev=self.config["exploration_gaussian_sigma"]) + stochastic_actions = tf.clip_by_value( + deterministic_actions + normal_sample, action_low, + action_high) + elif noise_type == "ou": + # add OU noise for exploration, DDPG-style + zero_acts = action_low.size * [.0] + exploration_sample = tf.get_variable( + name="ornstein_uhlenbeck", + dtype=tf.float32, + initializer=zero_acts, + trainable=False) + normal_sample = tf.random_normal( + shape=[action_low.size], mean=0.0, stddev=1.0) + ou_new = self.config["exploration_ou_theta"] \ + * -exploration_sample \ + + self.config["exploration_ou_sigma"] * normal_sample + exploration_value = tf.assign_add(exploration_sample, + ou_new) + base_scale = self.config["exploration_ou_noise_scale"] + noise = noise_scale * base_scale \ + * exploration_value * action_range + stochastic_actions = tf.clip_by_value( + deterministic_actions + noise, action_low, action_high) + else: + raise ValueError( + "Unknown noise type '%s' (try 'ou' or 'gaussian')" % + noise_type) + return stochastic_actions + + 
def make_uniform_random_actions(): + # pure random exploration option + uniform_random_actions = tf.random.uniform( + tf.shape(deterministic_actions)) + # rescale uniform random actions according to action range + tf_range = tf.constant(action_range[None], dtype="float32") + tf_low = tf.constant(action_low[None], dtype="float32") + uniform_random_actions = uniform_random_actions * tf_range \ + + tf_low + return uniform_random_actions + + stochastic_actions = tf.cond( + # need to condition on noise_scale > 0 because zeroing + # noise_scale is how evaluator signals no noise should be used + # (this is ugly and should be fixed by adding an "eval_mode" + # config flag or something) + tf.logical_and(enable_pure_exploration, noise_scale > 0), + true_fn=make_uniform_random_actions, + false_fn=make_noisy_actions) + return stochastic_actions + + enable_stochastic = tf.logical_and(should_be_stochastic, + not self.config["parameter_noise"]) + actions = tf.cond(enable_stochastic, compute_stochastic_actions, + lambda: deterministic_actions) + return actions def _build_actor_critic_loss(self, q_t, q_tp1, - q_tp0, + q_t_det_policy, twin_q_t=None, twin_q_tp1=None): - return ActorCriticLoss( - q_t, q_tp1, q_tp0, self.importance_weights, self.rew_t, - self.done_mask, twin_q_t, twin_q_tp1, - self.config["actor_loss_coeff"], self.config["critic_loss_coeff"], - self.config["gamma"], self.config["n_step"], - self.config["use_huber"], self.config["huber_threshold"], - self.config["twin_q"]) + twin_q = self.config["twin_q"] + gamma = self.config["gamma"] + n_step = self.config["n_step"] + use_huber = self.config["use_huber"] + huber_threshold = self.config["huber_threshold"] + + q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1) + if twin_q: + twin_q_t_selected = tf.squeeze(twin_q_t, axis=len(q_t.shape) - 1) + q_tp1 = tf.minimum(q_tp1, twin_q_tp1) + + q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1) + q_tp1_best_masked = (1.0 - self.done_mask) * q_tp1_best + + # 
compute RHS of bellman equation + q_t_selected_target = tf.stop_gradient( + self.rew_t + gamma**n_step * q_tp1_best_masked) + + # compute the error (potentially clipped) + if twin_q: + td_error = q_t_selected - q_t_selected_target + twin_td_error = twin_q_t_selected - q_t_selected_target + td_error = td_error + twin_td_error + if use_huber: + errors = _huber_loss(td_error, huber_threshold) \ + + _huber_loss(twin_td_error, huber_threshold) + else: + errors = 0.5 * tf.square(td_error) + 0.5 * tf.square( + twin_td_error) + else: + td_error = q_t_selected - q_t_selected_target + if use_huber: + errors = _huber_loss(td_error, huber_threshold) + else: + errors = 0.5 * tf.square(td_error) + + critic_loss = tf.reduce_mean(self.importance_weights * errors) + actor_loss = -tf.reduce_mean(q_t_det_policy) + return critic_loss, actor_loss, td_error def _build_parameter_noise(self, pnet_params): - self.parameter_noise_sigma_val = self.config["exploration_sigma"] + self.parameter_noise_sigma_val = self.config["exploration_ou_sigma"] self.parameter_noise_sigma = tf.get_variable( initializer=tf.constant_initializer( self.parameter_noise_sigma_val), @@ -590,7 +609,7 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph): def compute_td_error(self, obs_t, act_t, rew_t, obs_tp1, done_mask, importance_weights): td_err = self.sess.run( - self.loss.td_error, + self.td_error, feed_dict={ self.obs_t: [np.array(ob) for ob in obs_t], self.act_t: act_t, @@ -610,9 +629,16 @@ class DDPGPolicyGraph(DDPGPostprocessing, TFPolicyGraph): # support both hard and soft sync def update_target(self, tau=None): + tau = tau or self.tau_value return self.sess.run( - self.update_target_expr, - feed_dict={self.tau: tau or self.tau_value}) + self.update_target_expr, feed_dict={self.tau: tau}) def set_epsilon(self, epsilon): - self.cur_epsilon = epsilon + # set_epsilon is called by optimizer to anneal exploration as + # necessary, and to turn it off during evaluation. 
The "epsilon" part + # is a carry-over from DQN, which uses epsilon-greedy exploration + # rather than adding action noise to the output of a policy network. + self.cur_noise_scale = epsilon + + def set_pure_exploration_phase(self, pure_exploration_phase): + self.cur_pure_exploration_phase = pure_exploration_phase diff --git a/python/ray/rllib/agents/ddpg/td3.py b/python/ray/rllib/agents/ddpg/td3.py new file mode 100644 index 000000000..714c39c6b --- /dev/null +++ b/python/ray/rllib/agents/ddpg/td3.py @@ -0,0 +1,57 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, \ + DEFAULT_CONFIG as DDPG_CONFIG +from ray.rllib.utils import merge_dicts + +TD3_DEFAULT_CONFIG = merge_dicts( + DDPG_CONFIG, + { + # largest changes: twin Q functions, delayed policy updates, and target + # smoothing + "twin_q": True, + "policy_delay": 2, + "smooth_target_policy": True, + "target_noise": 0.2, + "target_noise_clip": 0.5, + + # other changes & things we want to keep fixed: IID Gaussian + # exploration noise, larger actor learning rate, no l2 regularisation, + # no Huber loss, etc. + "exploration_should_anneal": False, + "exploration_noise_type": "gaussian", + "exploration_gaussian_sigma": 0.1, + "learning_starts": 10000, + "pure_exploration_steps": 10000, + "actor_hiddens": [400, 300], + "critic_hiddens": [400, 300], + "n_step": 1, + "gamma": 0.99, + "actor_lr": 1e-3, + "critic_lr": 1e-3, + "l2_reg": 0.0, + "tau": 5e-3, + "train_batch_size": 100, + "use_huber": False, + "target_network_update_freq": 0, + "optimizer_class": "SyncReplayOptimizer", + "num_workers": 0, + "num_gpus_per_worker": 0, + "per_worker_exploration": False, + "worker_side_prioritization": False, + "buffer_size": 1000000, + "prioritized_replay": False, + "clip_rewards": False, + "use_state_preprocessor": False, + }, +) + + +class TD3Trainer(DDPGTrainer): + """A more stable successor to DDPG.
By default, this uses a near-identical + configuration to that reported in the TD3 paper.""" + + _name = "TD3" + _default_config = TD3_DEFAULT_CONFIG diff --git a/python/ray/rllib/agents/registry.py b/python/ray/rllib/agents/registry.py index 7b133fa81..aa70275ad 100644 --- a/python/ray/rllib/agents/registry.py +++ b/python/ray/rllib/agents/registry.py @@ -34,6 +34,11 @@ def _import_apex_ddpg(): return ddpg.ApexDDPGTrainer +def _import_td3(): + from ray.rllib.agents import ddpg + return ddpg.TD3Trainer + + def _import_ppo(): from ray.rllib.agents import ppo return ppo.PPOTrainer @@ -87,6 +92,7 @@ def _import_marwil(): ALGORITHMS = { "DDPG": _import_ddpg, "APEX_DDPG": _import_apex_ddpg, + "TD3": _import_td3, "PPO": _import_ppo, "ES": _import_es, "ARS": _import_ars, diff --git a/python/ray/rllib/tests/test_checkpoint_restore.py b/python/ray/rllib/tests/test_checkpoint_restore.py index 3b16ad1dd..68fe6e7cb 100644 --- a/python/ray/rllib/tests/test_checkpoint_restore.py +++ b/python/ray/rllib/tests/test_checkpoint_restore.py @@ -40,7 +40,8 @@ CONFIGS = { }, }, "DDPG": { - "noise_scale": 0.0, + "pure_exploration_steps": 0, + "exploration_ou_noise_scale": 0.0, "timesteps_per_iteration": 100 }, "PPO": { diff --git a/python/ray/rllib/tests/test_supported_spaces.py b/python/ray/rllib/tests/test_supported_spaces.py index 7d59a04fb..c3ea442c8 100644 --- a/python/ray/rllib/tests/test_supported_spaces.py +++ b/python/ray/rllib/tests/test_supported_spaces.py @@ -116,8 +116,9 @@ class ModelSupportedSpaces(unittest.TestCase): check_support("APPO", {"num_gpus": 0, "vtrace": False}, stats) check_support( "DDPG", { - "noise_scale": 100.0, - "timesteps_per_iteration": 1 + "exploration_ou_noise_scale": 100.0, + "timesteps_per_iteration": 1, + "use_state_preprocessor": True, }, stats, check_bounds=True) @@ -188,6 +189,7 @@ class ModelSupportedSpaces(unittest.TestCase): "min_iter_time_s": 1, "learning_starts": 1000, "target_network_update_freq": 100, + "use_state_preprocessor": True, }) 
check_support_multiagent("IMPALA", {"num_gpus": 0}) check_support_multiagent("DQN", {"timesteps_per_iteration": 1}) @@ -206,7 +208,10 @@ class ModelSupportedSpaces(unittest.TestCase): "sgd_minibatch_size": 1, }) check_support_multiagent("PG", {"num_workers": 1, "optimizer": {}}) - check_support_multiagent("DDPG", {"timesteps_per_iteration": 1}) + check_support_multiagent("DDPG", { + "timesteps_per_iteration": 1, + "use_state_preprocessor": True, + }) if __name__ == "__main__": diff --git a/python/ray/rllib/tuned_examples/halfcheetah-ddpg.yaml b/python/ray/rllib/tuned_examples/halfcheetah-ddpg.yaml index f02399ab3..6a4bd52e7 100644 --- a/python/ray/rllib/tuned_examples/halfcheetah-ddpg.yaml +++ b/python/ray/rllib/tuned_examples/halfcheetah-ddpg.yaml @@ -15,13 +15,14 @@ halfcheetah-ddpg: env_config: {} # === Exploration === + exploration_should_anneal: True schedule_max_timesteps: 100000 timesteps_per_iteration: 1000 exploration_fraction: 0.1 - exploration_final_eps: 0.02 - noise_scale: 0.1 - exploration_theta: 0.15 - exploration_sigma: 0.2 + exploration_final_scale: 0.02 + exploration_ou_noise_scale: 0.1 + exploration_ou_theta: 0.15 + exploration_ou_sigma: 0.2 target_network_update_freq: 0 tau: 0.001 @@ -34,9 +35,8 @@ halfcheetah-ddpg: clip_rewards: False # === Optimization === - lr: 0.001 - actor_loss_coeff: 0.1 - critic_loss_coeff: 1.0 + actor_lr: 0.001 + critic_lr: 0.001 use_huber: False huber_threshold: 1.0 l2_reg: 0.000001 @@ -50,3 +50,7 @@ halfcheetah-ddpg: optimizer_class: "SyncReplayOptimizer" per_worker_exploration: False worker_side_prioritization: False + + # === Evaluation === + evaluation_interval: 5 + evaluation_num_episodes: 10 diff --git a/python/ray/rllib/tuned_examples/invertedpendulum-td3.yaml b/python/ray/rllib/tuned_examples/invertedpendulum-td3.yaml new file mode 100644 index 000000000..f215c0300 --- /dev/null +++ b/python/ray/rllib/tuned_examples/invertedpendulum-td3.yaml @@ -0,0 +1,22 @@ +invertedpendulum-td3: + # This is a TD3 with stopping 
conditions and network size tuned specifically + # for InvertedPendulum. Should be able to reach 1,000 reward (the maximum + # achievable) in 10,000 to 20,000 steps. + env: InvertedPendulum-v2 + run: TD3 + stop: + episode_reward_mean: 9999.9 + time_total_s: 900 # 15 minutes + timesteps_total: 1000000 + config: + # === Model === + actor_hiddens: [32, 32] + critic_hiddens: [32, 32] + + # === Exploration === + learning_starts: 1000 + pure_exploration_steps: 1000 + + # === Evaluation === + evaluation_interval: 1 + evaluation_num_episodes: 5 diff --git a/python/ray/rllib/tuned_examples/mountaincarcontinuous-apex-ddpg.yaml b/python/ray/rllib/tuned_examples/mountaincarcontinuous-apex-ddpg.yaml index 82947d872..9e8923ffe 100644 --- a/python/ray/rllib/tuned_examples/mountaincarcontinuous-apex-ddpg.yaml +++ b/python/ray/rllib/tuned_examples/mountaincarcontinuous-apex-ddpg.yaml @@ -7,7 +7,9 @@ mountaincarcontinuous-apex-ddpg: config: clip_rewards: False num_workers: 16 - noise_scale: 1.0 + exploration_ou_noise_scale: 1.0 n_step: 3 target_network_update_freq: 50000 tau: 1.0 + evaluation_interval: 5 + evaluation_num_episodes: 10 diff --git a/python/ray/rllib/tuned_examples/mountaincarcontinuous-ddpg.yaml b/python/ray/rllib/tuned_examples/mountaincarcontinuous-ddpg.yaml index e74b2e0f1..3a8f61229 100644 --- a/python/ray/rllib/tuned_examples/mountaincarcontinuous-ddpg.yaml +++ b/python/ray/rllib/tuned_examples/mountaincarcontinuous-ddpg.yaml @@ -15,13 +15,14 @@ mountaincarcontinuous-ddpg: env_config: {} # === Exploration === + exploration_should_anneal: True schedule_max_timesteps: 100000 timesteps_per_iteration: 1000 exploration_fraction: 0.4 - exploration_final_eps: 0.02 - noise_scale: 0.75 - exploration_theta: 0.15 - exploration_sigma: 0.2 + exploration_final_scale: 0.02 + exploration_ou_noise_scale: 0.75 + exploration_ou_theta: 0.15 + exploration_ou_sigma: 0.2 target_network_update_freq: 0 tau: 0.01 @@ -34,9 +35,8 @@ mountaincarcontinuous-ddpg: clip_rewards: False # === 
Optimization === - lr: 0.001 - actor_loss_coeff: 0.1 - critic_loss_coeff: 1.0 + actor_lr: 0.001 + critic_lr: 0.001 use_huber: False huber_threshold: 1.0 l2_reg: 0.00001 @@ -50,3 +50,7 @@ mountaincarcontinuous-ddpg: optimizer_class: "SyncReplayOptimizer" per_worker_exploration: False worker_side_prioritization: False + + # === Evaluation === + evaluation_interval: 5 + evaluation_num_episodes: 10 diff --git a/python/ray/rllib/tuned_examples/mujoco-td3.yaml b/python/ray/rllib/tuned_examples/mujoco-td3.yaml new file mode 100644 index 000000000..8f626b40b --- /dev/null +++ b/python/ray/rllib/tuned_examples/mujoco-td3.yaml @@ -0,0 +1,24 @@ +mujoco-td3: + # Solve latest versions of the four hardest Mujoco tasks benchmarked in the + # original TD3 paper. Average returns over 10 trials at end of 1,000,000 + # timesteps (taken from Table 2 of the paper) are given in parens at the end + # of each environment name. + # + # Paper is at https://arxiv.org/pdf/1802.09477.pdf + env: + grid_search: + - HalfCheetah-v2 # (9,532.99) + - Hopper-v2 # (3,304.75) + - Walker2d-v2 # (4,565.24) + - Ant-v2 # (4,185.06) + run: TD3 + stop: + timesteps_total: 1000000 + config: + # === Exploration === + learning_starts: 10000 + pure_exploration_steps: 10000 + + # === Evaluation === + evaluation_interval: 5 + evaluation_num_episodes: 10 diff --git a/python/ray/rllib/tuned_examples/pendulum-apex-ddpg.yaml b/python/ray/rllib/tuned_examples/pendulum-apex-ddpg.yaml index f7a7c71f6..7122b577e 100644 --- a/python/ray/rllib/tuned_examples/pendulum-apex-ddpg.yaml +++ b/python/ray/rllib/tuned_examples/pendulum-apex-ddpg.yaml @@ -11,3 +11,5 @@ pendulum-apex-ddpg: n_step: 1 target_network_update_freq: 50000 tau: 1.0 + evaluation_interval: 5 + evaluation_num_episodes: 10 diff --git a/python/ray/rllib/tuned_examples/pendulum-ddpg.yaml b/python/ray/rllib/tuned_examples/pendulum-ddpg.yaml index 38b93ea72..59891a86b 100644 --- a/python/ray/rllib/tuned_examples/pendulum-ddpg.yaml +++
b/python/ray/rllib/tuned_examples/pendulum-ddpg.yaml @@ -15,13 +15,14 @@ pendulum-ddpg: env_config: {} # === Exploration === + exploration_should_anneal: True schedule_max_timesteps: 100000 timesteps_per_iteration: 600 exploration_fraction: 0.1 - exploration_final_eps: 0.02 - noise_scale: 0.1 - exploration_theta: 0.15 - exploration_sigma: 0.2 + exploration_final_scale: 0.02 + exploration_ou_noise_scale: 0.1 + exploration_ou_theta: 0.15 + exploration_ou_sigma: 0.2 target_network_update_freq: 0 tau: 0.001 @@ -34,9 +35,8 @@ pendulum-ddpg: clip_rewards: False # === Optimization === - lr: 0.001 - actor_loss_coeff: 0.1 - critic_loss_coeff: 1.0 + actor_lr: 0.001 + critic_lr: 0.001 use_huber: True huber_threshold: 1.0 l2_reg: 0.000001 @@ -50,3 +50,7 @@ pendulum-ddpg: optimizer_class: "SyncReplayOptimizer" per_worker_exploration: False worker_side_prioritization: False + + # === Evaluation === + evaluation_interval: 5 + evaluation_num_episodes: 10 diff --git a/python/ray/rllib/tuned_examples/pendulum-td3.yaml b/python/ray/rllib/tuned_examples/pendulum-td3.yaml index 25b0900d6..77211cf7a 100644 --- a/python/ray/rllib/tuned_examples/pendulum-td3.yaml +++ b/python/ray/rllib/tuned_examples/pendulum-td3.yaml @@ -1,60 +1,19 @@ # This configuration can expect to reach -160 reward in 10k-20k timesteps pendulum-ddpg: env: Pendulum-v0 - run: DDPG + run: TD3 stop: - episode_reward_mean: -160 - time_total_s: 600 # 10 minutes + episode_reward_mean: -130 + time_total_s: 900 # 15 minutes config: - # === Tricks === - twin_q: True - policy_delay: 2 - smooth_target_policy: True - act_noise: 0.1 - target_noise: 0.2 - noise_clip: 0.5 - # === Model === actor_hiddens: [64, 64] critic_hiddens: [64, 64] - n_step: 1 - model: {} - gamma: 0.99 - env_config: {} # === Exploration === - schedule_max_timesteps: 100000 - timesteps_per_iteration: 600 - exploration_fraction: 0.1 - exploration_final_eps: 0.02 - noise_scale: 0.1 - exploration_theta: 0.15 - exploration_sigma: 0.2 - target_network_update_freq:
0 - tau: 0.001 + learning_starts: 5000 + pure_exploration_steps: 5000 - # === Replay buffer === - buffer_size: 10000 - prioritized_replay: True - prioritized_replay_alpha: 0.6 - prioritized_replay_beta: 0.4 - prioritized_replay_eps: 0.000001 - clip_rewards: False - - # === Optimization === - lr: 0.001 - actor_loss_coeff: 0.1 - critic_loss_coeff: 1.0 - use_huber: True - huber_threshold: 1.0 - l2_reg: 0.000001 - learning_starts: 500 - sample_batch_size: 1 - train_batch_size: 64 - - # === Parallelism === - num_workers: 0 - num_gpus_per_worker: 0 - optimizer_class: "SyncReplayOptimizer" - per_worker_exploration: False - worker_side_prioritization: False + # === Evaluation === + evaluation_interval: 1 + evaluation_num_episodes: 5