diff --git a/ci/travis/ci.sh b/ci/travis/ci.sh index 18d312e96..2b0d79259 100755 --- a/ci/travis/ci.sh +++ b/ci/travis/ci.sh @@ -201,7 +201,7 @@ build_sphinx_docs() { if [ "${OSTYPE}" = msys ]; then echo "WARNING: Documentation not built on Windows due to currently-unresolved issues" else - sphinx-build -q -E -T -b html source _build/html + sphinx-build -q -E -W -T -b html source _build/html fi ) } diff --git a/doc/source/rllib-env.rst b/doc/source/rllib-env.rst index 2b72055ff..914bfccdf 100644 --- a/doc/source/rllib-env.rst +++ b/doc/source/rllib-env.rst @@ -210,12 +210,13 @@ PettingZoo Multi-Agent Environments `PettingZoo `__ is a repository of over 50 diverse multi-agent environments. However, the API is note directly compatible with rllib, but it can be converted into an rllib MultiAgentEnv like in this example .. code-block:: python + from ray.tune.registry import register_env # import the pettingzoo environment from pettingzoo.gamma import prison_v0 # import rllib pettingzoo interface from ray.rllib.env import PettingZooEnv - # define how to make the environment. This way takes an optinoal environment config, num_floors + # define how to make the environment. This way takes an optional environment config, num_floors env_creator = lambda config: prison_v0.env(num_floors=config.get("num_floors", 4)) # register that way to make the environment under an rllib name register_env('prison', lambda config: PettingZooEnv(env_creator(config))) diff --git a/doc/source/rllib.rst b/doc/source/rllib.rst index 7bc829e51..eb60390bc 100644 --- a/doc/source/rllib.rst +++ b/doc/source/rllib.rst @@ -38,7 +38,7 @@ Then, you can try out training in the following equivalent ways: from ray import tune from ray.rllib.agents.ppo import PPOTrainer tune.run(PPOTrainer, config={"env": "CartPole-v0"}) # "log_level": "INFO" for verbose, - # "framework": "tfe" for tf-eager execution, + # "framework": "tfe" for tf-eager, # "framework": "torch" for PyTorch Next, we'll cover three key concepts in RLlib: Policies, Samples, and Trainers. diff --git a/python/ray/serve/api.py b/python/ray/serve/api.py index eb2daff43..bd2b5d25b 100644 --- a/python/ray/serve/api.py +++ b/python/ray/serve/api.py @@ -205,16 +205,16 @@ def update_backend_config(backend_tag, config_options): backend_tag(str): A registered backend. config_options(dict): Backend config options to update. Supported options: - - "num_replicas": number of worker processes to start up that \ - will handle requests to this backend. - - "max_batch_size": the maximum number of requests that will \ - be processed in one batch by this backend. - - "batch_wait_timeout": time in seconds that backend replicas \ - will wait for a full batch of requests before \ - processing a partial batch. - - "max_concurrent_queries": the maximum number of queries \ - that will be sent to a replica of this backend \ - without receiving a response. + - "num_replicas": number of worker processes to start up that + will handle requests to this backend. + - "max_batch_size": the maximum number of requests that will + be processed in one batch by this backend. + - "batch_wait_timeout": time in seconds that backend replicas + will wait for a full batch of requests before + processing a partial batch. + - "max_concurrent_queries": the maximum number of queries + that will be sent to a replica of this backend + without receiving a response. """ if not isinstance(config_options, dict): raise ValueError("config_options must be a dictionary.") @@ -252,16 +252,16 @@ def create_backend(backend_tag, @ray.remote decorator for the backend actor. config (optional): configuration options for this backend. Supported options: - - "num_replicas": number of worker processes to start up that \ - will handle requests to this backend. - - "max_batch_size": the maximum number of requests that will \ - be processed in one batch by this backend. - - "batch_wait_timeout": time in seconds that backend replicas \ - will wait for a full batch of requests before \ - processing a partial batch. - - "max_concurrent_queries": the maximum number of queries \ - that will be sent to a replica of this backend \ - without receiving a response. + - "num_replicas": number of worker processes to start up that will + handle requests to this backend. + - "max_batch_size": the maximum number of requests that will + be processed in one batch by this backend. + - "batch_wait_timeout": time in seconds that backend replicas + will wait for a full batch of requests before processing a + partial batch. + - "max_concurrent_queries": the maximum number of queries that will + be sent to a replica of this backend without receiving a + response. """ if config is None: config = {} diff --git a/rllib/BUILD b/rllib/BUILD index 38dca10b3..1585434d1 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -1148,10 +1148,21 @@ py_test( ) py_test( - name = "tests/test_eager_support", + name = "tests/test_eager_support_pg", + main = "tests/test_eager_support.py", tags = ["tests_dir", "tests_dir_E"], - size = "enormous", - srcs = ["tests/test_eager_support.py"] + size = "large", + srcs = ["tests/test_eager_support.py"], + args = ["TestEagerSupportPG"] +) + +py_test( + name = "tests/test_eager_support_off_policy", + main = "tests/test_eager_support.py", + tags = ["tests_dir", "tests_dir_E"], + size = "large", + srcs = ["tests/test_eager_support.py"], + args = ["TestEagerSupportOffPolicy"] ) py_test( @@ -1269,6 +1280,13 @@ py_test( srcs = ["tests/test_nested_observation_spaces.py"] ) +py_test( + name = "tests/test_pettingzoo_env", + tags = ["tests_dir", "tests_dir_P"], + size = "medium", + srcs = ["tests/test_pettingzoo_env.py"] +) + py_test( name = "tests/test_reproducibility", tags = ["tests_dir", "tests_dir_R"], @@ -1311,17 +1329,30 @@ py_test( ) py_test( - name = "tests/test_pettingzoo_env", + name = "tests/test_supported_spaces_pg", + main = "tests/test_supported_spaces.py", tags = ["tests_dir", "tests_dir_S"], - size = "medium", - srcs = ["tests/test_pettingzoo_env.py"] + size = "enormous", + srcs = ["tests/test_supported_spaces.py"], + args = ["TestSupportedSpacesPG"] ) py_test( - name = "tests/test_supported_spaces", + name = "tests/test_supported_spaces_off_policy", + main = "tests/test_supported_spaces.py", tags = ["tests_dir", "tests_dir_S"], size = "enormous", - srcs = ["tests/test_supported_spaces.py"] + srcs = ["tests/test_supported_spaces.py"], + args = ["TestSupportedSpacesOffPolicy"] +) + +py_test( + name = "tests/test_supported_spaces_evolution_algos", + main = "tests/test_supported_spaces.py", + tags = ["tests_dir", "tests_dir_S"], + size = "large", + srcs = ["tests/test_supported_spaces.py"], + args = ["TestSupportedSpacesEvolutionAlgos"] ) # -------------------------------------------------------------------- diff --git a/rllib/agents/a3c/a3c_tf_policy.py b/rllib/agents/a3c/a3c_tf_policy.py index dde894cd9..a38eff247 100644 --- a/rllib/agents/a3c/a3c_tf_policy.py +++ b/rllib/agents/a3c/a3c_tf_policy.py @@ -55,7 +55,7 @@ def postprocess_advantages(policy, else: next_state = [] for i in range(policy.num_state_tensors()): - next_state.append([sample_batch["state_out_{}".format(i)][-1]]) + next_state.append(sample_batch["state_out_{}".format(i)][-1]) last_r = policy._value(sample_batch[SampleBatch.NEXT_OBS][-1], sample_batch[SampleBatch.ACTIONS][-1], sample_batch[SampleBatch.REWARDS][-1], diff --git a/rllib/agents/a3c/tests/test_a2c.py b/rllib/agents/a3c/tests/test_a2c.py index 7c6a559dc..08abcee8a 100644 --- a/rllib/agents/a3c/tests/test_a2c.py +++ b/rllib/agents/a3c/tests/test_a2c.py @@ -24,31 +24,34 @@ class TestA2C(unittest.TestCase): num_iterations = 1 # Test against all frameworks. - for fw in framework_iterator(config, ("tf", "torch")): - config["sample_async"] = fw == "tf" + for fw in framework_iterator(config): + config["sample_async"] = fw in ["tf", "tfe"] for env in ["PongDeterministic-v0"]: trainer = a3c.A2CTrainer(config=config, env=env) for i in range(num_iterations): results = trainer.train() print(results) check_compute_single_action(trainer) + trainer.stop() def test_a2c_exec_impl(ray_start_regular): config = {"min_iter_time_s": 0} - for _ in framework_iterator(config, ("tf", "torch")): + for _ in framework_iterator(config): trainer = a3c.A2CTrainer(env="CartPole-v0", config=config) assert isinstance(trainer.train(), dict) check_compute_single_action(trainer) + trainer.stop() def test_a2c_exec_impl_microbatch(ray_start_regular): config = { "min_iter_time_s": 0, "microbatch_size": 10, } - for _ in framework_iterator(config, ("tf", "torch")): + for _ in framework_iterator(config): trainer = a3c.A2CTrainer(env="CartPole-v0", config=config) assert isinstance(trainer.train(), dict) check_compute_single_action(trainer) + trainer.stop() if __name__ == "__main__": diff --git a/rllib/agents/a3c/tests/test_a3c.py b/rllib/agents/a3c/tests/test_a3c.py index daa474009..214d49a1a 100644 --- a/rllib/agents/a3c/tests/test_a3c.py +++ b/rllib/agents/a3c/tests/test_a3c.py @@ -24,7 +24,7 @@ class TestA3C(unittest.TestCase): num_iterations = 1 # Test against all frameworks. - for fw in framework_iterator(config, ("tf", "torch")): + for fw in framework_iterator(config): config["sample_async"] = fw == "tf" for env in ["CartPole-v0", "Pendulum-v0", "PongDeterministic-v0"]: trainer = a3c.A3CTrainer(config=config, env=env) @@ -32,6 +32,7 @@ class TestA3C(unittest.TestCase): results = trainer.train() print(results) check_compute_single_action(trainer) + trainer.stop() if __name__ == "__main__": diff --git a/rllib/agents/ddpg/apex.py b/rllib/agents/ddpg/apex.py index 145997a75..50e2d199d 100644 --- a/rllib/agents/ddpg/apex.py +++ b/rllib/agents/ddpg/apex.py @@ -27,7 +27,14 @@ APEX_DDPG_DEFAULT_CONFIG = DDPGTrainer.merge_trainer_configs( }, ) + +def validate_config(config): + if config.get("framework") == "tfe": + raise ValueError("APEX_DDPG does not support tf-eager yet!") + + ApexDDPGTrainer = DDPGTrainer.with_updates( name="APEX_DDPG", default_config=APEX_DDPG_DEFAULT_CONFIG, + validate_config=validate_config, execution_plan=apex_execution_plan) diff --git a/rllib/agents/ddpg/common/__init__.py b/rllib/agents/ddpg/common/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/rllib/agents/ddpg/ddpg.py b/rllib/agents/ddpg/ddpg.py index 9d6e0b889..e220ca462 100644 --- a/rllib/agents/ddpg/ddpg.py +++ b/rllib/agents/ddpg/ddpg.py @@ -5,8 +5,6 @@ from ray.rllib.agents.dqn.dqn import GenericOffPolicyTrainer from ray.rllib.agents.ddpg.ddpg_tf_policy import DDPGTFPolicy from ray.rllib.utils.deprecation import deprecation_warning, \ DEPRECATED_VALUE -from ray.rllib.utils.exploration.per_worker_ornstein_uhlenbeck_noise import \ - PerWorkerOrnsteinUhlenbeckNoise logger = logging.getLogger(__name__) @@ -129,7 +127,7 @@ DEFAULT_CONFIG = with_common_config({ # Weights for L2 regularization "l2_reg": 1e-6, # If not None, clip gradients during optimization at this value - "grad_norm_clipping": None, + "grad_clip": None, # How many steps of the model to sample before learning starts. "learning_starts": 1500, # Update the replay buffer with this many samples at once. Note that this @@ -151,7 +149,7 @@ DEFAULT_CONFIG = with_common_config({ "min_iter_time_s": 1, # Deprecated keys. - "parameter_noise": DEPRECATED_VALUE, + "grad_norm_clipping": DEPRECATED_VALUE, }) # __sphinx_doc_end__ # yapf: enable @@ -164,41 +162,12 @@ def validate_config(config): "was specified.") config["use_state_preprocessor"] = True - # TODO(sven): Remove at some point. - # Backward compatibility of noise-based exploration config. - schedule_max_timesteps = None - if config.get("schedule_max_timesteps", DEPRECATED_VALUE) != \ - DEPRECATED_VALUE: - deprecation_warning("schedule_max_timesteps", - "exploration_config.scale_timesteps") - schedule_max_timesteps = config["schedule_max_timesteps"] - if config.get("exploration_final_scale", DEPRECATED_VALUE) != \ - DEPRECATED_VALUE: - deprecation_warning("exploration_final_scale", - "exploration_config.final_scale") - if isinstance(config["exploration_config"], dict): - config["exploration_config"]["final_scale"] = \ - config.pop("exploration_final_scale") - if config.get("exploration_fraction", DEPRECATED_VALUE) != \ - DEPRECATED_VALUE: - assert schedule_max_timesteps is not None - deprecation_warning("exploration_fraction", - "exploration_config.scale_timesteps") - if isinstance(config["exploration_config"], dict): - config["exploration_config"]["scale_timesteps"] = config.pop( - "exploration_fraction") * schedule_max_timesteps - if config.get("per_worker_exploration", DEPRECATED_VALUE) != \ - DEPRECATED_VALUE: - deprecation_warning( - "per_worker_exploration", - "exploration_config.type=PerWorkerOrnsteinUhlenbeckNoise") - if isinstance(config["exploration_config"], dict): - config["exploration_config"]["type"] = \ - PerWorkerOrnsteinUhlenbeckNoise + if config.get("grad_norm_clipping", DEPRECATED_VALUE) != DEPRECATED_VALUE: + deprecation_warning("grad_norm_clipping", "grad_clip") + config["grad_clip"] = config.pop("grad_norm_clipping") - if config.get("parameter_noise", DEPRECATED_VALUE) != DEPRECATED_VALUE: - deprecation_warning("parameter_noise", "exploration_config={" - "type=ParameterNoise}") + if config["grad_clip"] is not None and config["grad_clip"] <= 0.0: + raise ValueError("`grad_clip` value must be > 0.0!") if config["exploration_config"]["type"] == "ParameterNoise": if config["batch_mode"] != "complete_episodes": diff --git a/rllib/agents/ddpg/ddpg_tf_policy.py b/rllib/agents/ddpg/ddpg_tf_policy.py index 027ccba26..dc42a2af2 100644 --- a/rllib/agents/ddpg/ddpg_tf_policy.py +++ b/rllib/agents/ddpg/ddpg_tf_policy.py @@ -18,22 +18,13 @@ from ray.rllib.utils.annotations import override from ray.rllib.policy.tf_policy import TFPolicy from ray.rllib.policy.tf_policy_template import build_tf_policy from ray.rllib.utils.error import UnsupportedSpaceException -from ray.rllib.utils.framework import try_import_tf -from ray.rllib.utils.tf_ops import huber_loss, minimize_and_clip, \ - make_tf_callable +from ray.rllib.utils.framework import get_variable, try_import_tf +from ray.rllib.utils.tf_ops import huber_loss, make_tf_callable tf1, tf, tfv = try_import_tf() logger = logging.getLogger(__name__) -ACTION_SCOPE = "action" -POLICY_SCOPE = "policy" -POLICY_TARGET_SCOPE = "target_policy" -Q_SCOPE = "critic" -Q_TARGET_SCOPE = "target_critic" -TWIN_Q_SCOPE = "twin_critic" -TWIN_Q_TARGET_SCOPE = "twin_target_critic" - def build_ddpg_models(policy, observation_space, action_space, config): if policy.config["use_state_preprocessor"]: @@ -126,59 +117,45 @@ def ddpg_actor_critic_loss(policy, model, _, train_batch): target_model_out_tp1, _ = policy.target_model(input_dict_next, [], None) # Policy network evaluation. - with tf1.variable_scope(POLICY_SCOPE, reuse=True): - # prev_update_ops = set(tf1.get_collection(tf.GraphKeys.UPDATE_OPS)) - policy_t = model.get_policy_output(model_out_t) - # policy_batchnorm_update_ops = list( - # set(tf1.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops) - - with tf1.variable_scope(POLICY_TARGET_SCOPE): - policy_tp1 = \ - policy.target_model.get_policy_output(target_model_out_tp1) + policy_t = model.get_policy_output(model_out_t) + policy_tp1 = \ + policy.target_model.get_policy_output(target_model_out_tp1) # Action outputs. - with tf1.variable_scope(ACTION_SCOPE, reuse=True): - if policy.config["smooth_target_policy"]: - target_noise_clip = policy.config["target_noise_clip"] - clipped_normal_sample = tf.clip_by_value( - tf.random.normal( - tf.shape(policy_tp1), - stddev=policy.config["target_noise"]), -target_noise_clip, - target_noise_clip) - policy_tp1_smoothed = tf.clip_by_value( - policy_tp1 + clipped_normal_sample, - policy.action_space.low * tf.ones_like(policy_tp1), - policy.action_space.high * tf.ones_like(policy_tp1)) - else: - # No smoothing, just use deterministic actions. - policy_tp1_smoothed = policy_tp1 + if policy.config["smooth_target_policy"]: + target_noise_clip = policy.config["target_noise_clip"] + clipped_normal_sample = tf.clip_by_value( + tf.random.normal( + tf.shape(policy_tp1), + stddev=policy.config["target_noise"]), -target_noise_clip, + target_noise_clip) + policy_tp1_smoothed = tf.clip_by_value( + policy_tp1 + clipped_normal_sample, + policy.action_space.low * tf.ones_like(policy_tp1), + policy.action_space.high * tf.ones_like(policy_tp1)) + else: + # No smoothing, just use deterministic actions. + policy_tp1_smoothed = policy_tp1 # Q-net(s) evaluation. - # prev_update_ops = set(tf1.get_collection(tf.GraphKeys.UPDATE_OPS)) - with tf1.variable_scope(Q_SCOPE): - # Q-values for given actions & observations in given current - q_t = model.get_q_values(model_out_t, train_batch[SampleBatch.ACTIONS]) + # prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) + # Q-values for given actions & observations in given current + q_t = model.get_q_values(model_out_t, train_batch[SampleBatch.ACTIONS]) - with tf1.variable_scope(Q_SCOPE, reuse=True): - # Q-values for current policy (no noise) in given current state - q_t_det_policy = model.get_q_values(model_out_t, policy_t) + # Q-values for current policy (no noise) in given current state + q_t_det_policy = model.get_q_values(model_out_t, policy_t) if twin_q: - with tf1.variable_scope(TWIN_Q_SCOPE): - twin_q_t = model.get_twin_q_values( - model_out_t, train_batch[SampleBatch.ACTIONS]) - # q_batchnorm_update_ops = list( - # set(tf1.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops) + twin_q_t = model.get_twin_q_values( + model_out_t, train_batch[SampleBatch.ACTIONS]) # Target q-net(s) evaluation. - with tf1.variable_scope(Q_TARGET_SCOPE): - q_tp1 = policy.target_model.get_q_values(target_model_out_tp1, - policy_tp1_smoothed) + q_tp1 = policy.target_model.get_q_values(target_model_out_tp1, + policy_tp1_smoothed) if twin_q: - with tf1.variable_scope(TWIN_Q_TARGET_SCOPE): - twin_q_tp1 = policy.target_model.get_twin_q_values( - target_model_out_tp1, policy_tp1_smoothed) + twin_q_tp1 = policy.target_model.get_twin_q_values( + target_model_out_tp1, policy_tp1_smoothed) q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1) if twin_q: @@ -220,10 +197,10 @@ def ddpg_actor_critic_loss(policy, model, _, train_batch): if l2_reg is not None: for var in policy.model.policy_variables(): if "bias" not in var.name: - actor_loss += (l2_reg * tf1.nn.l2_loss(var)) + actor_loss += (l2_reg * tf.nn.l2_loss(var)) for var in policy.model.q_variables(): if "bias" not in var.name: - critic_loss += (l2_reg * tf1.nn.l2_loss(var)) + critic_loss += (l2_reg * tf.nn.l2_loss(var)) # Model self-supervised losses. if policy.config["use_state_preprocessor"]: @@ -259,28 +236,18 @@ def ddpg_actor_critic_loss(policy, model, _, train_batch): def make_ddpg_optimizers(policy, config): # Create separate optimizers for actor & critic losses. - policy._actor_optimizer = tf1.train.AdamOptimizer( - learning_rate=config["actor_lr"]) - policy._critic_optimizer = tf1.train.AdamOptimizer( - learning_rate=config["critic_lr"]) + if tfv == 2 and config["framework"] == "tfe": + policy._actor_optimizer = tf.keras.optimizers.Adam( + learning_rate=config["actor_lr"]) + policy._critic_optimizer = tf.keras.optimizers.Adam( + learning_rate=config["critic_lr"]) + else: + policy._actor_optimizer = tf1.train.AdamOptimizer( + learning_rate=config["actor_lr"]) + policy._critic_optimizer = tf1.train.AdamOptimizer( + learning_rate=config["critic_lr"]) return None - # TFPolicy.__init__( - # self, - # observation_space, - # action_space, - # self.config, - # self.sess, - # #obs_input=self.cur_observations, - # sampled_action=self.output_actions, - # loss=self.actor_loss + self.critic_loss, - # loss_inputs=self.loss_inputs, - # update_ops=q_batchnorm_update_ops + policy_batchnorm_update_ops, - # explore=explore, - # dist_inputs=self._distribution_inputs, - # dist_class=Deterministic, - # timestep=timestep) - def build_apply_op(policy, optimizer, grads_and_vars): # For policy gradient, update policy net one time v.s. @@ -299,34 +266,44 @@ def build_apply_op(policy, optimizer, grads_and_vars): critic_op = policy._critic_optimizer.apply_gradients( policy._critic_grads_and_vars) # Increment global step & apply ops. - with tf1.control_dependencies([tf1.assign_add(policy.global_step, 1)]): - return tf.group(actor_op, critic_op) + if tfv == 2 and policy.config["framework"] == "tfe": + policy.global_step.assign_add(1) + return tf.no_op() + else: + with tf1.control_dependencies([tf1.assign_add(policy.global_step, 1)]): + return tf.group(actor_op, critic_op) def gradients_fn(policy, optimizer, loss): - if policy.config["grad_norm_clipping"] is not None: - actor_grads_and_vars = minimize_and_clip( - policy._actor_optimizer, - policy.actor_loss, - var_list=policy.model.policy_variables(), - clip_val=policy.config["grad_norm_clipping"]) - critic_grads_and_vars = minimize_and_clip( - policy._critic_optimizer, - policy.critic_loss, - var_list=policy.model.q_variables(), - clip_val=policy.config["grad_norm_clipping"]) + if policy.config["framework"] == "tfe": + tape = optimizer.tape + pol_weights = policy.model.policy_variables() + actor_grads_and_vars = list(zip(tape.gradient( + policy.actor_loss, pol_weights), pol_weights)) + q_weights = policy.model.q_variables() + critic_grads_and_vars = list(zip(tape.gradient( + policy.critic_loss, q_weights), q_weights)) else: actor_grads_and_vars = policy._actor_optimizer.compute_gradients( policy.actor_loss, var_list=policy.model.policy_variables()) critic_grads_and_vars = policy._critic_optimizer.compute_gradients( policy.critic_loss, var_list=policy.model.q_variables()) - # Save these for later use in build_apply_op. - policy._actor_grads_and_vars = [(g, v) for (g, v) in actor_grads_and_vars - if g is not None] - policy._critic_grads_and_vars = [(g, v) for (g, v) in critic_grads_and_vars - if g is not None] + + # Clip if necessary. + if policy.config["grad_clip"]: + clip_func = tf.clip_by_norm + else: + clip_func = tf.identity + + # Save grads and vars for later use in `build_apply_op`. + policy._actor_grads_and_vars = [ + (clip_func(g), v) for (g, v) in actor_grads_and_vars if g is not None] + policy._critic_grads_and_vars = [ + (clip_func(g), v) for (g, v) in critic_grads_and_vars if g is not None] + grads_and_vars = policy._actor_grads_and_vars + \ policy._critic_grads_and_vars + return grads_and_vars @@ -341,7 +318,10 @@ def build_ddpg_stats(policy, batch): def before_init_fn(policy, obs_space, action_space, config): # Create global step for counting the number of update operations. - policy.global_step = tf1.train.get_or_create_global_step() + if tfv == 2 and config["framework"] == "tfe": + policy.global_step = get_variable(0, tf_name="global_step") + else: + policy.global_step = tf1.train.get_or_create_global_step() class ComputeTDErrorMixin: diff --git a/rllib/agents/ddpg/tests/test_apex_ddpg.py b/rllib/agents/ddpg/tests/test_apex_ddpg.py index 4a8142bc4..7df44c1ad 100644 --- a/rllib/agents/ddpg/tests/test_apex_ddpg.py +++ b/rllib/agents/ddpg/tests/test_apex_ddpg.py @@ -24,7 +24,7 @@ class TestApexDDPG(unittest.TestCase): config["learning_starts"] = 0 config["optimizer"]["num_replay_buffer_shards"] = 1 num_iterations = 1 - for _ in framework_iterator(config, ("torch", "tf")): + for _ in framework_iterator(config, frameworks=("tf", "torch")): plain_config = config.copy() trainer = apex_ddpg.ApexDDPGTrainer( config=plain_config, env="Pendulum-v0") diff --git a/rllib/agents/ddpg/tests/test_ddpg.py b/rllib/agents/ddpg/tests/test_ddpg.py index de551cbd6..a3a1180d4 100644 --- a/rllib/agents/ddpg/tests/test_ddpg.py +++ b/rllib/agents/ddpg/tests/test_ddpg.py @@ -35,15 +35,16 @@ class TestDDPG(unittest.TestCase): config["learning_starts"] = 0 config["exploration_config"]["random_timesteps"] = 100 - num_iterations = 2 + num_iterations = 1 # Test against all frameworks. - for _ in framework_iterator(config, ("tf", "torch")): + for _ in framework_iterator(config): trainer = ddpg.DDPGTrainer(config=config, env="Pendulum-v0") for i in range(num_iterations): results = trainer.train() print(results) check_compute_single_action(trainer) + trainer.stop() def test_ddpg_exploration_and_with_random_prerun(self): """Tests DDPG's Exploration (w/ random actions for n timesteps).""" @@ -52,7 +53,7 @@ class TestDDPG(unittest.TestCase): obs = np.array([0.0, 0.1, -0.1]) # Test against all frameworks. - for _ in framework_iterator(core_config, ("torch", "tf")): + for _ in framework_iterator(core_config): config = core_config.copy() # Default OUNoise setup. trainer = ddpg.DDPGTrainer(config=config, env="Pendulum-v0") @@ -66,6 +67,7 @@ class TestDDPG(unittest.TestCase): for _ in range(50): actions.append(trainer.compute_action(obs)) check(np.std(actions), 0.0, false=True) + trainer.stop() # Check randomness at beginning. config["exploration_config"] = { @@ -95,6 +97,7 @@ class TestDDPG(unittest.TestCase): for _ in range(50): a = trainer.compute_action(obs, explore=False) check(a, deterministic_action) + trainer.stop() def test_ddpg_loss_function(self): """Tests DDPG loss function results across all frameworks.""" diff --git a/rllib/agents/ddpg/tests/test_td3.py b/rllib/agents/ddpg/tests/test_td3.py index 1c0356278..9f33e91b0 100644 --- a/rllib/agents/ddpg/tests/test_td3.py +++ b/rllib/agents/ddpg/tests/test_td3.py @@ -16,13 +16,14 @@ class TestTD3(unittest.TestCase): config["num_workers"] = 0 # Run locally. # Test against all frameworks. - for _ in framework_iterator(config, frameworks=["tf"]): + for _ in framework_iterator(config): trainer = td3.TD3Trainer(config=config, env="Pendulum-v0") - num_iterations = 2 + num_iterations = 1 for i in range(num_iterations): results = trainer.train() print(results) check_compute_single_action(trainer) + trainer.stop() def test_td3_exploration_and_with_random_prerun(self): """Tests TD3's Exploration (w/ random actions for n timesteps).""" @@ -31,7 +32,7 @@ class TestTD3(unittest.TestCase): obs = np.array([0.0, 0.1, -0.1]) # Test against all frameworks. - for _ in framework_iterator(config, frameworks="tf"): + for _ in framework_iterator(config): lcl_config = config.copy() # Default GaussianNoise setup. trainer = td3.TD3Trainer(config=lcl_config, env="Pendulum-v0") diff --git a/rllib/agents/sac/sac.py b/rllib/agents/sac/sac.py index 43c33c639..eeca221c2 100644 --- a/rllib/agents/sac/sac.py +++ b/rllib/agents/sac/sac.py @@ -146,6 +146,9 @@ def validate_config(config): deprecation_warning("grad_norm_clipping", "grad_clip") config["grad_clip"] = config.pop("grad_norm_clipping") + if config["grad_clip"] is not None and config["grad_clip"] <= 0.0: + raise ValueError("`grad_clip` value must be > 0.0!") + # Use same keys as for standard Trainer "model" config. for model in ["Q_model", "policy_model"]: if config[model].get("hidden_activation", DEPRECATED_VALUE) != \ diff --git a/rllib/agents/sac/sac_tf_policy.py b/rllib/agents/sac/sac_tf_policy.py index 49076ac48..84ddf6153 100644 --- a/rllib/agents/sac/sac_tf_policy.py +++ b/rllib/agents/sac/sac_tf_policy.py @@ -15,7 +15,6 @@ from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.policy.tf_policy_template import build_tf_policy from ray.rllib.utils.error import UnsupportedSpaceException from ray.rllib.utils.framework import try_import_tf, try_import_tfp -from ray.rllib.utils.tf_ops import minimize_and_clip tf1, tf, tfv = try_import_tf() tfp = try_import_tfp() @@ -227,15 +226,14 @@ def sac_actor_critic_loss(policy, model, _, train_batch): td_error = base_td_error critic_loss = [ - tf1.losses.mean_squared_error( - labels=q_t_selected_target, predictions=q_t_selected, weights=0.5) + 0.5 * tf.keras.losses.MSE( + y_true=q_t_selected_target, y_pred=q_t_selected) ] if policy.config["twin_q"]: critic_loss.append( - tf1.losses.mean_squared_error( - labels=q_t_selected_target, - predictions=twin_q_t_selected, - weights=0.5)) + 0.5 * tf.keras.losses.MSE( + y_true=q_t_selected_target, + y_pred=twin_q_t_selected)) # Alpha- and actor losses. # Note: In the papers, alpha is used directly, here we take the log. @@ -277,63 +275,64 @@ def sac_actor_critic_loss(policy, model, _, train_batch): return actor_loss + tf.math.add_n(critic_loss) + alpha_loss -def gradients(policy, optimizer, loss): - if policy.config["grad_clip"]: - actor_grads_and_vars = minimize_and_clip( - optimizer, # isn't optimizer not well defined here (which one)? - policy.actor_loss, - var_list=policy.model.policy_variables(), - clip_val=policy.config["grad_clip"]) +def gradients_fn(policy, optimizer, loss): + # Eager: Use GradientTape. + if policy.config["framework"] == "tfe": + tape = optimizer.tape + pol_weights = policy.model.policy_variables() + actor_grads_and_vars = list(zip(tape.gradient( + policy.actor_loss, pol_weights), pol_weights)) + q_weights = policy.model.q_variables() if policy.config["twin_q"]: - q_variables = policy.model.q_variables() - half_cutoff = len(q_variables) // 2 - critic_grads_and_vars = [] - critic_grads_and_vars += minimize_and_clip( - optimizer, - policy.critic_loss[0], - var_list=q_variables[:half_cutoff], - clip_val=policy.config["grad_clip"]) - critic_grads_and_vars += minimize_and_clip( - optimizer, - policy.critic_loss[1], - var_list=q_variables[half_cutoff:], - clip_val=policy.config["grad_clip"]) + half_cutoff = len(q_weights) // 2 + grads_1 = tape.gradient( + policy.critic_loss[0], q_weights[:half_cutoff]) + grads_2 = tape.gradient( + policy.critic_loss[1], q_weights[half_cutoff:]) + critic_grads_and_vars = \ + list(zip(grads_1, q_weights[:half_cutoff])) + \ + list(zip(grads_2, q_weights[half_cutoff:])) else: - critic_grads_and_vars = minimize_and_clip( - optimizer, - policy.critic_loss[0], - var_list=policy.model.q_variables(), - clip_val=policy.config["grad_clip"]) - alpha_grads_and_vars = minimize_and_clip( - optimizer, - policy.alpha_loss, - var_list=[policy.model.log_alpha], - clip_val=policy.config["grad_clip"]) + critic_grads_and_vars = list(zip(tape.gradient( + policy.critic_loss[0], q_weights), q_weights)) + + alpha_vars = [policy.model.log_alpha] + alpha_grads_and_vars = list(zip(tape.gradient( + policy.alpha_loss, alpha_vars), alpha_vars)) + # Tf1.x: Use optimizer.compute_gradients() else: actor_grads_and_vars = policy._actor_optimizer.compute_gradients( policy.actor_loss, var_list=policy.model.policy_variables()) + + q_weights = policy.model.q_variables() if policy.config["twin_q"]: - q_variables = policy.model.q_variables() - half_cutoff = len(q_variables) // 2 + half_cutoff = len(q_weights) // 2 base_q_optimizer, twin_q_optimizer = policy._critic_optimizer critic_grads_and_vars = base_q_optimizer.compute_gradients( - policy.critic_loss[0], var_list=q_variables[:half_cutoff] + policy.critic_loss[0], var_list=q_weights[:half_cutoff] ) + twin_q_optimizer.compute_gradients( - policy.critic_loss[1], var_list=q_variables[half_cutoff:]) + policy.critic_loss[1], var_list=q_weights[half_cutoff:]) else: critic_grads_and_vars = policy._critic_optimizer[ 0].compute_gradients( - policy.critic_loss[0], var_list=policy.model.q_variables()) + policy.critic_loss[0], var_list=q_weights) alpha_grads_and_vars = policy._alpha_optimizer.compute_gradients( policy.alpha_loss, var_list=[policy.model.log_alpha]) - # save these for later use in build_apply_op - policy._actor_grads_and_vars = [(g, v) for (g, v) in actor_grads_and_vars - if g is not None] - policy._critic_grads_and_vars = [(g, v) for (g, v) in critic_grads_and_vars - if g is not None] - policy._alpha_grads_and_vars = [(g, v) for (g, v) in alpha_grads_and_vars - if g is not None] + # Clip if necessary. + if policy.config["grad_clip"]: + clip_func = tf.clip_by_norm + else: + clip_func = tf.identity + + # Save grads and vars for later use in `build_apply_op`. + policy._actor_grads_and_vars = [ + (clip_func(g), v) for (g, v) in actor_grads_and_vars if g is not None] + policy._critic_grads_and_vars = [ + (clip_func(g), v) for (g, v) in critic_grads_and_vars if g is not None] + policy._alpha_grads_and_vars = [ + (clip_func(g), v) for (g, v) in alpha_grads_and_vars if g is not None] + grads_and_vars = ( policy._actor_grads_and_vars + policy._critic_grads_and_vars + policy._alpha_grads_and_vars) @@ -431,7 +430,7 @@ SACTFPolicy = build_tf_policy( action_distribution_fn=get_distribution_inputs_and_class, loss_fn=sac_actor_critic_loss, stats_fn=stats, - gradients_fn=gradients, + gradients_fn=gradients_fn, apply_gradients_fn=apply_gradients, extra_learn_fetches_fn=lambda policy: {"td_error": policy.td_error}, mixins=[ diff --git a/rllib/agents/sac/tests/test_sac.py b/rllib/agents/sac/tests/test_sac.py index b941b974a..785167e31 100644 --- a/rllib/agents/sac/tests/test_sac.py +++ b/rllib/agents/sac/tests/test_sac.py @@ -54,7 +54,7 @@ class TestSAC(unittest.TestCase): config["learning_starts"] = 0 config["prioritized_replay"] = True num_iterations = 1 - for _ in framework_iterator(config, ("tf", "torch")): + for _ in framework_iterator(config): # Test for different env types (discrete w/ and w/o image, + cont). for env in [ "Pendulum-v0", "MsPacmanNoFrameskip-v4", "CartPole-v0" diff --git a/rllib/env/pettingzoo_env.py b/rllib/env/pettingzoo_env.py index d9ee80904..e9b18fa20 100644 --- a/rllib/env/pettingzoo_env.py +++ b/rllib/env/pettingzoo_env.py @@ -3,6 +3,7 @@ from .multi_agent_env import MultiAgentEnv class PettingZooEnv(MultiAgentEnv): """An interface to the PettingZoo MARL environment library. + See: https://github.com/PettingZoo-Team/PettingZoo Inherits from MultiAgentEnv and exposes a given AEC @@ -31,33 +32,32 @@ class PettingZooEnv(MultiAgentEnv): >>> env = POMGameEnv(env_creator=prison_v0}) >>> obs = env.reset() >>> print(obs) - - { - "0": [110, 119], - "1": [105, 102], - "2": [99, 95], - } + { + "0": [110, 119], + "1": [105, 102], + "2": [99, 95], + } >>> obs, rewards, dones, infos = env.step( action_dict={ "0": 1, "1": 0, "2": 2, }) >>> print(rewards) - { - "0": 0, - "1": 1, - "2": 0, - } + { + "0": 0, + "1": 1, + "2": 0, + } >>> print(dones) - { - "0": False, # agent 0 is still running - "1": True, # agent 1 is done - "__all__": False, # the env is not done - } + { + "0": False, # agent 0 is still running + "1": True, # agent 1 is done + "__all__": False, # the env is not done + } >>> print(infos) - { - "0": {}, # info for agent 0 - "1": {}, # info for agent 1 - } + { + "0": {}, # info for agent 0 + "1": {}, # info for agent 1 + } """ def __init__(self, env): diff --git a/rllib/env/unity3d_env.py b/rllib/env/unity3d_env.py index 809b94de9..45a374cde 100644 --- a/rllib/env/unity3d_env.py +++ b/rllib/env/unity3d_env.py @@ -103,19 +103,18 @@ class Unity3DEnv(MultiAgentEnv): Args: action_dict (dict): Multi-agent action dict with: keys=agent identifier consisting of - [MLagents behavior name, e.g. "Goalie?team=1"] + "_" + - [Agent index, a unique MLAgent-assigned index per single - agent] + [MLagents behavior name, e.g. "Goalie?team=1"] + "_" + + [Agent index, a unique MLAgent-assigned index per single agent] Returns: tuple: - obs: Multi-agent observation dict. + - obs: Multi-agent observation dict. Only those observations for which to get new actions are returned. - rewards: Rewards dict matching `obs`. - dones: Done dict with only an __all__ multi-agent entry in it. - __all__=True, if episode is done for all agents. - infos: An (empty) info dict. + - rewards: Rewards dict matching `obs`. + - dones: Done dict with only an __all__ multi-agent entry in + it. __all__=True, if episode is done for all agents. + - infos: An (empty) info dict. """ # Set only the required actions (from the DecisionSteps) in Unity3D. diff --git a/rllib/policy/eager_tf_policy.py b/rllib/policy/eager_tf_policy.py index 9d0f3377b..13d9daf8f 100644 --- a/rllib/policy/eager_tf_policy.py +++ b/rllib/policy/eager_tf_policy.py @@ -325,16 +325,15 @@ def build_eager_tf_policy(name, self._is_training = False self._state_in = state_batches - if tf.executing_eagerly(): - n = len(obs_batch) - else: - n = obs_batch.shape[0] - seq_lens = tf.ones(n, dtype=tf.int32) + if not tf1.executing_eagerly(): + tf1.enable_eager_execution() input_dict = { SampleBatch.CUR_OBS: tf.convert_to_tensor(obs_batch), "is_training": tf.constant(False), } + n = input_dict[SampleBatch.CUR_OBS].shape[0] + seq_lens = tf.ones(n, dtype=tf.int32) if obs_include_prev_action_reward: if prev_action_batch is not None: input_dict[SampleBatch.PREV_ACTIONS] = \ diff --git a/rllib/policy/policy.py b/rllib/policy/policy.py index 58b46bf55..a0d17c36d 100644 --- a/rllib/policy/policy.py +++ b/rllib/policy/policy.py @@ -154,14 +154,14 @@ class Policy(metaclass=ABCMeta): timestep (Optional[int]): The current (sampling) time step. Keyword Args: - kwargs: forward compatibility placeholder + kwargs: Forward compatibility. Returns: Tuple: - actions (TensorType): Single action. - state_outs (List[TensorType]): List of RNN state outputs, + - actions (TensorType): Single action. + - state_outs (List[TensorType]): List of RNN state outputs, if any. - info (dict): Dictionary of extra features, if any. + - info (dict): Dictionary of extra features, if any. """ prev_action_batch = None prev_reward_batch = None diff --git a/rllib/policy/tests/test_compute_log_likelihoods.py b/rllib/policy/tests/test_compute_log_likelihoods.py index 10fa7d705..da83ae10d 100644 --- a/rllib/policy/tests/test_compute_log_likelihoods.py +++ b/rllib/policy/tests/test_compute_log_likelihoods.py @@ -38,9 +38,6 @@ def do_test_log_likelihood(run, # Test against all frameworks. for fw in framework_iterator(config): - if run in [sac.SACTrainer] and fw == "tfe": - continue - trainer = run(config=config, env=env) policy = trainer.get_policy() @@ -171,7 +168,7 @@ class TestComputeLogLikelihood(unittest.TestCase): config, prev_a, continuous=True, - layer_key=("sequential/action", (0, 2), + layer_key=("sequential/action", (2, 4), ("action_model.action_0.", "action_model.action_out.")), logp_func=logp_func) diff --git a/rllib/tests/test_eager_support.py b/rllib/tests/test_eager_support.py index 07a2e1a4f..7bda4a003 100644 --- a/rllib/tests/test_eager_support.py +++ b/rllib/tests/test_eager_support.py @@ -5,8 +5,9 @@ from ray import tune from ray.rllib.agents.registry import get_agent_class -def check_support(alg, config, test_trace=True): +def check_support(alg, config, test_eager=False, test_trace=True): config["framework"] = "tfe" + config["log_level"] = "ERROR" # Test both continuous and discrete actions. for cont in [True, False]: if cont and alg in ["DQN", "APEX", "SimpleQ"]: @@ -14,46 +15,31 @@ def check_support(alg, config, test_trace=True): elif not cont and alg in ["DDPG", "APEX_DDPG", "TD3"]: continue - print("run={} cont. actions={}".format(alg, cont)) - if cont: config["env"] = "Pendulum-v0" else: config["env"] = "CartPole-v0" a = get_agent_class(alg) - config["log_level"] = "ERROR" - config["eager_tracing"] = False - tune.run(a, config=config, stop={"training_iteration": 1}) - + if test_eager: + print("tf-eager: alg={} cont.act={}".format(alg, cont)) + config["eager_tracing"] = False + tune.run( + a, config=config, stop={"training_iteration": 1}, verbose=1) if test_trace: config["eager_tracing"] = True - tune.run(a, config=config, stop={"training_iteration": 1}) + print("tf-eager-tracing: alg={} cont.act={}".format(alg, cont)) + tune.run( + a, config=config, stop={"training_iteration": 1}, verbose=1) -class TestEagerSupport(unittest.TestCase): +class TestEagerSupportPG(unittest.TestCase): def setUp(self): - ray.init(num_cpus=4, local_mode=True) + ray.init(num_cpus=4) def tearDown(self): ray.shutdown() - def test_simple_q(self): - check_support("SimpleQ", {"num_workers": 0, "learning_starts": 0}) - - def test_dqn(self): - check_support("DQN", {"num_workers": 0, "learning_starts": 0}) - - # TODO(sven): Add these once DDPG supports eager. - # def test_ddpg(self): - # check_support("DDPG", {"num_workers": 0}) - - # def test_apex_ddpg(self): - # check_support("APEX_DDPG", {"num_workers": 1}) - - # def test_td3(self): - # check_support("TD3", {"num_workers": 0}) - def test_a2c(self): check_support("A2C", {"num_workers": 0}) @@ -70,7 +56,31 @@ class TestEagerSupport(unittest.TestCase): check_support("APPO", {"num_workers": 1, "num_gpus": 0}) def test_impala(self): - check_support("IMPALA", {"num_workers": 1, "num_gpus": 0}) + check_support( + "IMPALA", {"num_workers": 1, "num_gpus": 0}, test_eager=True) + + +class TestEagerSupportOffPolicy(unittest.TestCase): + def setUp(self): + ray.init(num_cpus=4) + + def tearDown(self): + ray.shutdown() + + def test_simple_q(self): + check_support("SimpleQ", {"num_workers": 0, "learning_starts": 0}) + + def test_dqn(self): + check_support("DQN", {"num_workers": 0, "learning_starts": 0}) + + def test_ddpg(self): + check_support("DDPG", {"num_workers": 0}) + + # def test_apex_ddpg(self): + # check_support("APEX_DDPG", {"num_workers": 1}) + + def test_td3(self): + check_support("TD3", {"num_workers": 0}) def test_apex_dqn(self): check_support( @@ -85,12 +95,15 @@ class TestEagerSupport(unittest.TestCase): }, }) - # TODO(sven): Add this once SAC supports eager. - # def test_sac(self): - # check_support("SAC", {"num_workers": 0, "learning_starts": 0}) + def test_sac(self): + check_support("SAC", {"num_workers": 0, "learning_starts": 0}) if __name__ == "__main__": import pytest import sys - sys.exit(pytest.main(["-v", __file__])) + # One can specify the specific TestCase class to run. + # None for all unittest.TestCase classes in this file. + class_ = sys.argv[1] if len(sys.argv) > 0 else None + sys.exit(pytest.main( + ["-v", __file__ + ("" if class_ is None else "::" + class_)])) diff --git a/rllib/tests/test_supported_multi_agent.py b/rllib/tests/test_supported_multi_agent.py index 8298fda3b..abc5f1cf7 100644 --- a/rllib/tests/test_supported_multi_agent.py +++ b/rllib/tests/test_supported_multi_agent.py @@ -14,7 +14,9 @@ def check_support_multiagent(alg, config): register_env("multi_agent_cartpole", lambda _: MultiAgentCartPole({"num_agents": 2})) config["log_level"] = "ERROR" - for _ in framework_iterator(config, frameworks=("torch", "tf")): + for fw in framework_iterator(config): + if fw == "tfe" and alg in ["A3C", "APEX", "APEX_DDPG", "IMPALA"]: + continue if alg in ["DDPG", "APEX_DDPG", "SAC"]: a = get_agent_class(alg)( config=config, env="multi_agent_mountaincar") diff --git a/rllib/tests/test_supported_spaces.py b/rllib/tests/test_supported_spaces.py index 313339104..fe541ef24 100644 --- a/rllib/tests/test_supported_spaces.py +++ b/rllib/tests/test_supported_spaces.py @@ -88,11 +88,7 @@ def check_support(alg, config, train=True, check_bounds=False, tfe=False): assert isinstance(a.get_policy().model, FCNetV2) if train: a.train() - try: - a.stop() - except Exception as e: - print("Ignoring error stopping agent", e) - pass + a.stop() print(stat) frameworks = ("torch", "tf") @@ -108,7 +104,7 @@ def check_support(alg, config, train=True, check_bounds=False, tfe=False): _do_check(alg, config, a_name, o_name) -class TestSupportedSpaces(unittest.TestCase): +class TestSupportedSpacesPG(unittest.TestCase): @classmethod def setUpClass(cls) -> None: ray.init(num_cpus=4) @@ -125,40 +121,6 @@ class TestSupportedSpaces(unittest.TestCase): check_support("APPO", {"num_gpus": 0, "vtrace": False}, train=False) check_support("APPO", {"num_gpus": 0, "vtrace": True}) - def test_ars(self): - check_support( - "ARS", { - "num_workers": 1, - "noise_size": 1500000, - "num_rollouts": 1, - "rollouts_used": 1 - }) - - def test_ddpg(self): - check_support( - "DDPG", { - "exploration_config": { - "ou_base_scale": 100.0 - }, - "timesteps_per_iteration": 1, - "buffer_size": 1000, - "use_state_preprocessor": True, - }, - check_bounds=True) - - def test_dqn(self): - config = {"timesteps_per_iteration": 1, "buffer_size": 1000} - check_support("DQN", config, tfe=True) - - def test_es(self): - check_support( - "ES", { - "num_workers": 1, - "noise_size": 1500000, - "episodes_per_batch": 1, - "train_batch_size": 1 - }) - def test_impala(self): check_support("IMPALA", {"num_gpus": 0}) @@ -176,21 +138,70 @@ class TestSupportedSpaces(unittest.TestCase): config = {"num_workers": 1, "optimizer": {}} check_support("PG", config, train=False, check_bounds=True, tfe=True) + +class TestSupportedSpacesOffPolicy(unittest.TestCase): + @classmethod + def setUpClass(cls) -> None: + ray.init(num_cpus=4) + + @classmethod + def tearDownClass(cls) -> None: + ray.shutdown() + + def test_ddpg(self): + check_support( + "DDPG", { + "exploration_config": { + "ou_base_scale": 100.0 + }, + "timesteps_per_iteration": 1, + "buffer_size": 1000, + "use_state_preprocessor": True, + }, + check_bounds=True) + + def test_dqn(self): + config = {"timesteps_per_iteration": 1, "buffer_size": 1000} + check_support("DQN", config, tfe=True) + def test_sac(self): check_support("SAC", {"buffer_size": 1000}, check_bounds=True) +class TestSupportedSpacesEvolutionAlgos(unittest.TestCase): + @classmethod + def setUpClass(cls) -> None: + ray.init(num_cpus=4) + + @classmethod + def tearDownClass(cls) -> None: + ray.shutdown() + + def test_ars(self): + check_support( + "ARS", { + "num_workers": 1, + "noise_size": 1500000, + "num_rollouts": 1, + "rollouts_used": 1 + }) + + def test_es(self): + check_support( + "ES", { + "num_workers": 1, + "noise_size": 1500000, + "episodes_per_batch": 1, + "train_batch_size": 1 + }) + + if __name__ == "__main__": import pytest import sys - if len(sys.argv) > 1 and sys.argv[1] == "--smoke": - ACTION_SPACES_TO_TEST = { - "discrete": Discrete(5), - } - OBSERVATION_SPACES_TO_TEST = { - "vector": Box(0.0, 1.0, (5, ), dtype=np.float32), - "atari": Box(0.0, 1.0, (210, 160, 3), dtype=np.float32), - } - - sys.exit(pytest.main(["-v", __file__])) + # One can specify the specific TestCase class to run. + # None for all unittest.TestCase classes in this file. + class_ = sys.argv[1] if len(sys.argv) > 0 else None + sys.exit(pytest.main( + ["-v", __file__ + ("" if class_ is None else "::" + class_)])) diff --git a/rllib/utils/exploration/gaussian_noise.py b/rllib/utils/exploration/gaussian_noise.py index 34ebba45d..c2b8f674f 100644 --- a/rllib/utils/exploration/gaussian_noise.py +++ b/rllib/utils/exploration/gaussian_noise.py @@ -104,7 +104,7 @@ class GaussianNoise(Exploration): self.random_exploration.get_tf_exploration_action_op( action_dist, explore) stochastic_actions = tf.cond( - pred=ts <= self.random_timesteps, + pred=tf.convert_to_tensor(ts <= self.random_timesteps), true_fn=lambda: random_actions, false_fn=lambda: tf.clip_by_value( deterministic_actions + gaussian_sample, diff --git a/rllib/utils/exploration/ornstein_uhlenbeck_noise.py b/rllib/utils/exploration/ornstein_uhlenbeck_noise.py index 7b0f98ea8..227a74990 100644 --- a/rllib/utils/exploration/ornstein_uhlenbeck_noise.py +++ b/rllib/utils/exploration/ornstein_uhlenbeck_noise.py @@ -110,7 +110,7 @@ class OrnsteinUhlenbeckNoise(GaussianNoise): self.random_exploration.get_tf_exploration_action_op( action_dist, explore) exploration_actions = tf.cond( - pred=ts <= self.random_timesteps, + pred=tf.convert_to_tensor(ts <= self.random_timesteps), true_fn=lambda: random_actions, false_fn=lambda: stochastic_actions) diff --git a/rllib/utils/exploration/tests/test_explorations.py b/rllib/utils/exploration/tests/test_explorations.py index 910cb5d5b..8ca6c3d8d 100644 --- a/rllib/utils/exploration/tests/test_explorations.py +++ b/rllib/utils/exploration/tests/test_explorations.py @@ -28,11 +28,6 @@ def do_test_explorations(run, # Test all frameworks. for fw in framework_iterator(core_config): - if fw == "tfe" and run in [ - ddpg.DDPGTrainer, sac.SACTrainer, td3.TD3Trainer - ]: - continue - print("Agent={}".format(run)) # Test for both the default Agent's exploration AND the `Random` diff --git a/rllib/utils/exploration/tests/test_parameter_noise.py b/rllib/utils/exploration/tests/test_parameter_noise.py index b186bb70f..8483d56f7 100644 --- a/rllib/utils/exploration/tests/test_parameter_noise.py +++ b/rllib/utils/exploration/tests/test_parameter_noise.py @@ -12,8 +12,7 @@ class TestParameterNoise(unittest.TestCase): ddpg.DDPGTrainer, ddpg.DEFAULT_CONFIG, "Pendulum-v0", {}, - np.array([1.0, 0.0, -1.0]), - fws="tf") + np.array([1.0, 0.0, -1.0])) def test_dqn_parameter_noise(self): self.do_test_parameter_noise_exploration( @@ -23,18 +22,16 @@ class TestParameterNoise(unittest.TestCase): "is_slippery": False, "map_name": "4x4" }, - np.array(0), - fws=("tf", "tfe")) + np.array(0)) - def do_test_parameter_noise_exploration(self, trainer_cls, config, env, - env_config, obs, fws): + def do_test_parameter_noise_exploration( + self, trainer_cls, config, env, env_config, obs): """Tests, whether an Agent works with ParameterNoise.""" core_config = config.copy() core_config["num_workers"] = 0 # Run locally. core_config["env_config"] = env_config - for fw in framework_iterator(core_config, fws): - + for fw in framework_iterator(core_config): config = core_config.copy() # Algo with ParameterNoise exploration (config["explore"]=True). @@ -44,13 +41,15 @@ class TestParameterNoise(unittest.TestCase): trainer = trainer_cls(config=config, env=env) policy = trainer.get_policy() + pol_sess = getattr(policy, "_sess", None) + self.assertFalse(policy.exploration.weights_are_currently_noisy) noise_before = self._get_current_noise(policy, fw) check(noise_before, 0.0) initial_weights = self._get_current_weight(policy, fw) # Pseudo-start an episode and compare the weights before and after. - policy.exploration.on_episode_start(policy, tf_sess=policy._sess) + policy.exploration.on_episode_start(policy, tf_sess=pol_sess) self.assertFalse(policy.exploration.weights_are_currently_noisy) noise_after_ep_start = self._get_current_noise(policy, fw) weights_after_ep_start = self._get_current_weight(policy, fw) @@ -91,7 +90,7 @@ class TestParameterNoise(unittest.TestCase): # Pseudo-end the episode and compare weights again. # Make sure they are the original ones. - policy.exploration.on_episode_end(policy, tf_sess=policy._sess) + policy.exploration.on_episode_end(policy, tf_sess=pol_sess) weights_after_ep_end = self._get_current_weight(policy, fw) check(current_weight - noise, weights_after_ep_end, decimals=5) @@ -111,7 +110,7 @@ class TestParameterNoise(unittest.TestCase): # Pseudo-start an episode and compare the weights before and after # (they should be the same). - policy.exploration.on_episode_start(policy, tf_sess=policy._sess) + policy.exploration.on_episode_start(policy, tf_sess=pol_sess) self.assertFalse(policy.exploration.weights_are_currently_noisy) # Should be the same, as we don't do anything at the beginning of @@ -136,7 +135,7 @@ class TestParameterNoise(unittest.TestCase): # Pseudo-end the episode and compare weights again. # Make sure they are the original ones (no noise permanently # applied throughout the episode). - policy.exploration.on_episode_end(policy, tf_sess=policy._sess) + policy.exploration.on_episode_end(policy, tf_sess=pol_sess) weights_after_episode_end = self._get_current_weight(policy, fw) check(initial_weights, weights_after_episode_end) # Noise should still be the same (re-sampling only happens at @@ -170,7 +169,7 @@ class TestParameterNoise(unittest.TestCase): # the same action for the same input (parameter noise is # deterministic). policy = trainer.get_policy() - policy.exploration.on_episode_start(policy, tf_sess=policy._sess) + policy.exploration.on_episode_start(policy, tf_sess=pol_sess) a_ = trainer.compute_action(obs) for _ in range(10): a = trainer.compute_action(obs, explore=True)