From 0422e9c5a86ccb91fa4e25f003b385a451f16abc Mon Sep 17 00:00:00 2001 From: Sven Mika Date: Wed, 27 May 2020 10:19:47 +0200 Subject: [PATCH] [RLlib] Add 2 Transformer learning test cases on StatelessCartPole (PPO and IMPALA). (#8624) --- doc/source/rllib-models.rst | 9 ++- rllib/BUILD | 19 +++-- .../alpha_zero/models/custom_torch_models.py | 4 +- rllib/examples/attention_net.py | 2 +- rllib/examples/custom_loss.py | 2 +- rllib/examples/mobilenet_v2_with_lstm.py | 2 +- rllib/examples/models/custom_loss_model.py | 5 +- rllib/models/catalog.py | 25 +++++-- rllib/models/tf/attention_net.py | 2 +- rllib/policy/dynamic_tf_policy.py | 2 +- rllib/policy/torch_policy_template.py | 2 +- rllib/tests/test_attention_net_learning.py | 74 +++++++++++++++++++ rllib/tests/test_avail_actions_qmix.py | 2 +- rllib/tests/test_catalog.py | 16 ++-- 14 files changed, 133 insertions(+), 33 deletions(-) create mode 100644 rllib/tests/test_attention_net_learning.py diff --git a/doc/source/rllib-models.rst b/doc/source/rllib-models.rst index 2f82e3f75..3002bcf4a 100644 --- a/doc/source/rllib-models.rst +++ b/doc/source/rllib-models.rst @@ -71,7 +71,8 @@ Once implemented, the model can then be registered and used in place of a built- trainer = ppo.PPOTrainer(env="CartPole-v0", config={ "model": { "custom_model": "my_model", - "custom_options": {}, # extra options to pass to your model + # Extra kwargs to be passed to your model's c'tor. + "custom_model_config": {}, }, }) @@ -132,7 +133,8 @@ Once implemented, the model can then be registered and used in place of a built- "use_pytorch": True, "model": { "custom_model": "my_model", - "custom_options": {}, # extra options to pass to your model + # Extra kwargs to be passed to your model's c'tor. + "custom_model_config": {}, }, }) @@ -165,7 +167,8 @@ Custom preprocessors should subclass the RLlib `preprocessor class > config["model"]["custom_model"] = GTrXLNet >> config["model"]["max_seq_len"] = 10 - >> config["model"]["custom_options"] = { + >> config["model"]["custom_model_config"] = { >> num_transformer_units=1, >> attn_dim=32, >> num_heads=2, diff --git a/rllib/policy/dynamic_tf_policy.py b/rllib/policy/dynamic_tf_policy.py index 072181e62..64c7f9cee 100644 --- a/rllib/policy/dynamic_tf_policy.py +++ b/rllib/policy/dynamic_tf_policy.py @@ -161,7 +161,7 @@ class DynamicTFPolicy(TFPolicy): num_outputs=logit_dim, model_config=self.config["model"], framework="tf", - **self.config["model"].get("custom_options", {})) + **self.config["model"].get("custom_model_config", {})) # Create the Exploration object to use for this Policy. self.exploration = self._create_exploration() diff --git a/rllib/policy/torch_policy_template.py b/rllib/policy/torch_policy_template.py index 542214efc..e629f5c3c 100644 --- a/rllib/policy/torch_policy_template.py +++ b/rllib/policy/torch_policy_template.py @@ -117,7 +117,7 @@ def build_torch_policy(name, num_outputs=logit_dim, model_config=self.config["model"], framework="torch", - **self.config["model"].get("custom_options", {})) + **self.config["model"].get("custom_model_config", {})) # Make sure, we passed in a correct Model factory. assert isinstance(self.model, TorchModelV2), \ diff --git a/rllib/tests/test_attention_net_learning.py b/rllib/tests/test_attention_net_learning.py new file mode 100644 index 000000000..7b1f169a7 --- /dev/null +++ b/rllib/tests/test_attention_net_learning.py @@ -0,0 +1,74 @@ +import unittest + +from ray import tune +from ray.rllib.examples.env.stateless_cartpole import StatelessCartPole +from ray.rllib.models.catalog import ModelCatalog +from ray.rllib.models.tf.attention_net import GTrXLNet + + +class TestAttentionNetLearning(unittest.TestCase): + + config = { + "env": StatelessCartPole, + "gamma": 0.99, + "num_envs_per_worker": 20, + # "framework": "tf", + } + + stop = { + "episode_reward_mean": 180.0, + "timesteps_total": 5000000, + } + + def test_ppo_attention_net_learning(self): + ModelCatalog.register_custom_model("attention_net", GTrXLNet) + config = dict( + self.config, **{ + "num_workers": 0, + "entropy_coeff": 0.001, + "vf_loss_coeff": 1e-5, + "num_sgd_iter": 5, + "model": { + "custom_model": "attention_net", + "max_seq_len": 10, + "custom_model_config": { + "num_transformer_units": 1, + "attn_dim": 32, + "num_heads": 1, + "memory_tau": 5, + "head_dim": 32, + "ff_hidden_dim": 32, + }, + }, + }) + tune.run("PPO", config=config, stop=self.stop, verbose=1) + + def test_impala_attention_net_learning(self): + ModelCatalog.register_custom_model("attention_net", GTrXLNet) + config = dict( + self.config, **{ + "num_workers": 4, + "num_gpus": 0, + "entropy_coeff": 0.01, + "vf_loss_coeff": 0.001, + "lr": 0.0008, + "model": { + "custom_model": "attention_net", + "max_seq_len": 65, + "custom_model_config": { + "num_transformer_units": 1, + "attn_dim": 64, + "num_heads": 1, + "memory_tau": 10, + "head_dim": 32, + "ff_hidden_dim": 32, + }, + }, + }) + tune.run("IMPALA", config=config, stop=self.stop, verbose=1) + + +if __name__ == "__main__": + import pytest + import sys + sys.exit(pytest.main(["-v", __file__])) diff --git a/rllib/tests/test_avail_actions_qmix.py b/rllib/tests/test_avail_actions_qmix.py index c1c0b6063..774734ef6 100644 --- a/rllib/tests/test_avail_actions_qmix.py +++ b/rllib/tests/test_avail_actions_qmix.py @@ -4,8 +4,8 @@ import unittest import ray from ray.tune import register_env -from ray.rllib.env.multi_agent_env import MultiAgentEnv from ray.rllib.agents.qmix import QMixTrainer +from ray.rllib.env.multi_agent_env import MultiAgentEnv class AvailActionsTestEnv(MultiAgentEnv): diff --git a/rllib/tests/test_catalog.py b/rllib/tests/test_catalog.py index f80a13a81..9c2c1f3da 100644 --- a/rllib/tests/test_catalog.py +++ b/rllib/tests/test_catalog.py @@ -35,19 +35,21 @@ class CustomModel(TFModelV2): class CustomActionDistribution(TFActionDistribution): def __init__(self, inputs, model): # Store our output shape. - custom_options = model.model_config["custom_options"] - if "output_dim" in custom_options: + custom_model_config = model.model_config["custom_model_config"] + if "output_dim" in custom_model_config: self.output_shape = tf.concat( - [tf.shape(inputs)[:1], custom_options["output_dim"]], axis=0) + [tf.shape(inputs)[:1], custom_model_config["output_dim"]], + axis=0) else: self.output_shape = tf.shape(inputs) super().__init__(inputs, model) @staticmethod def required_model_output_shape(action_space, model_config=None): - custom_options = model_config["custom_options"] or {} - if custom_options is not None and custom_options.get("output_dim"): - return custom_options.get("output_dim") + custom_model_config = model_config["custom_model_config"] or {} + if custom_model_config is not None and \ + custom_model_config.get("output_dim"): + return custom_model_config.get("output_dim") return action_space.shape @override(TFActionDistribution) @@ -157,7 +159,7 @@ class ModelCatalogTest(unittest.TestCase): dist.entropy() # test passing the options to it - model_config["custom_options"].update({"output_dim": (3, )}) + model_config["custom_model_config"].update({"output_dim": (3, )}) dist_cls, param_shape = ModelCatalog.get_action_dist( action_space, model_config) self.assertEqual(param_shape, (3, ))