From 0422e9c5a86ccb91fa4e25f003b385a451f16abc Mon Sep 17 00:00:00 2001
From: Sven Mika <sven@anyscale.io>
Date: Wed, 27 May 2020 10:19:47 +0200
Subject: [PATCH] [RLlib] Add 2 Transformer learning test cases on
 StatelessCartPole (PPO and IMPALA). (#8624)

---
 doc/source/rllib-models.rst                   |  9 ++-
 rllib/BUILD                                   | 19 +++--
 .../alpha_zero/models/custom_torch_models.py  |  4 +-
 rllib/examples/attention_net.py               |  2 +-
 rllib/examples/custom_loss.py                 |  2 +-
 rllib/examples/mobilenet_v2_with_lstm.py      |  2 +-
 rllib/examples/models/custom_loss_model.py    |  5 +-
 rllib/models/catalog.py                       | 25 +++++--
 rllib/models/tf/attention_net.py              |  2 +-
 rllib/policy/dynamic_tf_policy.py             |  2 +-
 rllib/policy/torch_policy_template.py         |  2 +-
 rllib/tests/test_attention_net_learning.py    | 74 +++++++++++++++++++
 rllib/tests/test_avail_actions_qmix.py        |  2 +-
 rllib/tests/test_catalog.py                   | 16 ++--
 14 files changed, 133 insertions(+), 33 deletions(-)
 create mode 100644 rllib/tests/test_attention_net_learning.py

diff --git a/doc/source/rllib-models.rst b/doc/source/rllib-models.rst
index 2f82e3f75..3002bcf4a 100644
--- a/doc/source/rllib-models.rst
+++ b/doc/source/rllib-models.rst
@@ -71,7 +71,8 @@ Once implemented, the model can then be registered and used in place of a built-
     trainer = ppo.PPOTrainer(env="CartPole-v0", config={
         "model": {
             "custom_model": "my_model",
-            "custom_options": {},  # extra options to pass to your model
+            # Extra kwargs to be passed to your model's c'tor.
+            "custom_model_config": {},
         },
     })
 
@@ -132,7 +133,8 @@ Once implemented, the model can then be registered and used in place of a built-
         "use_pytorch": True,
         "model": {
             "custom_model": "my_model",
-            "custom_options": {},  # extra options to pass to your model
+            # Extra kwargs to be passed to your model's c'tor.
+            "custom_model_config": {},
         },
     })
 
@@ -165,7 +167,8 @@ Custom preprocessors should subclass the RLlib `preprocessor class <https://gith
     trainer = ppo.PPOTrainer(env="CartPole-v0", config={
         "model": {
             "custom_preprocessor": "my_prep",
-            "custom_options": {},  # extra options to pass to your preprocessor
+            # Extra kwargs to be passed to your model's c'tor.
+            "custom_model_config": {},
         },
     })
 
diff --git a/rllib/BUILD b/rllib/BUILD
index 75b8d4d45..1234af039 100644
--- a/rllib/BUILD
+++ b/rllib/BUILD
@@ -99,7 +99,7 @@ py_test(
     name = "run_regression_tests_cartpole_appo_torch",
     main = "tests/run_regression_tests.py",
     tags = ["learning_tests_torch", "learning_tests_cartpole"],
-    size = "medium",
+    size = "large",
     srcs = ["tests/run_regression_tests.py"],
     data = [
         "tuned_examples/ppo/cartpole-appo.yaml",
@@ -184,7 +184,7 @@ py_test(
     name = "run_regression_tests_cartpole_es_tf",
     main = "tests/run_regression_tests.py",
     tags = ["learning_tests_tf", "learning_tests_cartpole"],
-    size = "medium",
+    size = "large",
     srcs = ["tests/run_regression_tests.py"],
     data = ["tuned_examples/es/cartpole-es.yaml"],
     args = ["--yaml-dir=tuned_examples/es"]
@@ -194,7 +194,7 @@ py_test(
     name = "run_regression_tests_cartpole_es_torch",
     main = "tests/run_regression_tests.py",
     tags = ["learning_tests_torch", "learning_tests_cartpole"],
-    size = "medium",
+    size = "large",
     srcs = ["tests/run_regression_tests.py"],
     data = ["tuned_examples/es/cartpole-es.yaml"],
     args = ["--yaml-dir=tuned_examples/es", "--torch"]
@@ -205,7 +205,7 @@ py_test(
     name = "run_regression_tests_cartpole_impala_tf",
     main = "tests/run_regression_tests.py",
     tags = ["learning_tests_tf", "learning_tests_cartpole"],
-    size = "medium",
+    size = "large",
     srcs = ["tests/run_regression_tests.py"],
     data = ["tuned_examples/impala/cartpole-impala.yaml"],
     args = ["--yaml-dir=tuned_examples/impala"]
@@ -215,7 +215,7 @@ py_test(
     name = "run_regression_tests_cartpole_impala_torch",
     main = "tests/run_regression_tests.py",
     tags = ["learning_tests_torch", "learning_tests_cartpole"],
-    size = "medium",
+    size = "large",
     srcs = ["tests/run_regression_tests.py"],
     data = ["tuned_examples/impala/cartpole-impala.yaml"],
     args = ["--yaml-dir=tuned_examples/impala", "--torch"]
@@ -298,7 +298,7 @@ py_test(
     name = "run_regression_tests_cartpole_sac_torch",
     main = "tests/run_regression_tests.py",
     tags = ["learning_tests_torch", "learning_tests_cartpole"],
-    size = "medium",
+    size = "large",
     srcs = ["tests/run_regression_tests.py"],
     data = ["tuned_examples/sac/cartpole-sac.yaml"],
     args = ["--yaml-dir=tuned_examples/sac", "--torch"]
@@ -1306,6 +1306,13 @@ py_test(
 # for `tests/test_all_stuff.py`.
 # --------------------------------------------------------------------
 
+py_test(
+    name = "tests/test_attention_net_learning",
+    tags = ["tests_dir", "tests_dir_A"],
+    size = "large",
+    srcs = ["tests/test_attention_net_learning.py"]
+)
+
 py_test(
     name = "tests/test_avail_actions_qmix",
     tags = ["tests_dir", "tests_dir_A"],
diff --git a/rllib/contrib/alpha_zero/models/custom_torch_models.py b/rllib/contrib/alpha_zero/models/custom_torch_models.py
index 53bb5f06d..260c55837 100644
--- a/rllib/contrib/alpha_zero/models/custom_torch_models.py
+++ b/rllib/contrib/alpha_zero/models/custom_torch_models.py
@@ -73,8 +73,8 @@ class ConvNetModel(ActorCriticModel):
         ActorCriticModel.__init__(self, obs_space, action_space, num_outputs,
                                   model_config, name)
 
-        in_channels = model_config["custom_options"]["in_channels"]
-        feature_dim = model_config["custom_options"]["feature_dim"]
+        in_channels = model_config["custom_model_config"]["in_channels"]
+        feature_dim = model_config["custom_model_config"]["feature_dim"]
 
         self.shared_layers = nn.Sequential(
             nn.Conv2d(in_channels, 32, kernel_size=4, stride=2),
diff --git a/rllib/examples/attention_net.py b/rllib/examples/attention_net.py
index 8360b4b53..1cd91d053 100644
--- a/rllib/examples/attention_net.py
+++ b/rllib/examples/attention_net.py
@@ -50,7 +50,7 @@ if __name__ == "__main__":
         "model": {
             "custom_model": GTrXLNet,
             "max_seq_len": 50,
-            "custom_options": {
+            "custom_model_config": {
                 "num_transformer_units": 1,
                 "attn_dim": 64,
                 "num_heads": 2,
diff --git a/rllib/examples/custom_loss.py b/rllib/examples/custom_loss.py
index 4283075b2..798827570 100644
--- a/rllib/examples/custom_loss.py
+++ b/rllib/examples/custom_loss.py
@@ -53,7 +53,7 @@ if __name__ == "__main__":
         "num_workers": 0,
         "model": {
             "custom_model": "custom_loss",
-            "custom_options": {
+            "custom_model_config": {
                 "input_files": args.input_files,
             },
         },
diff --git a/rllib/examples/mobilenet_v2_with_lstm.py b/rllib/examples/mobilenet_v2_with_lstm.py
index 36529069e..c4ba89169 100644
--- a/rllib/examples/mobilenet_v2_with_lstm.py
+++ b/rllib/examples/mobilenet_v2_with_lstm.py
@@ -36,7 +36,7 @@ if __name__ == "__main__":
         "model": {
             "custom_model": "my_model",
             # Extra config passed to the custom model's c'tor as kwargs.
-            "custom_options": {
+            "custom_model_config": {
                 "cnn_shape": cnn_shape_torch if args.torch else cnn_shape,
             },
             "max_seq_len": 20,
diff --git a/rllib/examples/models/custom_loss_model.py b/rllib/examples/models/custom_loss_model.py
index fd269acdb..2d3acd1de 100644
--- a/rllib/examples/models/custom_loss_model.py
+++ b/rllib/examples/models/custom_loss_model.py
@@ -38,7 +38,8 @@ class CustomLossModel(TFModelV2):
     @override(ModelV2)
     def custom_loss(self, policy_loss, loss_inputs):
         # Create a new input reader per worker.
-        reader = JsonReader(self.model_config["custom_options"]["input_files"])
+        reader = JsonReader(
+            self.model_config["custom_model_config"]["input_files"])
         input_ops = reader.tf_input_ops()
 
         # Define a secondary loss by building a graph copy with weight sharing.
@@ -80,7 +81,7 @@ class DeprecatedCustomLossModelV1(Model):
 
     def custom_loss(self, policy_loss, loss_inputs):
         # create a new input reader per worker
-        reader = JsonReader(self.options["custom_options"]["input_files"])
+        reader = JsonReader(self.options["custom_model_config"]["input_files"])
         input_ops = reader.tf_input_ops()
 
         # define a secondary loss by building a graph copy with weight sharing
diff --git a/rllib/models/catalog.py b/rllib/models/catalog.py
index f226b4532..2ed0220b6 100644
--- a/rllib/models/catalog.py
+++ b/rllib/models/catalog.py
@@ -23,7 +23,7 @@ from ray.rllib.models.torch.torch_action_dist import TorchCategorical, \
     TorchMultiActionDistribution, TorchMultiCategorical
 from ray.rllib.utils import try_import_tf, try_import_tree
 from ray.rllib.utils.annotations import DeveloperAPI, PublicAPI
-from ray.rllib.utils.deprecation import deprecation_warning
+from ray.rllib.utils.deprecation import deprecation_warning, DEPRECATED_VALUE
 from ray.rllib.utils.error import UnsupportedSpaceException
 from ray.rllib.utils.space_utils import flatten_space
 
@@ -81,14 +81,17 @@ MODEL_DEFAULTS = {
     # === Options for custom models ===
     # Name of a custom model to use
     "custom_model": None,
+    # Extra options to pass to the custom classes.
+    # These will be available in the Model's
+    "custom_model_config": {},
     # Name of a custom action distribution to use.
     "custom_action_dist": None,
-
-    # Extra options to pass to the custom classes
-    "custom_options": {},
     # Custom preprocessors are deprecated. Please use a wrapper class around
     # your environment instead to preprocess observations.
     "custom_preprocessor": None,
+
+    # Deprecated config keys.
+    "custom_options": DEPRECATED_VALUE,
 }
 # __sphinx_doc_end__
 # yapf: enable
@@ -280,6 +283,16 @@ class ModelCatalog:
         """
 
         if model_config.get("custom_model"):
+
+            if "custom_options" in model_config and \
+                    model_config["custom_options"] != DEPRECATED_VALUE:
+                deprecation_warning(
+                    "model.custom_options",
+                    "model.custom_model_config",
+                    error=False)
+                model_config["custom_model_config"] = \
+                    model_config.pop("custom_options")
+
             if isinstance(model_config["custom_model"], type):
                 model_cls = model_config["custom_model"]
             else:
@@ -304,7 +317,7 @@ class ModelCatalog:
                     with tf.variable_creator_scope(track_var_creation):
                         # Try calling with kwargs first (custom ModelV2 should
                         # accept these as kwargs, not get them from
-                        # config["custom_options"] anymore)
+                        # config["custom_model_config"] anymore).
                         try:
                             instance = model_cls(obs_space, action_space,
                                                  num_outputs, model_config,
@@ -315,7 +328,7 @@ class ModelCatalog:
                                 logger.warning(
                                     "Custom ModelV2 should accept all custom "
                                     "options as **kwargs, instead of expecting"
-                                    " them in config['custom_options']!")
+                                    " them in config['custom_model_config']!")
                                 instance = model_cls(obs_space, action_space,
                                                      num_outputs, model_config,
                                                      name)
diff --git a/rllib/models/tf/attention_net.py b/rllib/models/tf/attention_net.py
index d4e25630c..513e99dda 100644
--- a/rllib/models/tf/attention_net.py
+++ b/rllib/models/tf/attention_net.py
@@ -146,7 +146,7 @@ class GTrXLNet(RecurrentNetwork):
     Examples:
         >> config["model"]["custom_model"] = GTrXLNet
         >> config["model"]["max_seq_len"] = 10
-        >> config["model"]["custom_options"] = {
+        >> config["model"]["custom_model_config"] = {
         >>     num_transformer_units=1,
         >>     attn_dim=32,
         >>     num_heads=2,
diff --git a/rllib/policy/dynamic_tf_policy.py b/rllib/policy/dynamic_tf_policy.py
index 072181e62..64c7f9cee 100644
--- a/rllib/policy/dynamic_tf_policy.py
+++ b/rllib/policy/dynamic_tf_policy.py
@@ -161,7 +161,7 @@ class DynamicTFPolicy(TFPolicy):
                 num_outputs=logit_dim,
                 model_config=self.config["model"],
                 framework="tf",
-                **self.config["model"].get("custom_options", {}))
+                **self.config["model"].get("custom_model_config", {}))
 
         # Create the Exploration object to use for this Policy.
         self.exploration = self._create_exploration()
diff --git a/rllib/policy/torch_policy_template.py b/rllib/policy/torch_policy_template.py
index 542214efc..e629f5c3c 100644
--- a/rllib/policy/torch_policy_template.py
+++ b/rllib/policy/torch_policy_template.py
@@ -117,7 +117,7 @@ def build_torch_policy(name,
                     num_outputs=logit_dim,
                     model_config=self.config["model"],
                     framework="torch",
-                    **self.config["model"].get("custom_options", {}))
+                    **self.config["model"].get("custom_model_config", {}))
 
             # Make sure, we passed in a correct Model factory.
             assert isinstance(self.model, TorchModelV2), \
diff --git a/rllib/tests/test_attention_net_learning.py b/rllib/tests/test_attention_net_learning.py
new file mode 100644
index 000000000..7b1f169a7
--- /dev/null
+++ b/rllib/tests/test_attention_net_learning.py
@@ -0,0 +1,74 @@
+import unittest
+
+from ray import tune
+from ray.rllib.examples.env.stateless_cartpole import StatelessCartPole
+from ray.rllib.models.catalog import ModelCatalog
+from ray.rllib.models.tf.attention_net import GTrXLNet
+
+
+class TestAttentionNetLearning(unittest.TestCase):
+
+    config = {
+        "env": StatelessCartPole,
+        "gamma": 0.99,
+        "num_envs_per_worker": 20,
+        # "framework": "tf",
+    }
+
+    stop = {
+        "episode_reward_mean": 180.0,
+        "timesteps_total": 5000000,
+    }
+
+    def test_ppo_attention_net_learning(self):
+        ModelCatalog.register_custom_model("attention_net", GTrXLNet)
+        config = dict(
+            self.config, **{
+                "num_workers": 0,
+                "entropy_coeff": 0.001,
+                "vf_loss_coeff": 1e-5,
+                "num_sgd_iter": 5,
+                "model": {
+                    "custom_model": "attention_net",
+                    "max_seq_len": 10,
+                    "custom_model_config": {
+                        "num_transformer_units": 1,
+                        "attn_dim": 32,
+                        "num_heads": 1,
+                        "memory_tau": 5,
+                        "head_dim": 32,
+                        "ff_hidden_dim": 32,
+                    },
+                },
+            })
+        tune.run("PPO", config=config, stop=self.stop, verbose=1)
+
+    def test_impala_attention_net_learning(self):
+        ModelCatalog.register_custom_model("attention_net", GTrXLNet)
+        config = dict(
+            self.config, **{
+                "num_workers": 4,
+                "num_gpus": 0,
+                "entropy_coeff": 0.01,
+                "vf_loss_coeff": 0.001,
+                "lr": 0.0008,
+                "model": {
+                    "custom_model": "attention_net",
+                    "max_seq_len": 65,
+                    "custom_model_config": {
+                        "num_transformer_units": 1,
+                        "attn_dim": 64,
+                        "num_heads": 1,
+                        "memory_tau": 10,
+                        "head_dim": 32,
+                        "ff_hidden_dim": 32,
+                    },
+                },
+            })
+        tune.run("IMPALA", config=config, stop=self.stop, verbose=1)
+
+
+if __name__ == "__main__":
+    import pytest
+    import sys
+    sys.exit(pytest.main(["-v", __file__]))
diff --git a/rllib/tests/test_avail_actions_qmix.py b/rllib/tests/test_avail_actions_qmix.py
index c1c0b6063..774734ef6 100644
--- a/rllib/tests/test_avail_actions_qmix.py
+++ b/rllib/tests/test_avail_actions_qmix.py
@@ -4,8 +4,8 @@ import unittest
 
 import ray
 from ray.tune import register_env
-from ray.rllib.env.multi_agent_env import MultiAgentEnv
 from ray.rllib.agents.qmix import QMixTrainer
+from ray.rllib.env.multi_agent_env import MultiAgentEnv
 
 
 class AvailActionsTestEnv(MultiAgentEnv):
diff --git a/rllib/tests/test_catalog.py b/rllib/tests/test_catalog.py
index f80a13a81..9c2c1f3da 100644
--- a/rllib/tests/test_catalog.py
+++ b/rllib/tests/test_catalog.py
@@ -35,19 +35,21 @@ class CustomModel(TFModelV2):
 class CustomActionDistribution(TFActionDistribution):
     def __init__(self, inputs, model):
         # Store our output shape.
-        custom_options = model.model_config["custom_options"]
-        if "output_dim" in custom_options:
+        custom_model_config = model.model_config["custom_model_config"]
+        if "output_dim" in custom_model_config:
             self.output_shape = tf.concat(
-                [tf.shape(inputs)[:1], custom_options["output_dim"]], axis=0)
+                [tf.shape(inputs)[:1], custom_model_config["output_dim"]],
+                axis=0)
         else:
             self.output_shape = tf.shape(inputs)
         super().__init__(inputs, model)
 
     @staticmethod
     def required_model_output_shape(action_space, model_config=None):
-        custom_options = model_config["custom_options"] or {}
-        if custom_options is not None and custom_options.get("output_dim"):
-            return custom_options.get("output_dim")
+        custom_model_config = model_config["custom_model_config"] or {}
+        if custom_model_config is not None and \
+                custom_model_config.get("output_dim"):
+            return custom_model_config.get("output_dim")
         return action_space.shape
 
     @override(TFActionDistribution)
@@ -157,7 +159,7 @@ class ModelCatalogTest(unittest.TestCase):
             dist.entropy()
 
         # test passing the options to it
-        model_config["custom_options"].update({"output_dim": (3, )})
+        model_config["custom_model_config"].update({"output_dim": (3, )})
         dist_cls, param_shape = ModelCatalog.get_action_dist(
             action_space, model_config)
         self.assertEqual(param_shape, (3, ))