From a90cd0fcbb81c2be472b16c61bebc75125d16133 Mon Sep 17 00:00:00 2001
From: Sven Mika <sven@anyscale.io>
Date: Thu, 11 Jun 2020 14:29:57 +0200
Subject: [PATCH] [RLlib] Unity3d soccer benchmarks (#8834)

---
 doc/source/rllib-algorithms.rst               | 10 ++-
 rllib/agents/trainer.py                       | 24 +++----
 rllib/contrib/bandits/envs/discrete.py        |  9 ++-
 rllib/env/unity3d_env.py                      | 49 +++++++++----
 rllib/evaluation/worker_set.py                |  5 +-
 rllib/examples/unity3d_env_local.py           | 68 +++++++++++++++----
 .../ppo/repeatafterme-ppo-lstm.yaml           |  2 +-
 ...unity3d-soccer-strikers-vs-goalie-ppo.yaml | 46 +++++++++++++
 8 files changed, 162 insertions(+), 51 deletions(-)
 create mode 100644 rllib/tuned_examples/ppo/unity3d-soccer-strikers-vs-goalie-ppo.yaml

diff --git a/doc/source/rllib-algorithms.rst b/doc/source/rllib-algorithms.rst
index a47399122..b4880cfc2 100644
--- a/doc/source/rllib-algorithms.rst
+++ b/doc/source/rllib-algorithms.rst
@@ -333,7 +333,15 @@ PPO's clipped objective supports multiple SGD passes over the same batch of expe
 
     PPO architecture
 
-Tuned examples: `Humanoid-v1 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/humanoid-ppo-gae.yaml>`__, `Hopper-v1 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/hopper-ppo.yaml>`__, `Pendulum-v0 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/pendulum-ppo.yaml>`__, `PongDeterministic-v4 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/pong-ppo.yaml>`__, `Walker2d-v1 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/walker2d-ppo.yaml>`__, `HalfCheetah-v2 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/halfcheetah-ppo.yaml>`__, `{BeamRider,Breakout,Qbert,SpaceInvaders}NoFrameskip-v4 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/atari-ppo.yaml>`__
+Tuned examples:
+`Unity3D Soccer (multi-agent: Strikers vs Goalie) <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/unity3d-soccer-strikers-vs-goalie-ppo.yaml>`__,
+`Humanoid-v1 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/humanoid-ppo-gae.yaml>`__,
+`Hopper-v1 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/hopper-ppo.yaml>`__,
+`Pendulum-v0 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/pendulum-ppo.yaml>`__,
+`PongDeterministic-v4 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/pong-ppo.yaml>`__,
+`Walker2d-v1 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/walker2d-ppo.yaml>`__,
+`HalfCheetah-v2 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/halfcheetah-ppo.yaml>`__,
+`{BeamRider,Breakout,Qbert,SpaceInvaders}NoFrameskip-v4 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/atari-ppo.yaml>`__
 
 
 **Atari results**: `more details <https://github.com/ray-project/rl-experiments>`__
diff --git a/rllib/agents/trainer.py b/rllib/agents/trainer.py
index 467ff8bff..b578f3ec4 100644
--- a/rllib/agents/trainer.py
+++ b/rllib/agents/trainer.py
@@ -958,24 +958,18 @@ class Trainer(Trainable):
     @staticmethod
     def _validate_config(config: dict):
         if "policy_graphs" in config["multiagent"]:
-            logger.warning(
-                "The `policy_graphs` config has been renamed to `policies`.")
-            # Backwards compatibility
-            config["multiagent"]["policies"] = config["multiagent"][
-                "policy_graphs"]
-            del config["multiagent"]["policy_graphs"]
+            deprecation_warning("policy_graphs", "policies")
+            # Backwards compatibility.
+            config["multiagent"]["policies"] = config["multiagent"].pop(
+                "policy_graphs")
         if "gpu" in config:
-            raise ValueError(
-                "The `gpu` config is deprecated, please use `num_gpus=0|1` "
-                "instead.")
+            deprecation_warning("gpu", "num_gpus=0|1", error=True)
         if "gpu_fraction" in config:
-            raise ValueError(
-                "The `gpu_fraction` config is deprecated, please use "
-                "`num_gpus=<fraction>` instead.")
+            deprecation_warning(
+                "gpu_fraction", "num_gpus=<fraction>", error=True)
         if "use_gpu_for_workers" in config:
-            raise ValueError(
-                "The `use_gpu_for_workers` config is deprecated, please use "
-                "`num_gpus_per_worker=1` instead.")
+            deprecation_warning(
+                "use_gpu_for_workers", "num_gpus_per_worker=1", error=True)
         if type(config["input_evaluation"]) != list:
             raise ValueError(
                 "`input_evaluation` must be a list of strings, got {}".format(
diff --git a/rllib/contrib/bandits/envs/discrete.py b/rllib/contrib/bandits/envs/discrete.py
index 5342df3b3..540bd46ff 100644
--- a/rllib/contrib/bandits/envs/discrete.py
+++ b/rllib/contrib/bandits/envs/discrete.py
@@ -14,11 +14,10 @@ DEFAULT_CONFIG_LINEAR = {
 class LinearDiscreteEnv(gym.Env):
     """Samples data from linearly parameterized arms.
 
-      The reward for context X and arm i is given by X^T * theta_i, for some
-      latent set of parameters {theta_i : i = 1, ..., k}. The thetas are sampled
-      uniformly at random, the contexts are Gaussian, and Gaussian noise is
-      added to the rewards.
-
+    The reward for context X and arm i is given by X^T * theta_i, for some
+    latent set of parameters {theta_i : i = 1, ..., k}.
+    The thetas are sampled uniformly at random, the contexts are Gaussian,
+    and Gaussian noise is added to the rewards.
     """
 
     def __init__(self, config=None):
diff --git a/rllib/env/unity3d_env.py b/rllib/env/unity3d_env.py
index c66a6c14c..66c7337d0 100644
--- a/rllib/env/unity3d_env.py
+++ b/rllib/env/unity3d_env.py
@@ -13,7 +13,10 @@ logger = logging.getLogger(__name__)
 class Unity3DEnv(MultiAgentEnv):
     """A MultiAgentEnv representing a single Unity3D game instance.
 
-    For an example on how to use this class inside a Unity game client, which
+    For an example on how to use this Env with a running Unity3D editor
+    or with a compiled game, see:
+    `rllib/examples/unity3d_env_local.py`
+    For an example on how to use it inside a Unity game client, which
     connects to an RLlib Policy server, see:
     `rllib/examples/serving/unity3d_[client|server].py`
 
@@ -191,42 +194,60 @@ class Unity3DEnv(MultiAgentEnv):
         # The RLlib server must know about the Spaces that the Client will be
         # using inside Unity3D, up-front.
         obs_spaces = {
+            # 3DBall.
+            "3DBall": Box(float("-inf"), float("inf"), (8, )),
+            # 3DBallHard.
+            "3DBallHard": Box(float("-inf"), float("inf"), (45, )),
             # SoccerStrikersVsGoalie.
+            "Goalie": Box(float("-inf"), float("inf"), (738, )),
             "Striker": Tuple([
                 Box(float("-inf"), float("inf"), (231, )),
                 Box(float("-inf"), float("inf"), (63, )),
             ]),
-            "Goalie": Box(float("-inf"), float("inf"), (738, )),
-            # 3DBall.
-            "Agent": Box(float("-inf"), float("inf"), (8, )),
+            # Tennis.
+            "Tennis": Box(float("-inf"), float("inf"), (27, )),
+            # VisualHallway.
+            "VisualHallway": Box(float("-inf"), float("inf"), (84, 84, 3)),
+            # Walker.
+            "Walker": Box(float("-inf"), float("inf"), (212, )),
         }
         action_spaces = {
-            # SoccerStrikersVsGoalie.
-            "Striker": MultiDiscrete([3, 3, 3]),
-            "Goalie": MultiDiscrete([3, 3, 3]),
             # 3DBall.
-            "Agent": Box(float("-inf"), float("inf"), (2, ), dtype=np.float32),
+            "3DBall": Box(
+                float("-inf"), float("inf"), (2, ), dtype=np.float32),
+            # 3DBallHard.
+            "3DBallHard": Box(
+                float("-inf"), float("inf"), (2, ), dtype=np.float32),
+            # SoccerStrikersVsGoalie.
+            "Goalie": MultiDiscrete([3, 3, 3]),
+            "Striker": MultiDiscrete([3, 3, 3]),
+            # Tennis.
+            "Tennis": Box(float("-inf"), float("inf"), (3, )),
+            # VisualHallway.
+            "VisualHallway": MultiDiscrete([5]),
+            # Walker.
+            "Walker": Box(float("-inf"), float("inf"), (39, )),
         }
 
         # Policies (Unity: "behaviors") and agent-to-policy mapping fns.
         if game_name == "SoccerStrikersVsGoalie":
             policies = {
-                "Striker": (None, obs_spaces["Striker"],
-                            action_spaces["Striker"], {}),
                 "Goalie": (None, obs_spaces["Goalie"], action_spaces["Goalie"],
                            {}),
+                "Striker": (None, obs_spaces["Striker"],
+                            action_spaces["Striker"], {}),
             }
 
             def policy_mapping_fn(agent_id):
                 return "Striker" if "Striker" in agent_id else "Goalie"
 
-        else:  # 3DBall
+        else:
             policies = {
-                "Agent": (None, obs_spaces["Agent"], action_spaces["Agent"],
-                          {})
+                game_name: (None, obs_spaces[game_name],
+                            action_spaces[game_name], {}),
             }
 
             def policy_mapping_fn(agent_id):
-                return "Agent"
+                return game_name
 
         return policies, policy_mapping_fn
diff --git a/rllib/evaluation/worker_set.py b/rllib/evaluation/worker_set.py
index 01198d5a1..5e52a0fc1 100644
--- a/rllib/evaluation/worker_set.py
+++ b/rllib/evaluation/worker_set.py
@@ -7,7 +7,8 @@ from ray.rllib.evaluation.rollout_worker import RolloutWorker, \
     _validate_multiagent_config
 from ray.rllib.offline import NoopOutput, JsonReader, MixedInput, JsonWriter, \
     ShuffledInput
-from ray.rllib.utils import merge_dicts, try_import_tf
+from ray.rllib.utils import merge_dicts
+from ray.rllib.utils.framework import try_import_tf
 
 tf = try_import_tf()
 
@@ -226,7 +227,7 @@ class WorkerSet:
         else:
             input_evaluation = config["input_evaluation"]
 
-        # Fill in the default policy if 'None' is specified in multiagent
+        # Fill in the default policy if 'None' is specified in multiagent.
         if config["multiagent"]["policies"]:
             tmp = config["multiagent"]["policies"]
             _validate_multiagent_config(tmp, allow_none_graph=True)
diff --git a/rllib/examples/unity3d_env_local.py b/rllib/examples/unity3d_env_local.py
index 380eb527e..83b5bbb04 100644
--- a/rllib/examples/unity3d_env_local.py
+++ b/rllib/examples/unity3d_env_local.py
@@ -10,7 +10,8 @@ To run this script against a local Unity3D engine:
 2) Open the Unity3D Editor and load an example scene from the following
    ml-agents pip package location:
    `.../ml-agents/Project/Assets/ML-Agents/Examples/`
-   This script supports the `3DBall` and `SoccerStrikersVsGoalie` examples.
+   This script supports the `3DBall`, `3DBallHard`, `SoccerStrikersVsGoalie`,
+   `Tennis`, `VisualHallway`, and `Walker` examples.
    Specify the game you chose on your command line via e.g. `--env 3DBall`.
    Feel free to add more supported examples here.
 
@@ -31,13 +32,31 @@ parser.add_argument(
     "--env",
     type=str,
     default="3DBall",
-    choices=["3DBall", "SoccerStrikersVsGoalie"],
-    help="The name of the Env to run in the Unity3D editor. Either `3DBall` "
-    "or `SoccerStrikersVsGoalie` (feel free to add more to this script!)")
+    choices=[
+        "3DBall", "3DBallHard", "SoccerStrikersVsGoalie", "Tennis",
+        "VisualHallway", "Walker"
+    ],
+    help="The name of the Env to run in the Unity3D editor: `3DBall(Hard)?|"
+    "SoccerStrikersVsGoalie|Tennis|VisualHallway|Walker` (feel free to add "
+    "more and PR!)")
+parser.add_argument(
+    "--file-name",
+    type=str,
+    default=None,
+    help="The Unity3d binary (compiled) game, e.g. "
+    "'/home/ubuntu/soccer_strikers_vs_goalie_linux.x86_64'. Use `None` for "
+    "a currently running Unity3D editor.")
+parser.add_argument(
+    "--from-checkpoint",
+    type=str,
+    default=None,
+    help="Full path to a checkpoint file for restoring a previously saved "
+    "Trainer state.")
+parser.add_argument("--num-workers", type=int, default=0)
 parser.add_argument("--as-test", action="store_true")
-parser.add_argument("--stop-iters", type=int, default=150)
+parser.add_argument("--stop-iters", type=int, default=9999)
 parser.add_argument("--stop-reward", type=float, default=9999.0)
-parser.add_argument("--stop-timesteps", type=int, default=100000)
+parser.add_argument("--stop-timesteps", type=int, default=10000000)
 parser.add_argument(
     "--horizon",
     type=int,
@@ -53,7 +72,9 @@ if __name__ == "__main__":
 
     tune.register_env(
         "unity3d",
-        lambda c: Unity3DEnv(episode_horizon=c.get("episode_horizon", 1000)))
+        lambda c: Unity3DEnv(
+            file_name=c["file_name"],
+            episode_horizon=c["episode_horizon"]))
 
     # Get policies (different agent types; "behaviors" in MLAgents) and
     # the mappings from individual agents to Policies.
@@ -63,20 +84,35 @@ if __name__ == "__main__":
     config = {
         "env": "unity3d",
         "env_config": {
+            "file_name": args.file_name,
             "episode_horizon": args.horizon,
         },
-        # IMPORTANT: Just use one Worker (we only have one Unity running)!
-        "num_workers": 0,
+        # For running in editor, force to use just one Worker (we only have
+        # one Unity running)!
+        "num_workers": args.num_workers if args.file_name else 0,
         # Other settings.
-        "sample_batch_size": 64,
-        "train_batch_size": 256,
-        "rollout_fragment_length": 20,
+        "lr": 0.0003,
+        "lambda": 0.95,
+        "gamma": 0.99,
+        "sgd_minibatch_size": 256,
+        "train_batch_size": 4000,
+        "num_sgd_iter": 20,
+        "rollout_fragment_length": 200,
+        "clip_param": 0.2,
         # Multi-agent setup for the particular env.
         "multiagent": {
             "policies": policies,
             "policy_mapping_fn": policy_mapping_fn,
         },
+        "model": {
+            "fcnet_hiddens": [512, 512],
+        },
         "framework": "tf",
+        "no_done_at_end": True,
+        # If no executable is provided (use Unity3D editor), do not evaluate,
+        # b/c the editor only allows one connection at a time.
+        "evaluation_interval": 10 if args.file_name else 0,
+        "evaluation_num_episodes": 1,
     }
 
     stop = {
@@ -86,7 +122,13 @@ if __name__ == "__main__":
     }
 
     # Run the experiment.
-    results = tune.run("PPO", config=config, stop=stop, verbose=1)
+    results = tune.run(
+        "PPO",
+        config=config,
+        stop=stop,
+        verbose=1,
+        checkpoint_freq=10,
+        restore=args.from_checkpoint)
 
     # And check the results.
     if args.as_test:
diff --git a/rllib/tuned_examples/ppo/repeatafterme-ppo-lstm.yaml b/rllib/tuned_examples/ppo/repeatafterme-ppo-lstm.yaml
index e737cb09d..700e420d7 100644
--- a/rllib/tuned_examples/ppo/repeatafterme-ppo-lstm.yaml
+++ b/rllib/tuned_examples/ppo/repeatafterme-ppo-lstm.yaml
@@ -1,5 +1,5 @@
 repeat-after-me-ppo-w-lstm:
-    env: "ray.rllib.examples.env.repeat_after_me_env.RepeatAfterMeEnv"
+    env: ray.rllib.examples.env.repeat_after_me_env.RepeatAfterMeEnv
     run: PPO
     stop:
         episode_reward_mean: 50
diff --git a/rllib/tuned_examples/ppo/unity3d-soccer-strikers-vs-goalie-ppo.yaml b/rllib/tuned_examples/ppo/unity3d-soccer-strikers-vs-goalie-ppo.yaml
new file mode 100644
index 000000000..5a089f5a6
--- /dev/null
+++ b/rllib/tuned_examples/ppo/unity3d-soccer-strikers-vs-goalie-ppo.yaml
@@ -0,0 +1,46 @@
+# NOTE: This example will not run w/o a proper config.multiagent setup,
+#       which currently cannot be done in yaml.
+
+# This setup should learn a decent (not perfect) policy within 100k timesteps
+# on a single GPU machine (16 CPUs) using 10 workers (collecting data from
+# 10 compiled game binaries in parallel).
+# Reported rewards will be the sum of both strikers (+1 if goal) plus the
+# goalie's reward (-1 if goal) across all within-scene parallelized playing
+# fields (8 fields, each with 2 strikers + 1 goalie, for the soccer env).
+unity3d-soccer-strikers-vs-goalie-ppo:
+    env: ray.rllib.env.unity3d_env.Unity3DEnv
+    run: PPO
+    stop:
+        timesteps_total: 1000000
+    config:
+        # NOTE: This example will not run w/o the following multiagent setup:
+        # Multi-agent setup for SoccerStrikersVsGoalie Unity3D Env.
+        # multiagent:
+        #    policies: [policies list]
+        #    policy_mapping_fn: [agent-to-policy mapping function]
+
+        # Works for both torch and tf.
+        framework: tf
+        env_config:
+          # Put the path to your compiled game executable here.
+          file_name: /home/ubuntu/soccer_strikers_vs_goalie_linux.x86_64
+          # Timesteps after which a hard-reset will happen (all agents).
+          episode_horizon: 3000
+        lr: 0.0003
+        lambda: 0.95
+        gamma: 0.99
+        sgd_minibatch_size: 256
+        train_batch_size: 4000
+        clip_param: 0.2
+        # Uses 10 compiled game binaries in parallel. For running in the
+        # editor, use 0 workers instead (we only have one Unity running)!
+        num_workers: 10
+        num_sgd_iter: 20
+        rollout_fragment_length: 200
+        no_done_at_end: true
+        model:
+          fcnet_hiddens: [512, 512]
+        # If no executable is provided (use Unity3D editor), do not evaluate,
+        # b/c the editor only allows one connection at a time.
+        evaluation_interval: 0
+        evaluation_num_episodes: 1