[RLlib] Unity3d soccer benchmarks (#8834)

2026-07-03 12:03:10 +08:00 · 2020-06-11 14:29:57 +02:00
parent 9166e22085
commit a90cd0fcbb
8 changed files with 162 additions and 51 deletions
@@ -333,7 +333,15 @@ PPO's clipped objective supports multiple SGD passes over the same batch of expe

    PPO architecture

-Tuned examples: `Humanoid-v1 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/humanoid-ppo-gae.yaml>`__, `Hopper-v1 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/hopper-ppo.yaml>`__, `Pendulum-v0 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/pendulum-ppo.yaml>`__, `PongDeterministic-v4 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/pong-ppo.yaml>`__, `Walker2d-v1 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/walker2d-ppo.yaml>`__, `HalfCheetah-v2 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/halfcheetah-ppo.yaml>`__, `{BeamRider,Breakout,Qbert,SpaceInvaders}NoFrameskip-v4 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/atari-ppo.yaml>`__
+Tuned examples:
+`Unity3D Soccer (multi-agent: Strikers vs Goalie) <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/unity3d-soccer-strikers-vs-goalie-ppo.yaml>`__,
+`Humanoid-v1 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/humanoid-ppo-gae.yaml>`__,
+`Hopper-v1 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/hopper-ppo.yaml>`__,
+`Pendulum-v0 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/pendulum-ppo.yaml>`__,
+`PongDeterministic-v4 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/pong-ppo.yaml>`__,
+`Walker2d-v1 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/walker2d-ppo.yaml>`__,
+`HalfCheetah-v2 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/halfcheetah-ppo.yaml>`__,
+`{BeamRider,Breakout,Qbert,SpaceInvaders}NoFrameskip-v4 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/atari-ppo.yaml>`__


 **Atari results**: `more details <https://github.com/ray-project/rl-experiments>`__
@@ -958,24 +958,18 @@ class Trainer(Trainable):
    @staticmethod
    def _validate_config(config: dict):
        if "policy_graphs" in config["multiagent"]:
-            logger.warning(
-                "The `policy_graphs` config has been renamed to `policies`.")
-            # Backwards compatibility
-            config["multiagent"]["policies"] = config["multiagent"][
-                "policy_graphs"]
-            del config["multiagent"]["policy_graphs"]
+            deprecation_warning("policy_graphs", "policies")
+            # Backwards compatibility.
+            config["multiagent"]["policies"] = config["multiagent"].pop(
+                "policy_graphs")
        if "gpu" in config:
-            raise ValueError(
-                "The `gpu` config is deprecated, please use `num_gpus=0|1` "
-                "instead.")
+            deprecation_warning("gpu", "num_gpus=0|1", error=True)
        if "gpu_fraction" in config:
-            raise ValueError(
-                "The `gpu_fraction` config is deprecated, please use "
-                "`num_gpus=<fraction>` instead.")
+            deprecation_warning(
+                "gpu_fraction", "num_gpus=<fraction>", error=True)
        if "use_gpu_for_workers" in config:
-            raise ValueError(
-                "The `use_gpu_for_workers` config is deprecated, please use "
-                "`num_gpus_per_worker=1` instead.")
+            deprecation_warning(
+                "use_gpu_for_workers", "num_gpus_per_worker=1", error=True)
        if type(config["input_evaluation"]) != list:
            raise ValueError(
                "`input_evaluation` must be a list of strings, got {}".format(
@@ -14,11 +14,10 @@ DEFAULT_CONFIG_LINEAR = {
 class LinearDiscreteEnv(gym.Env):
    """Samples data from linearly parameterized arms.

-      The reward for context X and arm i is given by X^T * theta_i, for some
-      latent set of parameters {theta_i : i = 1, ..., k}. The thetas are sampled
-      uniformly at random, the contexts are Gaussian, and Gaussian noise is
-      added to the rewards.
-
+    The reward for context X and arm i is given by X^T * theta_i, for some
+    latent set of parameters {theta_i : i = 1, ..., k}.
+    The thetas are sampled uniformly at random, the contexts are Gaussian,
+    and Gaussian noise is added to the rewards.
    """

    def __init__(self, config=None):
@@ -13,7 +13,10 @@ logger = logging.getLogger(__name__)
 class Unity3DEnv(MultiAgentEnv):
    """A MultiAgentEnv representing a single Unity3D game instance.

-    For an example on how to use this class inside a Unity game client, which
+    For an example of how to use this Env with a running Unity3D editor
+    or with a compiled game, see:
+    `rllib/examples/unity3d_env_local.py`
+    For an example of how to use it inside a Unity game client, which
    connects to an RLlib Policy server, see:
    `rllib/examples/serving/unity3d_[client|server].py`

@@ -191,42 +194,60 @@ class Unity3DEnv(MultiAgentEnv):
        # The RLlib server must know about the Spaces that the Client will be
        # using inside Unity3D, up-front.
        obs_spaces = {
+            # 3DBall.
+            "3DBall": Box(float("-inf"), float("inf"), (8, )),
+            # 3DBallHard.
+            "3DBallHard": Box(float("-inf"), float("inf"), (45, )),
            # SoccerStrikersVsGoalie.
+            "Goalie": Box(float("-inf"), float("inf"), (738, )),
            "Striker": Tuple([
                Box(float("-inf"), float("inf"), (231, )),
                Box(float("-inf"), float("inf"), (63, )),
            ]),
-            "Goalie": Box(float("-inf"), float("inf"), (738, )),
-            # 3DBall.
-            "Agent": Box(float("-inf"), float("inf"), (8, )),
+            # Tennis.
+            "Tennis": Box(float("-inf"), float("inf"), (27, )),
+            # VisualHallway.
+            "VisualHallway": Box(float("-inf"), float("inf"), (84, 84, 3)),
+            # Walker.
+            "Walker": Box(float("-inf"), float("inf"), (212, )),
        }
        action_spaces = {
-            # SoccerStrikersVsGoalie.
-            "Striker": MultiDiscrete([3, 3, 3]),
-            "Goalie": MultiDiscrete([3, 3, 3]),
            # 3DBall.
-            "Agent": Box(float("-inf"), float("inf"), (2, ), dtype=np.float32),
+            "3DBall": Box(
+                float("-inf"), float("inf"), (2, ), dtype=np.float32),
+            # 3DBallHard.
+            "3DBallHard": Box(
+                float("-inf"), float("inf"), (2, ), dtype=np.float32),
+            # SoccerStrikersVsGoalie.
+            "Goalie": MultiDiscrete([3, 3, 3]),
+            "Striker": MultiDiscrete([3, 3, 3]),
+            # Tennis.
+            "Tennis": Box(float("-inf"), float("inf"), (3, )),
+            # VisualHallway.
+            "VisualHallway": MultiDiscrete([5]),
+            # Walker.
+            "Walker": Box(float("-inf"), float("inf"), (39, )),
        }

        # Policies (Unity: "behaviors") and agent-to-policy mapping fns.
        if game_name == "SoccerStrikersVsGoalie":
            policies = {
-                "Striker": (None, obs_spaces["Striker"],
-                            action_spaces["Striker"], {}),
                "Goalie": (None, obs_spaces["Goalie"], action_spaces["Goalie"],
                           {}),
+                "Striker": (None, obs_spaces["Striker"],
+                            action_spaces["Striker"], {}),
            }

            def policy_mapping_fn(agent_id):
                return "Striker" if "Striker" in agent_id else "Goalie"

-        else:  # 3DBall
+        else:
            policies = {
-                "Agent": (None, obs_spaces["Agent"], action_spaces["Agent"],
-                          {})
+                game_name: (None, obs_spaces[game_name],
+                            action_spaces[game_name], {}),
            }

            def policy_mapping_fn(agent_id):
-                return "Agent"
+                return game_name

        return policies, policy_mapping_fn
@@ -7,7 +7,8 @@ from ray.rllib.evaluation.rollout_worker import RolloutWorker, \
    _validate_multiagent_config
 from ray.rllib.offline import NoopOutput, JsonReader, MixedInput, JsonWriter, \
    ShuffledInput
-from ray.rllib.utils import merge_dicts, try_import_tf
+from ray.rllib.utils import merge_dicts
+from ray.rllib.utils.framework import try_import_tf

 tf = try_import_tf()

@@ -226,7 +227,7 @@ class WorkerSet:
        else:
            input_evaluation = config["input_evaluation"]

-        # Fill in the default policy if 'None' is specified in multiagent
+        # Fill in the default policy if 'None' is specified in multiagent.
        if config["multiagent"]["policies"]:
            tmp = config["multiagent"]["policies"]
            _validate_multiagent_config(tmp, allow_none_graph=True)
@@ -10,7 +10,8 @@ To run this script against a local Unity3D engine:
 2) Open the Unity3D Editor and load an example scene from the following
   ml-agents pip package location:
   `.../ml-agents/Project/Assets/ML-Agents/Examples/`
-   This script supports the `3DBall` and `SoccerStrikersVsGoalie` examples.
+   This script supports the `3DBall`, `3DBallHard`, `SoccerStrikersVsGoalie`,
+   `Tennis`, `VisualHallway`, and `Walker` examples.
   Specify the game you chose on your command line via e.g. `--env 3DBall`.
   Feel free to add more supported examples here.

@@ -31,13 +32,31 @@ parser.add_argument(
    "--env",
    type=str,
    default="3DBall",
-    choices=["3DBall", "SoccerStrikersVsGoalie"],
-    help="The name of the Env to run in the Unity3D editor. Either `3DBall` "
-    "or `SoccerStrikersVsGoalie` (feel free to add more to this script!)")
+    choices=[
+        "3DBall", "3DBallHard", "SoccerStrikersVsGoalie", "Tennis",
+        "VisualHallway", "Walker"
+    ],
+    help="The name of the Env to run in the Unity3D editor: `3DBall(Hard)?|"
+    "SoccerStrikersVsGoalie|Tennis|VisualHallway|Walker` (feel free to add "
+    "more and PR!)")
+parser.add_argument(
+    "--file-name",
+    type=str,
+    default=None,
+    help="The Unity3d binary (compiled) game, e.g. "
+    "'/home/ubuntu/soccer_strikers_vs_goalie_linux.x86_64'. Use `None` for "
+    "a currently running Unity3D editor.")
+parser.add_argument(
+    "--from-checkpoint",
+    type=str,
+    default=None,
+    help="Full path to a checkpoint file for restoring a previously saved "
+    "Trainer state.")
+parser.add_argument("--num-workers", type=int, default=0)
 parser.add_argument("--as-test", action="store_true")
-parser.add_argument("--stop-iters", type=int, default=150)
+parser.add_argument("--stop-iters", type=int, default=9999)
 parser.add_argument("--stop-reward", type=float, default=9999.0)
-parser.add_argument("--stop-timesteps", type=int, default=100000)
+parser.add_argument("--stop-timesteps", type=int, default=10000000)
 parser.add_argument(
    "--horizon",
    type=int,
@@ -53,7 +72,9 @@ if __name__ == "__main__":

    tune.register_env(
        "unity3d",
-        lambda c: Unity3DEnv(episode_horizon=c.get("episode_horizon", 1000)))
+        lambda c: Unity3DEnv(
+            file_name=c["file_name"],
+            episode_horizon=c["episode_horizon"]))

    # Get policies (different agent types; "behaviors" in MLAgents) and
    # the mappings from individual agents to Policies.
@@ -63,20 +84,35 @@ if __name__ == "__main__":
    config = {
        "env": "unity3d",
        "env_config": {
+            "file_name": args.file_name,
            "episode_horizon": args.horizon,
        },
-        # IMPORTANT: Just use one Worker (we only have one Unity running)!
-        "num_workers": 0,
+        # When running in the editor, force the use of just one Worker (we
+        # only have one Unity running)!
+        "num_workers": args.num_workers if args.file_name else 0,
        # Other settings.
-        "sample_batch_size": 64,
-        "train_batch_size": 256,
-        "rollout_fragment_length": 20,
+        "lr": 0.0003,
+        "lambda": 0.95,
+        "gamma": 0.99,
+        "sgd_minibatch_size": 256,
+        "train_batch_size": 4000,
+        "num_sgd_iter": 20,
+        "rollout_fragment_length": 200,
+        "clip_param": 0.2,
        # Multi-agent setup for the particular env.
        "multiagent": {
            "policies": policies,
            "policy_mapping_fn": policy_mapping_fn,
        },
+        "model": {
+            "fcnet_hiddens": [512, 512],
+        },
        "framework": "tf",
+        "no_done_at_end": True,
+        # If no executable is provided (use Unity3D editor), do not evaluate,
+        # b/c the editor only allows one connection at a time.
+        "evaluation_interval": 10 if args.file_name else 0,
+        "evaluation_num_episodes": 1,
    }

    stop = {
@@ -86,7 +122,13 @@ if __name__ == "__main__":
    }

    # Run the experiment.
-    results = tune.run("PPO", config=config, stop=stop, verbose=1)
+    results = tune.run(
+        "PPO",
+        config=config,
+        stop=stop,
+        verbose=1,
+        checkpoint_freq=10,
+        restore=args.from_checkpoint)

    # And check the results.
    if args.as_test:
@@ -1,5 +1,5 @@
 repeat-after-me-ppo-w-lstm:
-    env: "ray.rllib.examples.env.repeat_after_me_env.RepeatAfterMeEnv"
+    env: ray.rllib.examples.env.repeat_after_me_env.RepeatAfterMeEnv
    run: PPO
    stop:
        episode_reward_mean: 50
@@ -0,0 +1,46 @@
+# NOTE: This example will not run w/o a proper config.multiagent setup,
+#       which currently cannot be done in yaml.
+
+# This setup should learn a decent (not perfect) policy within 100k timesteps
+# on a single GPU machine (16 CPUs) using 10 workers (collecting data from
+# 10 compiled game binaries in parallel).
+# Reported rewards will be the sum of both strikers' rewards (+1 if goal)
+# plus the goalie's reward (-1 if goal) across all within-scene parallelized
+# playing fields (8 fields, each with 2 strikers + 1 goalie, for the soccer
+# env).
+unity3d-soccer-strikers-vs-goalie-ppo:
+    env: ray.rllib.env.unity3d_env.Unity3DEnv
+    run: PPO
+    stop:
+        timesteps_total: 1000000
+    config:
+        # NOTE: This example will not run w/o the following multiagent setup:
+        # Multi-agent setup for SoccerStrikersVsGoalie Unity3D Env.
+        # multiagent:
+        #    policies: [policies list]
+        #    policy_mapping_fn: [agent-to-policy mapping function]
+
+        # Works for both torch and tf.
+        framework: tf
+        env_config:
+          # Put the path to your compiled game executable here.
+          file_name: /home/ubuntu/soccer_strikers_vs_goalie_linux.x86_64
+          # Timesteps after which a hard-reset will happen (all agents).
+          episode_horizon: 3000
+        lr: 0.0003
+        lambda: 0.95
+        gamma: 0.99
+        sgd_minibatch_size: 256
+        train_batch_size: 4000
+        clip_param: 0.2
+        # 10 workers collect data from 10 compiled game binaries in parallel.
+        # When running in the editor instead, use just one Worker (we only
+        # have one Unity running)!
+        num_workers: 10
+        num_sgd_iter: 20
+        rollout_fragment_length: 200
+        no_done_at_end: true
+        model:
+          fcnet_hiddens: [512, 512]
+        # If no executable is provided (use Unity3D editor), do not evaluate,
+        # b/c the editor only allows one connection at a time.
+        evaluation_interval: 0
+        evaluation_num_episodes: 1