From a90cd0fcbb81c2be472b16c61bebc75125d16133 Mon Sep 17 00:00:00 2001 From: Sven Mika Date: Thu, 11 Jun 2020 14:29:57 +0200 Subject: [PATCH] [RLlib] Unity3d soccer benchmarks (#8834) --- doc/source/rllib-algorithms.rst | 10 ++- rllib/agents/trainer.py | 24 +++---- rllib/contrib/bandits/envs/discrete.py | 9 ++- rllib/env/unity3d_env.py | 49 +++++++++---- rllib/evaluation/worker_set.py | 5 +- rllib/examples/unity3d_env_local.py | 68 +++++++++++++++---- .../ppo/repeatafterme-ppo-lstm.yaml | 2 +- ...unity3d-soccer-strikers-vs-goalie-ppo.yaml | 46 +++++++++++++ 8 files changed, 162 insertions(+), 51 deletions(-) create mode 100644 rllib/tuned_examples/ppo/unity3d-soccer-strikers-vs-goalie-ppo.yaml diff --git a/doc/source/rllib-algorithms.rst b/doc/source/rllib-algorithms.rst index a47399122..b4880cfc2 100644 --- a/doc/source/rllib-algorithms.rst +++ b/doc/source/rllib-algorithms.rst @@ -333,7 +333,15 @@ PPO's clipped objective supports multiple SGD passes over the same batch of expe PPO architecture -Tuned examples: `Humanoid-v1 `__, `Hopper-v1 `__, `Pendulum-v0 `__, `PongDeterministic-v4 `__, `Walker2d-v1 `__, `HalfCheetah-v2 `__, `{BeamRider,Breakout,Qbert,SpaceInvaders}NoFrameskip-v4 `__ +Tuned examples: +`Unity3D Soccer (multi-agent: Strikers vs Goalie) `__, +`Humanoid-v1 `__, +`Hopper-v1 `__, +`Pendulum-v0 `__, +`PongDeterministic-v4 `__, +`Walker2d-v1 `__, +`HalfCheetah-v2 `__, +`{BeamRider,Breakout,Qbert,SpaceInvaders}NoFrameskip-v4 `__ **Atari results**: `more details `__ diff --git a/rllib/agents/trainer.py b/rllib/agents/trainer.py index 467ff8bff..b578f3ec4 100644 --- a/rllib/agents/trainer.py +++ b/rllib/agents/trainer.py @@ -958,24 +958,18 @@ class Trainer(Trainable): @staticmethod def _validate_config(config: dict): if "policy_graphs" in config["multiagent"]: - logger.warning( - "The `policy_graphs` config has been renamed to `policies`.") - # Backwards compatibility - config["multiagent"]["policies"] = config["multiagent"][ - "policy_graphs"] - del 
config["multiagent"]["policy_graphs"] + deprecation_warning("policy_graphs", "policies") + # Backwards compatibility. + config["multiagent"]["policies"] = config["multiagent"].pop( + "policy_graphs") if "gpu" in config: - raise ValueError( - "The `gpu` config is deprecated, please use `num_gpus=0|1` " - "instead.") + deprecation_warning("gpu", "num_gpus=0|1", error=True) if "gpu_fraction" in config: - raise ValueError( - "The `gpu_fraction` config is deprecated, please use " - "`num_gpus=` instead.") + deprecation_warning( + "gpu_fraction", "num_gpus=", error=True) if "use_gpu_for_workers" in config: - raise ValueError( - "The `use_gpu_for_workers` config is deprecated, please use " - "`num_gpus_per_worker=1` instead.") + deprecation_warning( + "use_gpu_for_workers", "num_gpus_per_worker=1", error=True) if type(config["input_evaluation"]) != list: raise ValueError( "`input_evaluation` must be a list of strings, got {}".format( diff --git a/rllib/contrib/bandits/envs/discrete.py b/rllib/contrib/bandits/envs/discrete.py index 5342df3b3..540bd46ff 100644 --- a/rllib/contrib/bandits/envs/discrete.py +++ b/rllib/contrib/bandits/envs/discrete.py @@ -14,11 +14,10 @@ DEFAULT_CONFIG_LINEAR = { class LinearDiscreteEnv(gym.Env): """Samples data from linearly parameterized arms. - The reward for context X and arm i is given by X^T * theta_i, for some - latent set of parameters {theta_i : i = 1, ..., k}. The thetas are sampled - uniformly at random, the contexts are Gaussian, and Gaussian noise is - added to the rewards. - + The reward for context X and arm i is given by X^T * theta_i, for some + latent set of parameters {theta_i : i = 1, ..., k}. + The thetas are sampled uniformly at random, the contexts are Gaussian, + and Gaussian noise is added to the rewards. 
""" def __init__(self, config=None): diff --git a/rllib/env/unity3d_env.py b/rllib/env/unity3d_env.py index c66a6c14c..66c7337d0 100644 --- a/rllib/env/unity3d_env.py +++ b/rllib/env/unity3d_env.py @@ -13,7 +13,10 @@ logger = logging.getLogger(__name__) class Unity3DEnv(MultiAgentEnv): """A MultiAgentEnv representing a single Unity3D game instance. - For an example on how to use this class inside a Unity game client, which + For an example on how to use this Env with a running Unity3D editor + or with a compiled game, see: + `rllib/examples/unity3d_env_local.py` + For an example on how to use it inside a Unity game client, which connects to an RLlib Policy server, see: `rllib/examples/serving/unity3d_[client|server].py` @@ -191,42 +194,60 @@ class Unity3DEnv(MultiAgentEnv): # The RLlib server must know about the Spaces that the Client will be # using inside Unity3D, up-front. obs_spaces = { + # 3DBall. + "3DBall": Box(float("-inf"), float("inf"), (8, )), + # 3DBallHard. + "3DBallHard": Box(float("-inf"), float("inf"), (45, )), # SoccerStrikersVsGoalie. + "Goalie": Box(float("-inf"), float("inf"), (738, )), "Striker": Tuple([ Box(float("-inf"), float("inf"), (231, )), Box(float("-inf"), float("inf"), (63, )), ]), - "Goalie": Box(float("-inf"), float("inf"), (738, )), - # 3DBall. - "Agent": Box(float("-inf"), float("inf"), (8, )), + # Tennis. + "Tennis": Box(float("-inf"), float("inf"), (27, )), + # VisualHallway. + "VisualHallway": Box(float("-inf"), float("inf"), (84, 84, 3)), + # Walker. + "Walker": Box(float("-inf"), float("inf"), (212, )), } action_spaces = { - # SoccerStrikersVsGoalie. - "Striker": MultiDiscrete([3, 3, 3]), - "Goalie": MultiDiscrete([3, 3, 3]), # 3DBall. - "Agent": Box(float("-inf"), float("inf"), (2, ), dtype=np.float32), + "3DBall": Box( + float("-inf"), float("inf"), (2, ), dtype=np.float32), + # 3DBallHard. + "3DBallHard": Box( + float("-inf"), float("inf"), (2, ), dtype=np.float32), + # SoccerStrikersVsGoalie. 
+ "Goalie": MultiDiscrete([3, 3, 3]), + "Striker": MultiDiscrete([3, 3, 3]), + # Tennis. + "Tennis": Box(float("-inf"), float("inf"), (3, )), + # VisualHallway. + "VisualHallway": MultiDiscrete([5]), + # Walker. + "Walker": Box(float("-inf"), float("inf"), (39, )), } # Policies (Unity: "behaviors") and agent-to-policy mapping fns. if game_name == "SoccerStrikersVsGoalie": policies = { - "Striker": (None, obs_spaces["Striker"], - action_spaces["Striker"], {}), "Goalie": (None, obs_spaces["Goalie"], action_spaces["Goalie"], {}), + "Striker": (None, obs_spaces["Striker"], + action_spaces["Striker"], {}), } def policy_mapping_fn(agent_id): return "Striker" if "Striker" in agent_id else "Goalie" - else: # 3DBall + else: policies = { - "Agent": (None, obs_spaces["Agent"], action_spaces["Agent"], - {}) + game_name: (None, obs_spaces[game_name], + action_spaces[game_name], {}), } def policy_mapping_fn(agent_id): - return "Agent" + return game_name return policies, policy_mapping_fn diff --git a/rllib/evaluation/worker_set.py b/rllib/evaluation/worker_set.py index 01198d5a1..5e52a0fc1 100644 --- a/rllib/evaluation/worker_set.py +++ b/rllib/evaluation/worker_set.py @@ -7,7 +7,8 @@ from ray.rllib.evaluation.rollout_worker import RolloutWorker, \ _validate_multiagent_config from ray.rllib.offline import NoopOutput, JsonReader, MixedInput, JsonWriter, \ ShuffledInput -from ray.rllib.utils import merge_dicts, try_import_tf +from ray.rllib.utils import merge_dicts +from ray.rllib.utils.framework import try_import_tf tf = try_import_tf() @@ -226,7 +227,7 @@ class WorkerSet: else: input_evaluation = config["input_evaluation"] - # Fill in the default policy if 'None' is specified in multiagent + # Fill in the default policy if 'None' is specified in multiagent. 
if config["multiagent"]["policies"]: tmp = config["multiagent"]["policies"] _validate_multiagent_config(tmp, allow_none_graph=True) diff --git a/rllib/examples/unity3d_env_local.py b/rllib/examples/unity3d_env_local.py index 380eb527e..83b5bbb04 100644 --- a/rllib/examples/unity3d_env_local.py +++ b/rllib/examples/unity3d_env_local.py @@ -10,7 +10,8 @@ To run this script against a local Unity3D engine: 2) Open the Unity3D Editor and load an example scene from the following ml-agents pip package location: `.../ml-agents/Project/Assets/ML-Agents/Examples/` - This script supports the `3DBall` and `SoccerStrikersVsGoalie` examples. + This script supports the `3DBall`, `3DBallHard`, `SoccerStrikersVsGoalie`, + `Tennis`, and `Walker` examples. Specify the game you chose on your command line via e.g. `--env 3DBall`. Feel free to add more supported examples here. @@ -31,13 +32,31 @@ parser.add_argument( "--env", type=str, default="3DBall", - choices=["3DBall", "SoccerStrikersVsGoalie"], - help="The name of the Env to run in the Unity3D editor. Either `3DBall` " - "or `SoccerStrikersVsGoalie` (feel free to add more to this script!)") + choices=[ + "3DBall", "3DBallHard", "SoccerStrikersVsGoalie", "Tennis", + "VisualHallway", "Walker" + ], + help="The name of the Env to run in the Unity3D editor: `3DBall(Hard)?|" + "SoccerStrikersVsGoalie|Tennis|VisualHallway|Walker` (feel free to add " + "more and PR!)") +parser.add_argument( + "--file-name", + type=str, + default=None, + help="The Unity3d binary (compiled) game, e.g. " + "'/home/ubuntu/soccer_strikers_vs_goalie_linux.x86_64'. 
Use `None` for " + "a currently running Unity3D editor.") +parser.add_argument( + "--from-checkpoint", + type=str, + default=None, + help="Full path to a checkpoint file for restoring a previously saved " + "Trainer state.") +parser.add_argument("--num-workers", type=int, default=0) parser.add_argument("--as-test", action="store_true") -parser.add_argument("--stop-iters", type=int, default=150) +parser.add_argument("--stop-iters", type=int, default=9999) parser.add_argument("--stop-reward", type=float, default=9999.0) -parser.add_argument("--stop-timesteps", type=int, default=100000) +parser.add_argument("--stop-timesteps", type=int, default=10000000) parser.add_argument( "--horizon", type=int, @@ -53,7 +72,9 @@ if __name__ == "__main__": tune.register_env( "unity3d", - lambda c: Unity3DEnv(episode_horizon=c.get("episode_horizon", 1000))) + lambda c: Unity3DEnv( + file_name=c["file_name"], + episode_horizon=c["episode_horizon"])) # Get policies (different agent types; "behaviors" in MLAgents) and # the mappings from individual agents to Policies. @@ -63,20 +84,35 @@ if __name__ == "__main__": config = { "env": "unity3d", "env_config": { + "file_name": args.file_name, "episode_horizon": args.horizon, }, - # IMPORTANT: Just use one Worker (we only have one Unity running)! - "num_workers": 0, + # For running in editor, force to use just one Worker (we only have + # one Unity running)! + "num_workers": args.num_workers if args.file_name else 0, # Other settings. - "sample_batch_size": 64, - "train_batch_size": 256, - "rollout_fragment_length": 20, + "lr": 0.0003, + "lambda": 0.95, + "gamma": 0.99, + "sgd_minibatch_size": 256, + "train_batch_size": 4000, + "num_sgd_iter": 20, + "rollout_fragment_length": 200, + "clip_param": 0.2, # Multi-agent setup for the particular env. 
"multiagent": { "policies": policies, "policy_mapping_fn": policy_mapping_fn, }, + "model": { + "fcnet_hiddens": [512, 512], + }, "framework": "tf", + "no_done_at_end": True, + # If no executable is provided (use Unity3D editor), do not evaluate, + # b/c the editor only allows one connection at a time. + "evaluation_interval": 10 if args.file_name else 0, + "evaluation_num_episodes": 1, } stop = { @@ -86,7 +122,13 @@ if __name__ == "__main__": } # Run the experiment. - results = tune.run("PPO", config=config, stop=stop, verbose=1) + results = tune.run( + "PPO", + config=config, + stop=stop, + verbose=1, + checkpoint_freq=10, + restore=args.from_checkpoint) # And check the results. if args.as_test: diff --git a/rllib/tuned_examples/ppo/repeatafterme-ppo-lstm.yaml b/rllib/tuned_examples/ppo/repeatafterme-ppo-lstm.yaml index e737cb09d..700e420d7 100644 --- a/rllib/tuned_examples/ppo/repeatafterme-ppo-lstm.yaml +++ b/rllib/tuned_examples/ppo/repeatafterme-ppo-lstm.yaml @@ -1,5 +1,5 @@ repeat-after-me-ppo-w-lstm: - env: "ray.rllib.examples.env.repeat_after_me_env.RepeatAfterMeEnv" + env: ray.rllib.examples.env.repeat_after_me_env.RepeatAfterMeEnv run: PPO stop: episode_reward_mean: 50 diff --git a/rllib/tuned_examples/ppo/unity3d-soccer-strikers-vs-goalie-ppo.yaml b/rllib/tuned_examples/ppo/unity3d-soccer-strikers-vs-goalie-ppo.yaml new file mode 100644 index 000000000..5a089f5a6 --- /dev/null +++ b/rllib/tuned_examples/ppo/unity3d-soccer-strikers-vs-goalie-ppo.yaml @@ -0,0 +1,46 @@ +# NOTE: This example will not run w/o a proper config.multiagent setup, +# which currently cannot be done in yaml. + +# This setup should learn a decent (not perfect) policy within 100k timesteps +# on a single GPU machine (16 CPUS) using 10 workers (collecting data from +# 10 compiled game binaries in parallel). 
+# Reported rewards will be the sum of both strikers (+1 if goal) plus the +# goalie's reward (-1 if goal) across all within-scene parallelized playing +# fields (8 fields, each with 2 strikers + 1 goalie, for the soccer env). +unity3d-soccer-strikers-vs-goalie-ppo: + env: ray.rllib.env.unity3d_env.Unity3DEnv + run: PPO + stop: + timesteps_total: 1000000 + config: + # NOTE: This example will not run w/o the following multiagent setup: + # Multi-agent setup for SoccerStrikersVsGoalie Unity3D Env. + # multiagent: + # policies: [policies list] + # policy_mapping_fn: [agent-to-policy mapping function] + + # Works for both torch and tf. + framework: tf + env_config: + # Put the path to your compiled game executable here. + file_name: /home/ubuntu/soccer_strikers_vs_goalie_linux.x86_64 + # Timesteps after which a hard-reset will happen (all agents). + episode_horizon: 3000 + lr: 0.0003 + lambda: 0.95 + gamma: 0.99 + sgd_minibatch_size: 256 + train_batch_size: 4000 + clip_param: 0.2 + # One worker per compiled game binary (10 in parallel). When running + # in the editor instead, use `num_workers: 0` (only one Unity instance). + num_workers: 10 + num_sgd_iter: 20 + rollout_fragment_length: 200 + no_done_at_end: true + model: + fcnet_hiddens: [512, 512] + # If no executable is provided (use Unity3D editor), do not evaluate, + # b/c the editor only allows one connection at a time. + evaluation_interval: 0 + evaluation_num_episodes: 1