[RLlib] Unity3d soccer benchmarks (#8834)

This commit is contained in:
Sven Mika
2020-06-11 14:29:57 +02:00
committed by GitHub
parent 9166e22085
commit a90cd0fcbb
8 changed files with 162 additions and 51 deletions
+9 -1
View File
@@ -333,7 +333,15 @@ PPO's clipped objective supports multiple SGD passes over the same batch of expe
PPO architecture
Tuned examples: `Humanoid-v1 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/humanoid-ppo-gae.yaml>`__, `Hopper-v1 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/hopper-ppo.yaml>`__, `Pendulum-v0 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/pendulum-ppo.yaml>`__, `PongDeterministic-v4 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/pong-ppo.yaml>`__, `Walker2d-v1 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/walker2d-ppo.yaml>`__, `HalfCheetah-v2 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/halfcheetah-ppo.yaml>`__, `{BeamRider,Breakout,Qbert,SpaceInvaders}NoFrameskip-v4 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/atari-ppo.yaml>`__
Tuned examples:
`Unity3D Soccer (multi-agent: Strikers vs Goalie) <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/unity3d-soccer-strikers-vs-goalie-ppo.yaml>`__,
`Humanoid-v1 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/humanoid-ppo-gae.yaml>`__,
`Hopper-v1 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/hopper-ppo.yaml>`__,
`Pendulum-v0 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/pendulum-ppo.yaml>`__,
`PongDeterministic-v4 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/pong-ppo.yaml>`__,
`Walker2d-v1 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/walker2d-ppo.yaml>`__,
`HalfCheetah-v2 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/halfcheetah-ppo.yaml>`__,
`{BeamRider,Breakout,Qbert,SpaceInvaders}NoFrameskip-v4 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/atari-ppo.yaml>`__
**Atari results**: `more details <https://github.com/ray-project/rl-experiments>`__
+9 -15
View File
@@ -958,24 +958,18 @@ class Trainer(Trainable):
@staticmethod
def _validate_config(config: dict):
if "policy_graphs" in config["multiagent"]:
logger.warning(
"The `policy_graphs` config has been renamed to `policies`.")
# Backwards compatibility
config["multiagent"]["policies"] = config["multiagent"][
"policy_graphs"]
del config["multiagent"]["policy_graphs"]
deprecation_warning("policy_graphs", "policies")
# Backwards compatibility.
config["multiagent"]["policies"] = config["multiagent"].pop(
"policy_graphs")
if "gpu" in config:
raise ValueError(
"The `gpu` config is deprecated, please use `num_gpus=0|1` "
"instead.")
deprecation_warning("gpu", "num_gpus=0|1", error=True)
if "gpu_fraction" in config:
raise ValueError(
"The `gpu_fraction` config is deprecated, please use "
"`num_gpus=<fraction>` instead.")
deprecation_warning(
"gpu_fraction", "num_gpus=<fraction>", error=True)
if "use_gpu_for_workers" in config:
raise ValueError(
"The `use_gpu_for_workers` config is deprecated, please use "
"`num_gpus_per_worker=1` instead.")
deprecation_warning(
"use_gpu_for_workers", "num_gpus_per_worker=1", error=True)
if type(config["input_evaluation"]) != list:
raise ValueError(
"`input_evaluation` must be a list of strings, got {}".format(
+4 -5
View File
@@ -14,11 +14,10 @@ DEFAULT_CONFIG_LINEAR = {
class LinearDiscreteEnv(gym.Env):
"""Samples data from linearly parameterized arms.
The reward for context X and arm i is given by X^T * theta_i, for some
latent set of parameters {theta_i : i = 1, ..., k}. The thetas are sampled
uniformly at random, the contexts are Gaussian, and Gaussian noise is
added to the rewards.
The reward for context X and arm i is given by X^T * theta_i, for some
latent set of parameters {theta_i : i = 1, ..., k}.
The thetas are sampled uniformly at random, the contexts are Gaussian,
and Gaussian noise is added to the rewards.
"""
def __init__(self, config=None):
+35 -14
View File
@@ -13,7 +13,10 @@ logger = logging.getLogger(__name__)
class Unity3DEnv(MultiAgentEnv):
"""A MultiAgentEnv representing a single Unity3D game instance.
For an example on how to use this class inside a Unity game client, which
For an example on how to use this Env with a running Unity3D editor
or with a compiled game, see:
`rllib/examples/unity3d_env_local.py`
For an example on how to use it inside a Unity game client, which
connects to an RLlib Policy server, see:
`rllib/examples/serving/unity3d_[client|server].py`
@@ -191,42 +194,60 @@ class Unity3DEnv(MultiAgentEnv):
# The RLlib server must know about the Spaces that the Client will be
# using inside Unity3D, up-front.
obs_spaces = {
# 3DBall.
"3DBall": Box(float("-inf"), float("inf"), (8, )),
# 3DBallHard.
"3DBallHard": Box(float("-inf"), float("inf"), (45, )),
# SoccerStrikersVsGoalie.
"Goalie": Box(float("-inf"), float("inf"), (738, )),
"Striker": Tuple([
Box(float("-inf"), float("inf"), (231, )),
Box(float("-inf"), float("inf"), (63, )),
]),
"Goalie": Box(float("-inf"), float("inf"), (738, )),
# 3DBall.
"Agent": Box(float("-inf"), float("inf"), (8, )),
# Tennis.
"Tennis": Box(float("-inf"), float("inf"), (27, )),
# VisualHallway.
"VisualHallway": Box(float("-inf"), float("inf"), (84, 84, 3)),
# Walker.
"Walker": Box(float("-inf"), float("inf"), (212, )),
}
action_spaces = {
# SoccerStrikersVsGoalie.
"Striker": MultiDiscrete([3, 3, 3]),
"Goalie": MultiDiscrete([3, 3, 3]),
# 3DBall.
"Agent": Box(float("-inf"), float("inf"), (2, ), dtype=np.float32),
"3DBall": Box(
float("-inf"), float("inf"), (2, ), dtype=np.float32),
# 3DBallHard.
"3DBallHard": Box(
float("-inf"), float("inf"), (2, ), dtype=np.float32),
# SoccerStrikersVsGoalie.
"Goalie": MultiDiscrete([3, 3, 3]),
"Striker": MultiDiscrete([3, 3, 3]),
# Tennis.
"Tennis": Box(float("-inf"), float("inf"), (3, )),
# VisualHallway.
"VisualHallway": MultiDiscrete([5]),
# Walker.
"Walker": Box(float("-inf"), float("inf"), (39, )),
}
# Policies (Unity: "behaviors") and agent-to-policy mapping fns.
if game_name == "SoccerStrikersVsGoalie":
policies = {
"Striker": (None, obs_spaces["Striker"],
action_spaces["Striker"], {}),
"Goalie": (None, obs_spaces["Goalie"], action_spaces["Goalie"],
{}),
"Striker": (None, obs_spaces["Striker"],
action_spaces["Striker"], {}),
}
def policy_mapping_fn(agent_id):
return "Striker" if "Striker" in agent_id else "Goalie"
else: # 3DBall
else:
policies = {
"Agent": (None, obs_spaces["Agent"], action_spaces["Agent"],
{})
game_name: (None, obs_spaces[game_name],
action_spaces[game_name], {}),
}
def policy_mapping_fn(agent_id):
return "Agent"
return game_name
return policies, policy_mapping_fn
+3 -2
View File
@@ -7,7 +7,8 @@ from ray.rllib.evaluation.rollout_worker import RolloutWorker, \
_validate_multiagent_config
from ray.rllib.offline import NoopOutput, JsonReader, MixedInput, JsonWriter, \
ShuffledInput
from ray.rllib.utils import merge_dicts, try_import_tf
from ray.rllib.utils import merge_dicts
from ray.rllib.utils.framework import try_import_tf
tf = try_import_tf()
@@ -226,7 +227,7 @@ class WorkerSet:
else:
input_evaluation = config["input_evaluation"]
# Fill in the default policy if 'None' is specified in multiagent
# Fill in the default policy if 'None' is specified in multiagent.
if config["multiagent"]["policies"]:
tmp = config["multiagent"]["policies"]
_validate_multiagent_config(tmp, allow_none_graph=True)
+55 -13
View File
@@ -10,7 +10,8 @@ To run this script against a local Unity3D engine:
2) Open the Unity3D Editor and load an example scene from the following
ml-agents pip package location:
`.../ml-agents/Project/Assets/ML-Agents/Examples/`
This script supports the `3DBall` and `SoccerStrikersVsGoalie` examples.
This script supports the `3DBall`, `3DBallHard`, `SoccerStrikersVsGoalie`,
`Tennis`, and `Walker` examples.
Specify the game you chose on your command line via e.g. `--env 3DBall`.
Feel free to add more supported examples here.
@@ -31,13 +32,31 @@ parser.add_argument(
"--env",
type=str,
default="3DBall",
choices=["3DBall", "SoccerStrikersVsGoalie"],
help="The name of the Env to run in the Unity3D editor. Either `3DBall` "
"or `SoccerStrikersVsGoalie` (feel free to add more to this script!)")
choices=[
"3DBall", "3DBallHard", "SoccerStrikersVsGoalie", "Tennis",
"VisualHallway", "Walker"
],
help="The name of the Env to run in the Unity3D editor: `3DBall(Hard)?|"
"SoccerStrikersVsGoalie|Tennis|VisualHallway|Walker` (feel free to add "
"more and PR!)")
parser.add_argument(
"--file-name",
type=str,
default=None,
help="The Unity3d binary (compiled) game, e.g. "
"'/home/ubuntu/soccer_strikers_vs_goalie_linux.x86_64'. Use `None` for "
"a currently running Unity3D editor.")
parser.add_argument(
"--from-checkpoint",
type=str,
default=None,
help="Full path to a checkpoint file for restoring a previously saved "
"Trainer state.")
parser.add_argument("--num-workers", type=int, default=0)
parser.add_argument("--as-test", action="store_true")
parser.add_argument("--stop-iters", type=int, default=150)
parser.add_argument("--stop-iters", type=int, default=9999)
parser.add_argument("--stop-reward", type=float, default=9999.0)
parser.add_argument("--stop-timesteps", type=int, default=100000)
parser.add_argument("--stop-timesteps", type=int, default=10000000)
parser.add_argument(
"--horizon",
type=int,
@@ -53,7 +72,9 @@ if __name__ == "__main__":
tune.register_env(
"unity3d",
lambda c: Unity3DEnv(episode_horizon=c.get("episode_horizon", 1000)))
lambda c: Unity3DEnv(
file_name=c["file_name"],
episode_horizon=c["episode_horizon"]))
# Get policies (different agent types; "behaviors" in MLAgents) and
# the mappings from individual agents to Policies.
@@ -63,20 +84,35 @@ if __name__ == "__main__":
config = {
"env": "unity3d",
"env_config": {
"file_name": args.file_name,
"episode_horizon": args.horizon,
},
# IMPORTANT: Just use one Worker (we only have one Unity running)!
"num_workers": 0,
# For running in editor, force to use just one Worker (we only have
# one Unity running)!
"num_workers": args.num_workers if args.file_name else 0,
# Other settings.
"sample_batch_size": 64,
"train_batch_size": 256,
"rollout_fragment_length": 20,
"lr": 0.0003,
"lambda": 0.95,
"gamma": 0.99,
"sgd_minibatch_size": 256,
"train_batch_size": 4000,
"num_sgd_iter": 20,
"rollout_fragment_length": 200,
"clip_param": 0.2,
# Multi-agent setup for the particular env.
"multiagent": {
"policies": policies,
"policy_mapping_fn": policy_mapping_fn,
},
"model": {
"fcnet_hiddens": [512, 512],
},
"framework": "tf",
"no_done_at_end": True,
# If no executable is provided (use Unity3D editor), do not evaluate,
# b/c the editor only allows one connection at a time.
"evaluation_interval": 10 if args.file_name else 0,
"evaluation_num_episodes": 1,
}
stop = {
@@ -86,7 +122,13 @@ if __name__ == "__main__":
}
# Run the experiment.
results = tune.run("PPO", config=config, stop=stop, verbose=1)
results = tune.run(
"PPO",
config=config,
stop=stop,
verbose=1,
checkpoint_freq=10,
restore=args.from_checkpoint)
# And check the results.
if args.as_test:
@@ -1,5 +1,5 @@
repeat-after-me-ppo-w-lstm:
env: "ray.rllib.examples.env.repeat_after_me_env.RepeatAfterMeEnv"
env: ray.rllib.examples.env.repeat_after_me_env.RepeatAfterMeEnv
run: PPO
stop:
episode_reward_mean: 50
@@ -0,0 +1,46 @@
# NOTE: This example will not run w/o a proper config.multiagent setup,
# which currently cannot be done in yaml.
# This setup should learn a decent (not perfect) policy within 100k timesteps
# on a single GPU machine (16 CPUs) using 10 workers (collecting data from
# 10 compiled game binaries in parallel).
# Reported rewards will be the sum of both strikers' rewards (+1 if goal)
# plus the goalie's reward (-1 if goal), across all within-scene
# parallelized playing fields (8 fields, each with 2 strikers + 1 goalie,
# for the soccer env).
unity3d-soccer-strikers-vs-goalie-ppo:
env: ray.rllib.env.unity3d_env.Unity3DEnv
run: PPO
stop:
timesteps_total: 1000000
config:
# NOTE: This example will not run w/o the following multiagent setup:
# Multi-agent setup for SoccerStrikersVsGoalie Unity3D Env.
# multiagent:
# policies: [policies list]
# policy_mapping_fn: [agent-to-policy mapping function]
# Works for both torch and tf.
framework: tf
env_config:
# Put the path to your compiled game executable here.
file_name: /home/ubuntu/soccer_strikers_vs_goalie_linux.x86_64
# Timesteps after which a hard-reset will happen (all agents).
episode_horizon: 3000
lr: 0.0003
lambda: 0.95
gamma: 0.99
sgd_minibatch_size: 256
train_batch_size: 4000
clip_param: 0.2
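# Number of rollout workers, each collecting data from its own compiled
# game binary. For running in the editor, just use one Worker (i.e. set
# this to 0), because we only have one Unity running!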
num_workers: 10
num_sgd_iter: 20
rollout_fragment_length: 200
no_done_at_end: true
model:
fcnet_hiddens: [512, 512]
# If no executable is provided (use Unity3D editor), do not evaluate,
# b/c the editor only allows one connection at a time.
evaluation_interval: 0
evaluation_num_episodes: 1