mirror of
https://github.com/wassname/ray.git
synced 2026-07-03 12:03:10 +08:00
[RLlib] Unity3d soccer benchmarks (#8834)
This commit is contained in:
@@ -333,7 +333,15 @@ PPO's clipped objective supports multiple SGD passes over the same batch of expe
|
||||
|
||||
PPO architecture
|
||||
|
||||
Tuned examples: `Humanoid-v1 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/humanoid-ppo-gae.yaml>`__, `Hopper-v1 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/hopper-ppo.yaml>`__, `Pendulum-v0 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/pendulum-ppo.yaml>`__, `PongDeterministic-v4 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/pong-ppo.yaml>`__, `Walker2d-v1 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/walker2d-ppo.yaml>`__, `HalfCheetah-v2 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/halfcheetah-ppo.yaml>`__, `{BeamRider,Breakout,Qbert,SpaceInvaders}NoFrameskip-v4 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/atari-ppo.yaml>`__
|
||||
Tuned examples:
|
||||
`Unity3D Soccer (multi-agent: Strikers vs Goalie) <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/unity3d-soccer-strikers-vs-goalie-ppo.yaml>`__,
|
||||
`Humanoid-v1 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/humanoid-ppo-gae.yaml>`__,
|
||||
`Hopper-v1 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/hopper-ppo.yaml>`__,
|
||||
`Pendulum-v0 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/pendulum-ppo.yaml>`__,
|
||||
`PongDeterministic-v4 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/pong-ppo.yaml>`__,
|
||||
`Walker2d-v1 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/walker2d-ppo.yaml>`__,
|
||||
`HalfCheetah-v2 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/halfcheetah-ppo.yaml>`__,
|
||||
`{BeamRider,Breakout,Qbert,SpaceInvaders}NoFrameskip-v4 <https://github.com/ray-project/ray/blob/master/rllib/tuned_examples/ppo/atari-ppo.yaml>`__
|
||||
|
||||
|
||||
**Atari results**: `more details <https://github.com/ray-project/rl-experiments>`__
|
||||
|
||||
+9
-15
@@ -958,24 +958,18 @@ class Trainer(Trainable):
|
||||
@staticmethod
|
||||
def _validate_config(config: dict):
|
||||
if "policy_graphs" in config["multiagent"]:
|
||||
logger.warning(
|
||||
"The `policy_graphs` config has been renamed to `policies`.")
|
||||
# Backwards compatibility
|
||||
config["multiagent"]["policies"] = config["multiagent"][
|
||||
"policy_graphs"]
|
||||
del config["multiagent"]["policy_graphs"]
|
||||
deprecation_warning("policy_graphs", "policies")
|
||||
# Backwards compatibility.
|
||||
config["multiagent"]["policies"] = config["multiagent"].pop(
|
||||
"policy_graphs")
|
||||
if "gpu" in config:
|
||||
raise ValueError(
|
||||
"The `gpu` config is deprecated, please use `num_gpus=0|1` "
|
||||
"instead.")
|
||||
deprecation_warning("gpu", "num_gpus=0|1", error=True)
|
||||
if "gpu_fraction" in config:
|
||||
raise ValueError(
|
||||
"The `gpu_fraction` config is deprecated, please use "
|
||||
"`num_gpus=<fraction>` instead.")
|
||||
deprecation_warning(
|
||||
"gpu_fraction", "num_gpus=<fraction>", error=True)
|
||||
if "use_gpu_for_workers" in config:
|
||||
raise ValueError(
|
||||
"The `use_gpu_for_workers` config is deprecated, please use "
|
||||
"`num_gpus_per_worker=1` instead.")
|
||||
deprecation_warning(
|
||||
"use_gpu_for_workers", "num_gpus_per_worker=1", error=True)
|
||||
if type(config["input_evaluation"]) != list:
|
||||
raise ValueError(
|
||||
"`input_evaluation` must be a list of strings, got {}".format(
|
||||
|
||||
@@ -14,11 +14,10 @@ DEFAULT_CONFIG_LINEAR = {
|
||||
class LinearDiscreteEnv(gym.Env):
|
||||
"""Samples data from linearly parameterized arms.
|
||||
|
||||
The reward for context X and arm i is given by X^T * theta_i, for some
|
||||
latent set of parameters {theta_i : i = 1, ..., k}. The thetas are sampled
|
||||
uniformly at random, the contexts are Gaussian, and Gaussian noise is
|
||||
added to the rewards.
|
||||
|
||||
The reward for context X and arm i is given by X^T * theta_i, for some
|
||||
latent set of parameters {theta_i : i = 1, ..., k}.
|
||||
The thetas are sampled uniformly at random, the contexts are Gaussian,
|
||||
and Gaussian noise is added to the rewards.
|
||||
"""
|
||||
|
||||
def __init__(self, config=None):
|
||||
|
||||
Vendored
+35
-14
@@ -13,7 +13,10 @@ logger = logging.getLogger(__name__)
|
||||
class Unity3DEnv(MultiAgentEnv):
|
||||
"""A MultiAgentEnv representing a single Unity3D game instance.
|
||||
|
||||
For an example on how to use this class inside a Unity game client, which
|
||||
For an example on how to use this Env with a running Unity3D editor
|
||||
or with a compiled game, see:
|
||||
`rllib/examples/unity3d_env_local.py`
|
||||
For an example on how to use it inside a Unity game client, which
|
||||
connects to an RLlib Policy server, see:
|
||||
`rllib/examples/serving/unity3d_[client|server].py`
|
||||
|
||||
@@ -191,42 +194,60 @@ class Unity3DEnv(MultiAgentEnv):
|
||||
# The RLlib server must know about the Spaces that the Client will be
|
||||
# using inside Unity3D, up-front.
|
||||
obs_spaces = {
|
||||
# 3DBall.
|
||||
"3DBall": Box(float("-inf"), float("inf"), (8, )),
|
||||
# 3DBallHard.
|
||||
"3DBallHard": Box(float("-inf"), float("inf"), (45, )),
|
||||
# SoccerStrikersVsGoalie.
|
||||
"Goalie": Box(float("-inf"), float("inf"), (738, )),
|
||||
"Striker": Tuple([
|
||||
Box(float("-inf"), float("inf"), (231, )),
|
||||
Box(float("-inf"), float("inf"), (63, )),
|
||||
]),
|
||||
"Goalie": Box(float("-inf"), float("inf"), (738, )),
|
||||
# 3DBall.
|
||||
"Agent": Box(float("-inf"), float("inf"), (8, )),
|
||||
# Tennis.
|
||||
"Tennis": Box(float("-inf"), float("inf"), (27, )),
|
||||
# VisualHallway.
|
||||
"VisualHallway": Box(float("-inf"), float("inf"), (84, 84, 3)),
|
||||
# Walker.
|
||||
"Walker": Box(float("-inf"), float("inf"), (212, )),
|
||||
}
|
||||
action_spaces = {
|
||||
# SoccerStrikersVsGoalie.
|
||||
"Striker": MultiDiscrete([3, 3, 3]),
|
||||
"Goalie": MultiDiscrete([3, 3, 3]),
|
||||
# 3DBall.
|
||||
"Agent": Box(float("-inf"), float("inf"), (2, ), dtype=np.float32),
|
||||
"3DBall": Box(
|
||||
float("-inf"), float("inf"), (2, ), dtype=np.float32),
|
||||
# 3DBallHard.
|
||||
"3DBallHard": Box(
|
||||
float("-inf"), float("inf"), (2, ), dtype=np.float32),
|
||||
# SoccerStrikersVsGoalie.
|
||||
"Goalie": MultiDiscrete([3, 3, 3]),
|
||||
"Striker": MultiDiscrete([3, 3, 3]),
|
||||
# Tennis.
|
||||
"Tennis": Box(float("-inf"), float("inf"), (3, )),
|
||||
# VisualHallway.
|
||||
"VisualHallway": MultiDiscrete([5]),
|
||||
# Walker.
|
||||
"Walker": Box(float("-inf"), float("inf"), (39, )),
|
||||
}
|
||||
|
||||
# Policies (Unity: "behaviors") and agent-to-policy mapping fns.
|
||||
if game_name == "SoccerStrikersVsGoalie":
|
||||
policies = {
|
||||
"Striker": (None, obs_spaces["Striker"],
|
||||
action_spaces["Striker"], {}),
|
||||
"Goalie": (None, obs_spaces["Goalie"], action_spaces["Goalie"],
|
||||
{}),
|
||||
"Striker": (None, obs_spaces["Striker"],
|
||||
action_spaces["Striker"], {}),
|
||||
}
|
||||
|
||||
def policy_mapping_fn(agent_id):
|
||||
return "Striker" if "Striker" in agent_id else "Goalie"
|
||||
|
||||
else: # 3DBall
|
||||
else:
|
||||
policies = {
|
||||
"Agent": (None, obs_spaces["Agent"], action_spaces["Agent"],
|
||||
{})
|
||||
game_name: (None, obs_spaces[game_name],
|
||||
action_spaces[game_name], {}),
|
||||
}
|
||||
|
||||
def policy_mapping_fn(agent_id):
|
||||
return "Agent"
|
||||
return game_name
|
||||
|
||||
return policies, policy_mapping_fn
|
||||
|
||||
@@ -7,7 +7,8 @@ from ray.rllib.evaluation.rollout_worker import RolloutWorker, \
|
||||
_validate_multiagent_config
|
||||
from ray.rllib.offline import NoopOutput, JsonReader, MixedInput, JsonWriter, \
|
||||
ShuffledInput
|
||||
from ray.rllib.utils import merge_dicts, try_import_tf
|
||||
from ray.rllib.utils import merge_dicts
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
|
||||
@@ -226,7 +227,7 @@ class WorkerSet:
|
||||
else:
|
||||
input_evaluation = config["input_evaluation"]
|
||||
|
||||
# Fill in the default policy if 'None' is specified in multiagent
|
||||
# Fill in the default policy if 'None' is specified in multiagent.
|
||||
if config["multiagent"]["policies"]:
|
||||
tmp = config["multiagent"]["policies"]
|
||||
_validate_multiagent_config(tmp, allow_none_graph=True)
|
||||
|
||||
@@ -10,7 +10,8 @@ To run this script against a local Unity3D engine:
|
||||
2) Open the Unity3D Editor and load an example scene from the following
|
||||
ml-agents pip package location:
|
||||
`.../ml-agents/Project/Assets/ML-Agents/Examples/`
|
||||
This script supports the `3DBall` and `SoccerStrikersVsGoalie` examples.
|
||||
This script supports the `3DBall`, `3DBallHard`, `SoccerStrikersVsGoalie`,
|
||||
`Tennis`, `VisualHallway`, and `Walker` examples.
|
||||
Specify the game you chose on your command line via e.g. `--env 3DBall`.
|
||||
Feel free to add more supported examples here.
|
||||
|
||||
@@ -31,13 +32,31 @@ parser.add_argument(
|
||||
"--env",
|
||||
type=str,
|
||||
default="3DBall",
|
||||
choices=["3DBall", "SoccerStrikersVsGoalie"],
|
||||
help="The name of the Env to run in the Unity3D editor. Either `3DBall` "
|
||||
"or `SoccerStrikersVsGoalie` (feel free to add more to this script!)")
|
||||
choices=[
|
||||
"3DBall", "3DBallHard", "SoccerStrikersVsGoalie", "Tennis",
|
||||
"VisualHallway", "Walker"
|
||||
],
|
||||
help="The name of the Env to run in the Unity3D editor: `3DBall(Hard)?|"
|
||||
"SoccerStrikersVsGoalie|Tennis|VisualHallway|Walker` (feel free to add "
|
||||
"more and PR!)")
|
||||
parser.add_argument(
|
||||
"--file-name",
|
||||
type=str,
|
||||
default=None,
|
||||
help="The Unity3d binary (compiled) game, e.g. "
|
||||
"'/home/ubuntu/soccer_strikers_vs_goalie_linux.x86_64'. Use `None` for "
|
||||
"a currently running Unity3D editor.")
|
||||
parser.add_argument(
|
||||
"--from-checkpoint",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Full path to a checkpoint file for restoring a previously saved "
|
||||
"Trainer state.")
|
||||
parser.add_argument("--num-workers", type=int, default=0)
|
||||
parser.add_argument("--as-test", action="store_true")
|
||||
parser.add_argument("--stop-iters", type=int, default=150)
|
||||
parser.add_argument("--stop-iters", type=int, default=9999)
|
||||
parser.add_argument("--stop-reward", type=float, default=9999.0)
|
||||
parser.add_argument("--stop-timesteps", type=int, default=100000)
|
||||
parser.add_argument("--stop-timesteps", type=int, default=10000000)
|
||||
parser.add_argument(
|
||||
"--horizon",
|
||||
type=int,
|
||||
@@ -53,7 +72,9 @@ if __name__ == "__main__":
|
||||
|
||||
tune.register_env(
|
||||
"unity3d",
|
||||
lambda c: Unity3DEnv(episode_horizon=c.get("episode_horizon", 1000)))
|
||||
lambda c: Unity3DEnv(
|
||||
file_name=c["file_name"],
|
||||
episode_horizon=c["episode_horizon"]))
|
||||
|
||||
# Get policies (different agent types; "behaviors" in MLAgents) and
|
||||
# the mappings from individual agents to Policies.
|
||||
@@ -63,20 +84,35 @@ if __name__ == "__main__":
|
||||
config = {
|
||||
"env": "unity3d",
|
||||
"env_config": {
|
||||
"file_name": args.file_name,
|
||||
"episode_horizon": args.horizon,
|
||||
},
|
||||
# IMPORTANT: Just use one Worker (we only have one Unity running)!
|
||||
"num_workers": 0,
|
||||
# For running in editor, force to use just one Worker (we only have
|
||||
# one Unity running)!
|
||||
"num_workers": args.num_workers if args.file_name else 0,
|
||||
# Other settings.
|
||||
"sample_batch_size": 64,
|
||||
"train_batch_size": 256,
|
||||
"rollout_fragment_length": 20,
|
||||
"lr": 0.0003,
|
||||
"lambda": 0.95,
|
||||
"gamma": 0.99,
|
||||
"sgd_minibatch_size": 256,
|
||||
"train_batch_size": 4000,
|
||||
"num_sgd_iter": 20,
|
||||
"rollout_fragment_length": 200,
|
||||
"clip_param": 0.2,
|
||||
# Multi-agent setup for the particular env.
|
||||
"multiagent": {
|
||||
"policies": policies,
|
||||
"policy_mapping_fn": policy_mapping_fn,
|
||||
},
|
||||
"model": {
|
||||
"fcnet_hiddens": [512, 512],
|
||||
},
|
||||
"framework": "tf",
|
||||
"no_done_at_end": True,
|
||||
# If no executable is provided (use Unity3D editor), do not evaluate,
|
||||
# b/c the editor only allows one connection at a time.
|
||||
"evaluation_interval": 10 if args.file_name else 0,
|
||||
"evaluation_num_episodes": 1,
|
||||
}
|
||||
|
||||
stop = {
|
||||
@@ -86,7 +122,13 @@ if __name__ == "__main__":
|
||||
}
|
||||
|
||||
# Run the experiment.
|
||||
results = tune.run("PPO", config=config, stop=stop, verbose=1)
|
||||
results = tune.run(
|
||||
"PPO",
|
||||
config=config,
|
||||
stop=stop,
|
||||
verbose=1,
|
||||
checkpoint_freq=10,
|
||||
restore=args.from_checkpoint)
|
||||
|
||||
# And check the results.
|
||||
if args.as_test:
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
repeat-after-me-ppo-w-lstm:
|
||||
env: "ray.rllib.examples.env.repeat_after_me_env.RepeatAfterMeEnv"
|
||||
env: ray.rllib.examples.env.repeat_after_me_env.RepeatAfterMeEnv
|
||||
run: PPO
|
||||
stop:
|
||||
episode_reward_mean: 50
|
||||
|
||||
@@ -0,0 +1,46 @@
|
||||
# NOTE: This example will not run w/o a proper config.multiagent setup,
|
||||
# which currently cannot be done in yaml.
|
||||
|
||||
# This setup should learn a decent (not perfect) policy within 100k timesteps
|
||||
# on a single GPU machine (16 CPUs) using 10 workers (collecting data from
|
||||
# 10 compiled game binaries in parallel).
|
||||
# Reported rewards will be the sum of both strikers (+1 if goal) plus the
|
||||
# goalie's reward (-1 if goal) across all within-scene parallelized playing
|
||||
# fields (8 fields, each with 2 strikers + 1 goalie, for the soccer env).
|
||||
unity3d-soccer-strikers-vs-goalie-ppo:
|
||||
env: ray.rllib.env.unity3d_env.Unity3DEnv
|
||||
run: PPO
|
||||
stop:
|
||||
timesteps_total: 1000000
|
||||
config:
|
||||
# NOTE: This example will not run w/o the following multiagent setup:
|
||||
# Multi-agent setup for SoccerStrikersVsGoalie Unity3D Env.
|
||||
# multiagent:
|
||||
# policies: [policies list]
|
||||
# policy_mapping_fn: [agent-to-policy mapping function]
|
||||
|
||||
# Works for both torch and tf.
|
||||
framework: tf
|
||||
env_config:
|
||||
# Put the path to your compiled game executable here.
|
||||
file_name: /home/ubuntu/soccer_strikers_vs_goalie_linux.x86_64
|
||||
# Timesteps after which a hard-reset will happen (all agents).
|
||||
episode_horizon: 3000
|
||||
lr: 0.0003
|
||||
lambda: 0.95
|
||||
gamma: 0.99
|
||||
sgd_minibatch_size: 256
|
||||
train_batch_size: 4000
|
||||
clip_param: 0.2
|
||||
# One compiled game binary runs per worker. For running in editor, use
|
||||
# `num_workers: 0` instead (we only have one Unity running)!
|
||||
num_workers: 10
|
||||
num_sgd_iter: 20
|
||||
rollout_fragment_length: 200
|
||||
no_done_at_end: true
|
||||
model:
|
||||
fcnet_hiddens: [512, 512]
|
||||
# If no executable is provided (use Unity3D editor), do not evaluate,
|
||||
# b/c the editor only allows one connection at a time.
|
||||
evaluation_interval: 0
|
||||
evaluation_num_episodes: 1
|
||||
Reference in New Issue
Block a user