mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 09:45:24 +08:00
[rllib] Add examples page, add hierarchical training example, delete SC2 examples (#3815)
* wip * lint * wip * up * wip * update examples * wip * remove carla * update * improve envspec * link to custom * Update rllib-env.rst * update * fix * fn * lint * ds * ssd games * desc * fix up docs * fix
This commit is contained in:
@@ -130,7 +130,7 @@ COMMON_CONFIG = {
|
||||
# Drop metric batches from unresponsive workers after this many seconds
|
||||
"collect_metrics_timeout": 180,
|
||||
|
||||
# === Offline Data Input / Output ===
|
||||
# === Offline Datasets ===
|
||||
# __sphinx_doc_input_begin__
|
||||
# Specify how to generate experiences:
|
||||
# - "sampler": generate experiences via online simulation (default)
|
||||
|
||||
@@ -1,4 +1,11 @@
|
||||
"""Example of a custom gym environment. Run this for a demo."""
|
||||
"""Example of a custom gym environment. Run this for a demo.
|
||||
|
||||
This example shows:
|
||||
- using a custom environment
|
||||
- using Tune for grid search
|
||||
|
||||
You can visualize experiment results in ~/ray_results using TensorBoard.
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
@@ -7,10 +14,9 @@ from __future__ import print_function
|
||||
import numpy as np
|
||||
import gym
|
||||
from gym.spaces import Discrete, Box
|
||||
from gym.envs.registration import EnvSpec
|
||||
|
||||
import ray
|
||||
from ray.tune import run_experiments
|
||||
from ray.tune import run_experiments, grid_search
|
||||
|
||||
|
||||
class SimpleCorridor(gym.Env):
|
||||
@@ -24,7 +30,6 @@ class SimpleCorridor(gym.Env):
|
||||
self.action_space = Discrete(2)
|
||||
self.observation_space = Box(
|
||||
0.0, self.end_pos, shape=(1, ), dtype=np.float32)
|
||||
self._spec = EnvSpec("SimpleCorridor-{}-v0".format(self.end_pos))
|
||||
|
||||
def reset(self):
|
||||
self.cur_pos = 0
|
||||
@@ -48,7 +53,12 @@ if __name__ == "__main__":
|
||||
"demo": {
|
||||
"run": "PPO",
|
||||
"env": SimpleCorridor, # or "corridor" if registered above
|
||||
"stop": {
|
||||
"timesteps_total": 10000,
|
||||
},
|
||||
"config": {
|
||||
"lr": grid_search([1e-2, 1e-4, 1e-6]), # try different lrs
|
||||
"num_workers": 1, # parallelism
|
||||
"env_config": {
|
||||
"corridor_length": 5,
|
||||
},
|
||||
|
||||
@@ -0,0 +1,54 @@
|
||||
"""Example of a custom training workflow. Run this for a demo.
|
||||
|
||||
This example shows:
|
||||
- using Tune trainable functions to implement custom training workflows
|
||||
|
||||
You can visualize experiment results in ~/ray_results using TensorBoard.
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import ray
|
||||
from ray.rllib.agents.ppo import PPOAgent
|
||||
from ray.tune import run_experiments
|
||||
|
||||
|
||||
def my_train_fn(config, reporter):
    """Run a two-phase PPO training workflow on CartPole-v0.

    Phase 1 trains a PPO agent with the learning rate given in ``config``.
    Phase 2 restores the phase-1 checkpoint into a fresh agent and keeps
    training with a lower learning rate (0.0001).

    Args:
        config: Agent configuration dict. It is copied before the learning
            rate is overridden, so the caller's dict is left unmodified.
        reporter: Callable invoked with each training result expanded as
            keyword arguments.
    """
    # Phase 1: train for 10 iterations with the (high) LR from the config
    agent1 = PPOAgent(env="CartPole-v0", config=config)
    for _ in range(10):
        result = agent1.train()
        result["phase"] = 1
        reporter(**result)
        phase1_time = result["timesteps_total"]
    state = agent1.save()
    agent1.stop()

    # Phase 2: train for 10 more iterations with a low LR, starting from
    # the phase-1 checkpoint
    phase2_config = dict(config)
    phase2_config["lr"] = 0.0001
    agent2 = PPOAgent(env="CartPole-v0", config=phase2_config)
    agent2.restore(state)
    for _ in range(10):
        result = agent2.train()
        result["phase"] = 2
        result["timesteps_total"] += phase1_time  # keep time moving forward
        reporter(**result)
    agent2.stop()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    ray.init()

    # Single trial that runs the custom training function on one CPU.
    demo_experiment = {
        "run": my_train_fn,
        "resources_per_trial": {"cpu": 1},
        "config": {"lr": 0.01, "num_workers": 0},
    }
    run_experiments({"demo": demo_experiment})
|
||||
@@ -0,0 +1,233 @@
|
||||
"""Example of hierarchical training using the multi-agent API.
|
||||
|
||||
The example env is that of a "windy maze". The agent observes the current wind
|
||||
direction and can either choose to stand still, or move in that direction.
|
||||
|
||||
You can try out the env directly with:
|
||||
|
||||
$ python hierarchical_training.py --flat
|
||||
|
||||
A simple hierarchical formulation involves a high-level agent that issues goals
|
||||
(i.e., go north / south / east / west), and a low-level agent that executes
|
||||
these goals over a number of time-steps. This can be implemented as a
|
||||
multi-agent environment with a top-level agent and low-level agents spawned
|
||||
for each higher-level action. The lower level agent is rewarded for moving
|
||||
in the right direction.
|
||||
|
||||
You can try this formulation with:
|
||||
|
||||
$ python hierarchical_training.py # gets ~100 rew after ~100k timesteps
|
||||
|
||||
Note that the hierarchical formulation actually converges slightly slower than
|
||||
using --flat in this example.
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import argparse
|
||||
import random
|
||||
import gym
|
||||
from gym.spaces import Box, Discrete, Tuple
|
||||
import logging
|
||||
|
||||
import ray
|
||||
from ray.tune import run_experiments, function
|
||||
from ray.rllib.env import MultiAgentEnv
|
||||
from ray.rllib.agents.ppo import PPOAgent
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--flat", action="store_true")
|
||||
|
||||
# Agent has to traverse the maze from the starting position S -> F
|
||||
# Observation space [x_pos, y_pos, wind_direction]
|
||||
# Action space: stay still OR move in current wind direction
|
||||
# Maze layout: "#" is a wall, "S" the start cell, "F" the goal cell and " " a
# free cell. Every row is 9 characters wide so the grid is rectangular.
MAP_DATA = """
#########
#S      #
####### #
      # #
      # #
####### #
#F      #
#########"""
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class WindyMazeEnv(gym.Env):
    """Single-agent "windy maze" environment.

    The maze is parsed from the module-level ``MAP_DATA`` string. At every
    step the agent observes its position and the current wind direction and
    chooses whether to stay put (any action other than 1) or move one cell
    in the wind direction (action 1). The episode ends when the agent
    reaches the ``F`` cell (reward 100) or after 200 steps.
    """

    def __init__(self, env_config):
        """Parse the map and define the observation and action spaces.

        Args:
            env_config: Environment configuration; not read by this env.
        """
        self.map = [m for m in MAP_DATA.split("\n") if m]
        self.x_dim = len(self.map)
        self.y_dim = len(self.map[0])
        logger.info("Loaded map {} {}".format(self.x_dim, self.y_dim))
        for x in range(self.x_dim):
            for y in range(self.y_dim):
                if self.map[x][y] == "S":
                    self.start_pos = (x, y)
                elif self.map[x][y] == "F":
                    self.end_pos = (x, y)
        logger.info("Start pos {} end pos {}".format(self.start_pos,
                                                     self.end_pos))
        self.observation_space = Tuple([
            Box(0, 100, shape=(2, )),  # (x, y)
            Discrete(4),  # wind direction (N, E, S, W)
        ])
        self.action_space = Discrete(2)  # whether to move or not

    def reset(self):
        """Start a new episode and return the initial observation.

        Returns:
            ``[[x, y], wind_direction]`` for the start position.
        """
        self.wind_direction = random.choice([0, 1, 2, 3])
        self.pos = self.start_pos
        self.num_steps = 0
        return [[self.pos[0], self.pos[1]], self.wind_direction]

    def step(self, action):
        """Advance the environment by one step.

        Args:
            action: 1 to move one cell in the current wind direction; any
                other value leaves the agent where it is.

        Returns:
            Tuple ``(obs, reward, done, info)`` where ``obs`` is
            ``[[x, y], wind_direction]``, ``reward`` is 100 when the goal is
            reached and 0 otherwise, and ``info`` is an empty dict.
        """
        if action == 1:
            self.pos = self._get_new_pos(self.pos, self.wind_direction)
        self.num_steps += 1
        # The wind is re-sampled every step, independent of the action.
        self.wind_direction = random.choice([0, 1, 2, 3])
        at_goal = self.pos == self.end_pos
        done = at_goal or self.num_steps >= 200
        return ([[self.pos[0], self.pos[1]], self.wind_direction],
                100 * int(at_goal), done, {})

    def _get_new_pos(self, pos, direction):
        """Return the position reached by moving one cell in ``direction``.

        Args:
            pos: Current ``(x, y)`` position.
            direction: Direction index: 0 decrements x, 1 increments y,
                2 increments x, 3 decrements y.

        Returns:
            The new position, or ``pos`` unchanged when the target cell is
            outside the map or is a wall (``#``).

        Raises:
            ValueError: If ``direction`` is not 0, 1, 2 or 3.
        """
        if direction == 0:
            new_pos = (pos[0] - 1, pos[1])
        elif direction == 1:
            new_pos = (pos[0], pos[1] + 1)
        elif direction == 2:
            new_pos = (pos[0] + 1, pos[1])
        elif direction == 3:
            new_pos = (pos[0], pos[1] - 1)
        else:
            # Fail with a clear message instead of an UnboundLocalError on
            # ``new_pos`` further down.
            raise ValueError(
                "direction must be 0, 1, 2 or 3, got {!r}".format(direction))
        if (new_pos[0] >= 0 and new_pos[0] < self.x_dim and new_pos[1] >= 0
                and new_pos[1] < self.y_dim
                and self.map[new_pos[0]][new_pos[1]] != "#"):
            return new_pos
        else:
            return pos  # did not move
|
||||
|
||||
|
||||
class HierarchicalWindyMazeEnv(MultiAgentEnv):
    """Hierarchical multi-agent wrapper around ``WindyMazeEnv``.

    A single "high_level_agent" picks a goal, and a freshly named low-level
    agent ("low_level_<n>") then acts in the underlying flat env for up to
    25 steps. Control returns to the high-level agent when that step budget
    is used up or the flat episode ends.
    """

    def __init__(self, env_config):
        """Create the wrapped flat environment.

        Args:
            env_config: Configuration passed through to ``WindyMazeEnv``.
        """
        self.flat_env = WindyMazeEnv(env_config)

    def reset(self):
        """Reset the flat env and return the high-level agent's observation.

        Returns:
            Dict mapping ``"high_level_agent"`` to the flat env observation.
        """
        self.cur_obs = self.flat_env.reset()
        self.current_goal = None
        self.steps_remaining_at_level = None
        self.num_high_level_steps = 0
        # current low level agent id. This must be unique for each high level
        # step since agent ids cannot be reused.
        self.low_level_agent_id = "low_level_{}".format(
            self.num_high_level_steps)
        return {
            "high_level_agent": self.cur_obs,
        }

    def step(self, action_dict):
        """Dispatch the single pending action to the matching level.

        Args:
            action_dict: Dict with exactly one entry, keyed either by
                ``"high_level_agent"`` or by the current low-level agent id.

        Returns:
            Tuple ``(obs, rew, done, info)`` of per-agent dicts.
        """
        assert len(action_dict) == 1, action_dict
        if "high_level_agent" in action_dict:
            return self._high_level_step(action_dict["high_level_agent"])
        else:
            return self._low_level_step(list(action_dict.values())[0])

    def _high_level_step(self, action):
        """Record the high-level goal and hand control to a new low-level agent.

        Args:
            action: Goal chosen by the high-level agent; it is passed to
                ``WindyMazeEnv._get_new_pos`` as a direction.

        Returns:
            Tuple ``(obs, rew, done, info)`` addressed to the new low-level
            agent, with zero reward and the episode not done.
        """
        logger.debug("High level agent sets goal {}".format(action))
        self.current_goal = action
        self.steps_remaining_at_level = 25
        self.num_high_level_steps += 1
        self.low_level_agent_id = "low_level_{}".format(
            self.num_high_level_steps)
        obs = {self.low_level_agent_id: [self.cur_obs, self.current_goal]}
        rew = {self.low_level_agent_id: 0}
        done = {"__all__": False}
        return obs, rew, done, {}

    def _low_level_step(self, action):
        """Apply a low-level action in the flat env and compute rewards.

        The low-level agent gets +1 for moving to the cell that lies in the
        goal direction from its previous position, -1 for moving anywhere
        else, and 0 for not moving.

        Args:
            action: Action forwarded to ``WindyMazeEnv.step``.

        Returns:
            Tuple ``(obs, rew, done, info)``. When the flat episode ends or
            the low-level step budget is used up, the dicts also carry an
            entry for ``"high_level_agent"``.
        """
        logger.debug("Low level agent step {}".format(action))
        self.steps_remaining_at_level -= 1
        cur_pos = tuple(self.cur_obs[0])
        goal_pos = self.flat_env._get_new_pos(cur_pos, self.current_goal)

        # Step in the actual env
        f_obs, f_rew, f_done, _ = self.flat_env.step(action)
        new_pos = tuple(f_obs[0])
        self.cur_obs = f_obs

        # Calculate low-level agent observation and reward
        obs = {self.low_level_agent_id: [f_obs, self.current_goal]}
        if new_pos != cur_pos:
            if new_pos == goal_pos:
                rew = {self.low_level_agent_id: 1}
            else:
                rew = {self.low_level_agent_id: -1}
        else:
            rew = {self.low_level_agent_id: 0}

        # Handle env termination & transitions back to higher level
        done = {"__all__": False}
        if f_done:
            done["__all__"] = True
            logger.debug("high level final reward {}".format(f_rew))
            rew["high_level_agent"] = f_rew
            obs["high_level_agent"] = f_obs
        elif self.steps_remaining_at_level == 0:
            done[self.low_level_agent_id] = True
            rew["high_level_agent"] = 0
            obs["high_level_agent"] = f_obs

        return obs, rew, done, {}
|
||||
|
||||
|
||||
if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()

    if not args.flat:
        # Hierarchical formulation: one high-level policy plus one shared
        # low-level policy for every spawned low-level agent.
        maze = WindyMazeEnv(None)

        def policy_mapping_fn(agent_id):
            """Map an agent id to the name of the policy that controls it."""
            if agent_id.startswith("low_level_"):
                return "low_level_policy"
            return "high_level_policy"

        high_level_spec = (
            PPOAgent._policy_graph,
            maze.observation_space,
            Discrete(4),
            {"gamma": 0.9},
        )
        # The low-level policy observes the flat observation plus the goal.
        low_level_spec = (
            PPOAgent._policy_graph,
            Tuple([maze.observation_space, Discrete(4)]),
            maze.action_space,
            {"gamma": 0.0},
        )
        run_experiments({
            "maze_hier": {
                "run": "PPO",
                "env": HierarchicalWindyMazeEnv,
                "config": {
                    "num_workers": 0,
                    "log_level": "INFO",
                    "entropy_coeff": 0.01,
                    "multiagent": {
                        "policy_graphs": {
                            "high_level_policy": high_level_spec,
                            "low_level_policy": low_level_spec,
                        },
                        "policy_mapping_fn": function(policy_mapping_fn),
                    },
                },
            },
        })
    else:
        # Flat formulation: plain single-agent PPO on the maze.
        flat_experiment = {
            "run": "PPO",
            "env": WindyMazeEnv,
            "config": {"num_workers": 0},
        }
        run_experiments({"maze_single": flat_experiment})
|
||||
@@ -1,18 +0,0 @@
|
||||
StarCraft on RLlib
|
||||
==================
|
||||
|
||||
This builds off the StarCraft env in https://github.com/oxwhirl/pymarl_alpha.
|
||||
|
||||
Temporary instructions
|
||||
----------------------
|
||||
|
||||
To install, run
|
||||
|
||||
```
|
||||
git clone https://github.com/oxwhirl/pymarl_alpha
|
||||
mv pymarl_alpha ~/pymarl
|
||||
cd ~/pymarl
|
||||
install_sc1.sh
|
||||
install_sc2.sh
|
||||
export PYMARL_PATH="$HOME/pymarl"
|
||||
```
|
||||
@@ -1,32 +0,0 @@
|
||||
## Adapted from `https://github.com/oxwhirl/pymarl_alpha`.
|
||||
|
||||
env: sc2
|
||||
|
||||
env_args:
|
||||
map_name: "3m_3m" # SC2 map name
|
||||
difficulty: "7" # Very hard
|
||||
move_amount: 2 # How much units are ordered to move per step
|
||||
step_mul: 8 # How many frames are skipped per step
|
||||
reward_sparse: False # Only +1/-1 reward for win/defeat (the rest of reward configs are ignored if True)
|
||||
reward_only_positive: True # Reward is always positive
|
||||
reward_negative_scale: 0.5 # How much to scale negative rewards, ignored if reward_only_positive=True
|
||||
reward_death_value: 10 # Reward for killing an enemy unit and penalty for having an allied unit killed (if reward_only_positive=False)
|
||||
reward_scale: True # Whether or not to scale rewards before returning to agents
|
||||
reward_scale_rate: 20 # If reward_scale=True, the agents receive the reward of (max_reward / reward_scale_rate), where max_reward is the maximum possible reward per episode
|
||||
reward_win: 200 # Reward for win
|
||||
reward_defeat: 0 # Reward for defeat (should be nonpositive)
|
||||
state_last_action: True # Whether the last actions of units is a part of the state
|
||||
obs_instead_of_state: False # Use combination of all agents' observations as state
|
||||
obs_own_health: True # Whether agents receive their own health as a part of observation
|
||||
obs_all_health: True # Whether agents receive the health of all units (in the sight range) as a part of observation
|
||||
continuing_episode: False # Stop/continue episode after its termination
|
||||
game_version: "4.1.2" # Ignored for Mac/Windows
|
||||
save_replay_prefix: "" # Prefix of the replay to be saved
|
||||
heuristic: False # Whether or not to use a simple non-learning heuristic as a controller
|
||||
|
||||
test_nepisode: 32
|
||||
test_interval: 10000
|
||||
log_interval: 2000
|
||||
runner_log_interval: 2000
|
||||
learner_log_interval: 2000
|
||||
t_max: 2000000
|
||||
@@ -1,153 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import argparse
|
||||
from gym.spaces import Discrete, Box, Dict, Tuple
|
||||
import os
|
||||
import sys
|
||||
import tensorflow as tf
|
||||
import tensorflow.contrib.slim as slim
|
||||
import yaml
|
||||
|
||||
import ray
|
||||
from ray.rllib.env.multi_agent_env import MultiAgentEnv
|
||||
from ray.tune.registry import register_env
|
||||
from ray.rllib.models import Model, ModelCatalog
|
||||
from ray.rllib.models.misc import normc_initializer
|
||||
from ray.rllib.agents.qmix import QMixAgent
|
||||
from ray.rllib.agents.pg import PGAgent
|
||||
from ray.rllib.agents.ppo import PPOAgent
|
||||
from ray.tune.logger import pretty_print
|
||||
|
||||
|
||||
class MaskedActionsModel(Model):
    """Fully connected model that masks out unavailable actions.

    Reads ``"obs"`` and ``"action_mask"`` entries from the observation dict
    and adds the clamped log of the mask to the action logits.
    """

    def _build_layers_v2(self, input_dict, num_outputs, options):
        """Build the network and return ``(masked_logits, last_layer)``.

        Raises:
            ValueError: If ``num_outputs`` differs from the second dimension
                of the action mask.
        """
        obs_dict = input_dict["obs"]
        action_mask = obs_dict["action_mask"]
        mask_width = action_mask.shape[1].value
        if num_outputs != mask_width:
            raise ValueError(
                "This model assumes num outputs is equal to max avail actions",
                num_outputs, action_mask)

        # Standard FC net component: two tanh hidden layers of 256 units.
        last_layer = obs_dict["obs"]
        for layer_idx, layer_size in enumerate([256, 256]):
            last_layer = slim.fully_connected(
                last_layer,
                layer_size,
                weights_initializer=normc_initializer(1.0),
                activation_fn=tf.nn.tanh,
                scope="fc{}".format(layer_idx))
        action_logits = slim.fully_connected(
            last_layer,
            num_outputs,
            weights_initializer=normc_initializer(0.01),
            activation_fn=None,
            scope="fc_out")

        # Mask out invalid actions (use tf.float32.min for stability)
        inf_mask = tf.maximum(tf.log(action_mask), tf.float32.min)
        return inf_mask + action_logits, last_layer
|
||||
|
||||
|
||||
class SC2MultiAgentEnv(MultiAgentEnv):
    """RLlib Wrapper around StarCraft2."""

    def __init__(self, override_cfg):
        """Create the wrapped StarCraft2 env.

        Args:
            override_cfg: Dict that must contain ``"pymarl_path"``. All
                other entries override the settings loaded from the
                ``sc2.yaml`` file next to this module.

        Raises:
            KeyError: If ``"pymarl_path"`` is missing from ``override_cfg``.
        """
        # Work on a copy so that popping "pymarl_path" does not mutate the
        # caller's dict (which would break a second env built from it).
        override_cfg = dict(override_cfg)
        pymarl_path = override_cfg.pop("pymarl_path")
        os.environ["SC2PATH"] = os.path.join(pymarl_path,
                                             "3rdparty/StarCraftII")
        sys.path.append(os.path.join(pymarl_path, "src"))
        # Imported here because it only becomes importable after the
        # sys.path change above.
        from envs.starcraft2 import StarCraft2Env
        curpath = os.path.dirname(os.path.abspath(__file__))
        with open(os.path.join(curpath, "sc2.yaml")) as f:
            # safe_load: the config holds plain scalars and mappings only,
            # and yaml.load() without an explicit Loader is deprecated.
            pymarl_args = yaml.safe_load(f)
        pymarl_args.update(override_cfg)
        pymarl_args["env_args"].setdefault("seed", 0)

        self._starcraft_env = StarCraft2Env(**pymarl_args)
        obs_size = self._starcraft_env.get_obs_size()
        num_actions = self._starcraft_env.get_total_actions()
        self.observation_space = Dict({
            "action_mask": Box(0, 1, shape=(num_actions, )),
            "obs": Box(-1, 1, shape=(obs_size, ))
        })
        self.action_space = Discrete(num_actions)

    def _build_obs(self, obs_list):
        """Pair each agent's observation with its available-action mask.

        Args:
            obs_list: Per-agent observations, indexed by agent number.

        Returns:
            Dict mapping agent index to ``{"action_mask": ..., "obs": ...}``.
        """
        return {
            i: {
                "action_mask": self._starcraft_env.get_avail_agent_actions(i),
                "obs": obs
            }
            for i, obs in enumerate(obs_list)
        }

    def reset(self):
        """Reset the wrapped env and return the per-agent observations."""
        obs_list, _ = self._starcraft_env.reset()
        return self._build_obs(obs_list)

    def step(self, action_dict):
        """Step the wrapped env with one action per agent.

        Args:
            action_dict: Dict mapping agent index to action. Actions are
                passed to the wrapped env in sorted key order.

        Returns:
            Tuple ``(obs, rewards, dones, infos)`` of per-agent dicts. The
            single reward from the wrapped env is split evenly across the
            agents, and ``dones`` also carries the ``"__all__"`` key.
        """
        # TODO(rliaw): Check to handle missing agents, if any
        actions = [action_dict[k] for k in sorted(action_dict)]
        rew, done, info = self._starcraft_env.step(actions)
        obs_list = self._starcraft_env.get_obs()
        return_obs = self._build_obs(obs_list)
        num_agents = len(obs_list)
        rews = {i: rew / num_agents for i in range(num_agents)}
        dones = {i: done for i in range(num_agents)}
        dones["__all__"] = done
        infos = {i: info for i in range(num_agents)}
        return return_obs, rews, dones, infos
|
||||
|
||||
|
||||
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--num-iters", type=int, default=100)
    parser.add_argument("--run", type=str, default="qmix")
    args = parser.parse_args()

    # Reject unknown algorithms up front; otherwise ``agent`` would never be
    # assigned and the training loop would fail with a NameError.
    run_algo = args.run.lower()
    if run_algo not in ("qmix", "pg", "ppo"):
        parser.error(
            "--run must be one of qmix, pg, ppo (got {!r})".format(args.run))

    # Expand "~" in the environment value as well: a quoted
    # PYMARL_PATH="~/pymarl" is not expanded by the shell.
    path_to_pymarl = os.path.expanduser(
        os.environ.get("PYMARL_PATH", "~/pymarl/"))

    ray.init()
    ModelCatalog.register_custom_model("mask_model", MaskedActionsModel)

    register_env("starcraft", lambda cfg: SC2MultiAgentEnv(cfg))
    agent_cfg = {
        "observation_filter": "NoFilter",
        "num_workers": 4,
        "model": {
            "custom_model": "mask_model",
        },
        "env_config": {
            "pymarl_path": path_to_pymarl
        }
    }
    if run_algo == "qmix":

        def grouped_sc2(cfg):
            """Build the SC2 env with all agents placed in one group."""
            env = SC2MultiAgentEnv(cfg)
            agent_list = list(range(env._starcraft_env.n_agents))
            grouping = {
                "group_1": agent_list,
            }
            obs_space = Tuple([env.observation_space for _ in agent_list])
            act_space = Tuple([env.action_space for _ in agent_list])
            return env.with_agent_groups(
                grouping, obs_space=obs_space, act_space=act_space)

        register_env("grouped_starcraft", grouped_sc2)
        agent = QMixAgent(env="grouped_starcraft", config=agent_cfg)
    elif run_algo == "pg":
        agent = PGAgent(env="starcraft", config=agent_cfg)
    else:  # "ppo"
        agent_cfg.update({"vf_share_layers": True})
        agent = PPOAgent(env="starcraft", config=agent_cfg)

    for _ in range(args.num_iters):
        print(pretty_print(agent.train()))
|
||||
Reference in New Issue
Block a user