[rllib] Add examples page, add hierarchical training example, delete SC2 examples (#3815)

* wip * lint * wip * up * wip * update examples * wip * remove carla * update * improve envspec * link to custom * Update rllib-env.rst * update * fix * fn * lint * ds * ssd games * desc * fix up docs * fix
2026-06-28 09:45:24 +08:00 · 2019-01-29 21:06:09 -08:00
parent c9819a721d
commit fb73cedf70
13 changed files with 396 additions and 210 deletions
@@ -130,7 +130,7 @@ COMMON_CONFIG = {
    # Drop metric batches from unresponsive workers after this many seconds
    "collect_metrics_timeout": 180,

-    # === Offline Data Input / Output ===
+    # === Offline Datasets ===
    # __sphinx_doc_input_begin__
    # Specify how to generate experiences:
    #  - "sampler": generate experiences via online simulation (default)
@@ -1,4 +1,11 @@
-"""Example of a custom gym environment. Run this for a demo."""
+"""Example of a custom gym environment. Run this for a demo.
+
+This example shows:
+  - using a custom environment
+  - using Tune for grid search
+
+You can visualize experiment results in ~/ray_results using TensorBoard.
+"""

 from __future__ import absolute_import
 from __future__ import division
@@ -7,10 +14,9 @@ from __future__ import print_function
 import numpy as np
 import gym
 from gym.spaces import Discrete, Box
-from gym.envs.registration import EnvSpec

 import ray
-from ray.tune import run_experiments
+from ray.tune import run_experiments, grid_search


 class SimpleCorridor(gym.Env):
@@ -24,7 +30,6 @@ class SimpleCorridor(gym.Env):
        self.action_space = Discrete(2)
        self.observation_space = Box(
            0.0, self.end_pos, shape=(1, ), dtype=np.float32)
-        self._spec = EnvSpec("SimpleCorridor-{}-v0".format(self.end_pos))

    def reset(self):
        self.cur_pos = 0
@@ -48,7 +53,12 @@ if __name__ == "__main__":
        "demo": {
            "run": "PPO",
            "env": SimpleCorridor,  # or "corridor" if registered above
+            "stop": {
+                "timesteps_total": 10000,
+            },
            "config": {
+                "lr": grid_search([1e-2, 1e-4, 1e-6]),  # try different lrs
+                "num_workers": 1,  # parallelism
                "env_config": {
                    "corridor_length": 5,
                },
@@ -0,0 +1,54 @@
+"""Example of a custom training workflow. Run this for a demo.
+
+This example shows:
+  - using Tune trainable functions to implement custom training workflows
+
+You can visualize experiment results in ~/ray_results using TensorBoard.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import ray
+from ray.rllib.agents.ppo import PPOAgent
+from ray.tune import run_experiments
+
+
+def my_train_fn(config, reporter):
+    """Two-phase PPO training workflow used as a Tune trainable function.
+
+    Phase 1 trains a PPO agent on CartPole-v0 with the learning rate given
+    in ``config``; its checkpoint is then restored into a second agent that
+    continues training with a lower learning rate (phase 2).
+
+    Args:
+        config: agent config dict passed to ``PPOAgent``. Note that its
+            "lr" entry is overwritten in place with 0.0001 before phase 2.
+        reporter: callable invoked with every training result expanded as
+            keyword arguments (each result is tagged with a "phase" key).
+    """
+    # Phase 1: train for 10 iterations with the (high) LR from `config`
+    agent1 = PPOAgent(env="CartPole-v0", config=config)
+    for _ in range(10):
+        result = agent1.train()
+        result["phase"] = 1
+        reporter(**result)
+        # Remember the latest timestep count so phase 2 can be offset by it
+        phase1_time = result["timesteps_total"]
+    state = agent1.save()
+    agent1.stop()
+
+    # Phase 2: train for 10 more iterations with a low LR, starting from
+    # the phase 1 checkpoint
+    config["lr"] = 0.0001
+    agent2 = PPOAgent(env="CartPole-v0", config=config)
+    agent2.restore(state)
+    for _ in range(10):
+        result = agent2.train()
+        result["phase"] = 2
+        result["timesteps_total"] += phase1_time  # keep time moving forward
+        reporter(**result)
+    agent2.stop()
+
+
+if __name__ == "__main__":
+    ray.init()
+    # Run the custom training function defined above as the "demo" experiment
+    run_experiments({
+        "demo": {
+            "run": my_train_fn,
+            "resources_per_trial": {
+                "cpu": 1,
+            },
+            "config": {
+                # Phase 1 learning rate; my_train_fn lowers it to 0.0001
+                # for phase 2
+                "lr": 0.01,
+                "num_workers": 0,
+            },
+        },
+    })
@@ -0,0 +1,233 @@
+"""Example of hierarchical training using the multi-agent API.
+
+The example env is that of a "windy maze". The agent observes the current wind
+direction and can either choose to stand still, or move in that direction.
+
+You can try out the env directly with:
+
+    $ python hierarchical_training.py --flat
+
+A simple hierarchical formulation involves a high-level agent that issues goals
+(i.e., go north / south / east / west), and a low-level agent that executes
+these goals over a number of time-steps. This can be implemented as a
+multi-agent environment with a top-level agent and low-level agents spawned
+for each higher-level action. The lower level agent is rewarded for moving
+in the right direction.
+
+You can try this formulation with:
+
+    $ python hierarchical_training.py  # gets ~100 rew after ~100k timesteps
+
+Note that the hierarchical formulation actually converges slightly slower than
+using --flat in this example.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import random
+import gym
+from gym.spaces import Box, Discrete, Tuple
+import logging
+
+import ray
+from ray.tune import run_experiments, function
+from ray.rllib.env import MultiAgentEnv
+from ray.rllib.agents.ppo import PPOAgent
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--flat", action="store_true")
+
+# Agent has to traverse the maze from the starting position S -> F
+# Observation space [x_pos, y_pos, wind_direction]
+# Action space: stay still OR move in current wind direction
+MAP_DATA = """
+#########
+#S      #
+####### #
+      # #
+      # #
+####### #
+#F      #
+#########"""
+
+logger = logging.getLogger(__name__)
+
+
+class WindyMazeEnv(gym.Env):
+    """Single-agent "windy maze" environment built from MAP_DATA.
+
+    Positions are (row, column) index pairs into the map. At every step the
+    agent sees its position and the current wind direction and chooses
+    whether to stay put (action 0) or move one cell in the wind direction
+    (action 1). Reaching the "F" cell yields a reward of 100.
+    """
+
+    def __init__(self, env_config):
+        """Parse MAP_DATA, locate the S/F cells and define the spaces.
+
+        Args:
+            env_config: accepted for interface compatibility; not read here.
+        """
+        # Drop empty lines (e.g. the leading newline of the MAP_DATA literal)
+        self.map = [m for m in MAP_DATA.split("\n") if m]
+        self.x_dim = len(self.map)  # number of rows
+        self.y_dim = len(self.map[0])  # number of columns (from first row)
+        logger.info("Loaded map {} {}".format(self.x_dim, self.y_dim))
+        for x in range(self.x_dim):
+            for y in range(self.y_dim):
+                if self.map[x][y] == "S":
+                    self.start_pos = (x, y)
+                elif self.map[x][y] == "F":
+                    self.end_pos = (x, y)
+        logger.info("Start pos {} end pos {}".format(self.start_pos,
+                                                     self.end_pos))
+        self.observation_space = Tuple([
+            Box(0, 100, shape=(2, )),  # (x, y)
+            Discrete(4),  # wind direction (N, E, S, W)
+        ])
+        self.action_space = Discrete(2)  # whether to move or not
+
+    def reset(self):
+        """Start a new episode at the S cell with a random wind direction.
+
+        Returns:
+            The initial observation ``[[row, col], wind_direction]``.
+        """
+        self.wind_direction = random.choice([0, 1, 2, 3])
+        self.pos = self.start_pos
+        self.num_steps = 0
+        return [[self.pos[0], self.pos[1]], self.wind_direction]
+
+    def step(self, action):
+        """Apply one action and resample the wind.
+
+        Args:
+            action: 1 to move one cell in the current wind direction; any
+                other value leaves the position unchanged.
+
+        Returns:
+            Tuple ``(obs, reward, done, info)`` where the reward is 100 if
+            the agent is on the F cell and 0 otherwise, and the episode ends
+            at the goal or once 200 steps have been taken.
+        """
+        if action == 1:
+            self.pos = self._get_new_pos(self.pos, self.wind_direction)
+        self.num_steps += 1
+        # The wind shown in the returned observation applies to the NEXT step
+        self.wind_direction = random.choice([0, 1, 2, 3])
+        at_goal = self.pos == self.end_pos
+        done = at_goal or self.num_steps >= 200
+        return ([[self.pos[0], self.pos[1]], self.wind_direction],
+                100 * int(at_goal), done, {})
+
+    def _get_new_pos(self, pos, direction):
+        """Return the cell one step from `pos` in `direction`, if reachable.
+
+        Directions: 0 = row - 1, 1 = column + 1, 2 = row + 1, 3 = column - 1.
+        If the target cell is outside the map or is a wall ("#"), `pos` is
+        returned unchanged.
+        """
+        # NOTE(review): a direction outside 0-3 leaves `new_pos` unassigned
+        # and would fail on the bounds check below; callers in this file only
+        # pass values drawn from [0, 1, 2, 3] or a Discrete(4) action.
+        if direction == 0:
+            new_pos = (pos[0] - 1, pos[1])
+        elif direction == 1:
+            new_pos = (pos[0], pos[1] + 1)
+        elif direction == 2:
+            new_pos = (pos[0] + 1, pos[1])
+        elif direction == 3:
+            new_pos = (pos[0], pos[1] - 1)
+        if (new_pos[0] >= 0 and new_pos[0] < self.x_dim and new_pos[1] >= 0
+                and new_pos[1] < self.y_dim
+                and self.map[new_pos[0]][new_pos[1]] != "#"):
+            return new_pos
+        else:
+            return pos  # did not move
+
+
+class HierarchicalWindyMazeEnv(MultiAgentEnv):
+    def __init__(self, env_config):
+        self.flat_env = WindyMazeEnv(env_config)
+
+    def reset(self):
+        self.cur_obs = self.flat_env.reset()
+        self.current_goal = None
+        self.steps_remaining_at_level = None
+        self.num_high_level_steps = 0
+        # current low level agent id. This must be unique for each high level
+        # step since agent ids cannot be reused.
+        self.low_level_agent_id = "low_level_{}".format(
+            self.num_high_level_steps)
+        return {
+            "high_level_agent": self.cur_obs,
+        }
+
+    def step(self, action_dict):
+        assert len(action_dict) == 1, action_dict
+        if "high_level_agent" in action_dict:
+            return self._high_level_step(action_dict["high_level_agent"])
+        else:
+            return self._low_level_step(list(action_dict.values())[0])
+
+    def _high_level_step(self, action):
+        logger.debug("High level agent sets goal".format(action))
+        self.current_goal = action
+        self.steps_remaining_at_level = 25
+        self.num_high_level_steps += 1
+        self.low_level_agent_id = "low_level_{}".format(
+            self.num_high_level_steps)
+        obs = {self.low_level_agent_id: [self.cur_obs, self.current_goal]}
+        rew = {self.low_level_agent_id: 0}
+        done = {"__all__": False}
+        return obs, rew, done, {}
+
+    def _low_level_step(self, action):
+        logger.debug("Low level agent step {}".format(action))
+        self.steps_remaining_at_level -= 1
+        cur_pos = tuple(self.cur_obs[0])
+        goal_pos = self.flat_env._get_new_pos(cur_pos, self.current_goal)
+
+        # Step in the actual env
+        f_obs, f_rew, f_done, _ = self.flat_env.step(action)
+        new_pos = tuple(f_obs[0])
+        self.cur_obs = f_obs
+
+        # Calculate low-level agent observation and reward
+        obs = {self.low_level_agent_id: [f_obs, self.current_goal]}
+        if new_pos != cur_pos:
+            if new_pos == goal_pos:
+                rew = {self.low_level_agent_id: 1}
+            else:
+                rew = {self.low_level_agent_id: -1}
+        else:
+            rew = {self.low_level_agent_id: 0}
+
+        # Handle env termination & transitions back to higher level
+        done = {"__all__": False}
+        if f_done:
+            done["__all__"] = True
+            logger.debug("high level final reward {}".format(f_rew))
+            rew["high_level_agent"] = f_rew
+            obs["high_level_agent"] = f_obs
+        elif self.steps_remaining_at_level == 0:
+            done[self.low_level_agent_id] = True
+            rew["high_level_agent"] = 0
+            obs["high_level_agent"] = f_obs
+
+        return obs, rew, done, {}
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    ray.init()
+    if args.flat:
+        # --flat: train PPO directly on the single-agent maze
+        run_experiments({
+            "maze_single": {
+                "run": "PPO",
+                "env": WindyMazeEnv,
+                "config": {
+                    "num_workers": 0,
+                },
+            },
+        })
+    else:
+        # Instantiated only to read the observation/action spaces used in
+        # the policy specs below
+        maze = WindyMazeEnv(None)
+
+        def policy_mapping_fn(agent_id):
+            # All "low_level_<n>" agents share one policy; every other
+            # agent id maps to the high-level policy
+            if agent_id.startswith("low_level_"):
+                return "low_level_policy"
+            else:
+                return "high_level_policy"
+
+        run_experiments({
+            "maze_hier": {
+                "run": "PPO",
+                "env": HierarchicalWindyMazeEnv,
+                "config": {
+                    "num_workers": 0,
+                    "log_level": "INFO",
+                    "entropy_coeff": 0.01,
+                    "multiagent": {
+                        "policy_graphs": {
+                            # High level: observes the flat maze observation
+                            # and picks one of 4 goal directions
+                            "high_level_policy": (PPOAgent._policy_graph,
+                                                  maze.observation_space,
+                                                  Discrete(4), {
+                                                      "gamma": 0.9
+                                                  }),
+                            # Low level: observes (flat observation, goal)
+                            # and picks the flat env's move / stay action
+                            "low_level_policy": (PPOAgent._policy_graph,
+                                                 Tuple([
+                                                     maze.observation_space,
+                                                     Discrete(4)
+                                                 ]), maze.action_space, {
+                                                     "gamma": 0.0
+                                                 }),
+                        },
+                        "policy_mapping_fn": function(policy_mapping_fn),
+                    },
+                },
+            },
+        })
@@ -1,18 +0,0 @@
-StarCraft on RLlib
-==================
-
-This builds off the StarCraft env in https://github.com/oxwhirl/pymarl_alpha.
-
-Temporary instructions
----------------------
-
-To install, run
-
-```
-git clone https://github.com/oxwhirl/pymarl_alpha
-mv pymarl_alpha ~/pymarl
-cd ~/pymarl
-install_sc1.sh
-install_sc2.sh
-export PYMARL_PATH="~/pymarl"
-```
@@ -1,32 +0,0 @@
-## Adapted from `https://github.com/oxwhirl/pymarl_alpha`.
-
-env: sc2
-
-env_args:
-  map_name: "3m_3m"             # SC2 map name
-  difficulty: "7"               # Very hard
-  move_amount: 2                # How much units are ordered to move per step
-  step_mul: 8                   # How many frames are skiped per step
-  reward_sparse: False          # Only +1/-1 reward for win/defeat (the rest of reward configs are ignored if True)
-  reward_only_positive: True    # Reward is always positive
-  reward_negative_scale: 0.5    # How much to scale negative rewards, ignored if reward_only_positive=True
-  reward_death_value: 10        # Reward for killing an enemy unit and penalty for having an allied unit killed (if reward_only_poitive=False)
-  reward_scale: True            # Whether or not to scale rewards before returning to agents
-  reward_scale_rate: 20         # If reward_scale=True, the agents receive the reward of (max_reward / reward_scale_rate), where max_reward is the maximum possible reward per episode
-  reward_win: 200               # Reward for win
-  reward_defeat: 0              # Reward for defeat (should be nonpositive)
-  state_last_action: True       # Whether the last actions of units is a part of the state
-  obs_instead_of_state: False   # Use combination of all agnets' observations as state
-  obs_own_health: True          # Whether agents receive their own health as a part of observation
-  obs_all_health: True          # Whether agents receive the health of all units (in the sight range) as a part of observataion
-  continuing_episode: False     # Stop/continue episode after its termination
-  game_version: "4.1.2"         # Ignored for Mac/Windows
-  save_replay_prefix: ""        # Prefix of the replay to be saved
-  heuristic: False              # Whether or not use a simple nonlearning hearistic as a controller
-
-test_nepisode: 32
-test_interval: 10000
-log_interval: 2000
-runner_log_interval: 2000
-learner_log_interval: 2000
-t_max: 2000000
@@ -1,153 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-from gym.spaces import Discrete, Box, Dict, Tuple
-import os
-import sys
-import tensorflow as tf
-import tensorflow.contrib.slim as slim
-import yaml
-
-import ray
-from ray.rllib.env.multi_agent_env import MultiAgentEnv
-from ray.tune.registry import register_env
-from ray.rllib.models import Model, ModelCatalog
-from ray.rllib.models.misc import normc_initializer
-from ray.rllib.agents.qmix import QMixAgent
-from ray.rllib.agents.pg import PGAgent
-from ray.rllib.agents.ppo import PPOAgent
-from ray.tune.logger import pretty_print
-
-
-class MaskedActionsModel(Model):
-    def _build_layers_v2(self, input_dict, num_outputs, options):
-        action_mask = input_dict["obs"]["action_mask"]
-        if num_outputs != action_mask.shape[1].value:
-            raise ValueError(
-                "This model assumes num outputs is equal to max avail actions",
-                num_outputs, action_mask)
-
-        # Standard FC net component.
-        last_layer = input_dict["obs"]["obs"]
-        hiddens = [256, 256]
-        for i, size in enumerate(hiddens):
-            label = "fc{}".format(i)
-            last_layer = slim.fully_connected(
-                last_layer,
-                size,
-                weights_initializer=normc_initializer(1.0),
-                activation_fn=tf.nn.tanh,
-                scope=label)
-        action_logits = slim.fully_connected(
-            last_layer,
-            num_outputs,
-            weights_initializer=normc_initializer(0.01),
-            activation_fn=None,
-            scope="fc_out")
-
-        # Mask out invalid actions (use tf.float32.min for stability)
-        inf_mask = tf.maximum(tf.log(action_mask), tf.float32.min)
-        masked_logits = inf_mask + action_logits
-
-        return masked_logits, last_layer
-
-
-class SC2MultiAgentEnv(MultiAgentEnv):
-    """RLlib Wrapper around StarCraft2."""
-
-    def __init__(self, override_cfg):
-        PYMARL_PATH = override_cfg.pop("pymarl_path")
-        os.environ["SC2PATH"] = os.path.join(PYMARL_PATH,
-                                             "3rdparty/StarCraftII")
-        sys.path.append(os.path.join(PYMARL_PATH, "src"))
-        from envs.starcraft2 import StarCraft2Env
-        curpath = os.path.dirname(os.path.abspath(__file__))
-        with open(os.path.join(curpath, "sc2.yaml")) as f:
-            pymarl_args = yaml.load(f)
-            pymarl_args.update(override_cfg)
-            pymarl_args["env_args"].setdefault("seed", 0)
-
-        self._starcraft_env = StarCraft2Env(**pymarl_args)
-        obs_size = self._starcraft_env.get_obs_size()
-        num_actions = self._starcraft_env.get_total_actions()
-        self.observation_space = Dict({
-            "action_mask": Box(0, 1, shape=(num_actions, )),
-            "obs": Box(-1, 1, shape=(obs_size, ))
-        })
-        self.action_space = Discrete(self._starcraft_env.get_total_actions())
-
-    def reset(self):
-        obs_list, state_list = self._starcraft_env.reset()
-        return_obs = {}
-        for i, obs in enumerate(obs_list):
-            return_obs[i] = {
-                "action_mask": self._starcraft_env.get_avail_agent_actions(i),
-                "obs": obs
-            }
-        return return_obs
-
-    def step(self, action_dict):
-        # TODO(rliaw): Check to handle missing agents, if any
-        actions = [action_dict[k] for k in sorted(action_dict)]
-        rew, done, info = self._starcraft_env.step(actions)
-        obs_list = self._starcraft_env.get_obs()
-        return_obs = {}
-        for i, obs in enumerate(obs_list):
-            return_obs[i] = {
-                "action_mask": self._starcraft_env.get_avail_agent_actions(i),
-                "obs": obs
-            }
-        rews = {i: rew / len(obs_list) for i in range(len(obs_list))}
-        dones = {i: done for i in range(len(obs_list))}
-        dones["__all__"] = done
-        infos = {i: info for i in range(len(obs_list))}
-        return return_obs, rews, dones, infos
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--num-iters", type=int, default=100)
-    parser.add_argument("--run", type=str, default="qmix")
-    args = parser.parse_args()
-
-    path_to_pymarl = os.environ.get("PYMARL_PATH",
-                                    os.path.expanduser("~/pymarl/"))
-
-    ray.init()
-    ModelCatalog.register_custom_model("mask_model", MaskedActionsModel)
-
-    register_env("starcraft", lambda cfg: SC2MultiAgentEnv(cfg))
-    agent_cfg = {
-        "observation_filter": "NoFilter",
-        "num_workers": 4,
-        "model": {
-            "custom_model": "mask_model",
-        },
-        "env_config": {
-            "pymarl_path": path_to_pymarl
-        }
-    }
-    if args.run.lower() == "qmix":
-
-        def grouped_sc2(cfg):
-            env = SC2MultiAgentEnv(cfg)
-            agent_list = list(range(env._starcraft_env.n_agents))
-            grouping = {
-                "group_1": agent_list,
-            }
-            obs_space = Tuple([env.observation_space for i in agent_list])
-            act_space = Tuple([env.action_space for i in agent_list])
-            return env.with_agent_groups(
-                grouping, obs_space=obs_space, act_space=act_space)
-
-        register_env("grouped_starcraft", grouped_sc2)
-        agent = QMixAgent(env="grouped_starcraft", config=agent_cfg)
-    elif args.run.lower() == "pg":
-        agent = PGAgent(env="starcraft", config=agent_cfg)
-    elif args.run.lower() == "ppo":
-        agent_cfg.update({"vf_share_layers": True})
-        agent = PPOAgent(env="starcraft", config=agent_cfg)
-    for i in range(args.num_iters):
-        print(pretty_print(agent.train()))