From 401e656b95e44ee1871a1f1f7f32e2db7b52bbb2 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Tue, 15 Jan 2019 16:25:25 -0800 Subject: [PATCH] [rllib] Sync filters at end of iteration not start; hierarchical docs (#3769) --- doc/source/rllib-env.rst | 37 +++++++++++++++++++++++++++++--- doc/source/rllib-stack.svg | 2 +- doc/source/rllib.rst | 2 +- python/ray/rllib/agents/agent.py | 4 +++- 4 files changed, 39 insertions(+), 6 deletions(-) diff --git a/doc/source/rllib-env.rst b/doc/source/rllib-env.rst index 055ae18fe..686f13ba2 100644 --- a/doc/source/rllib-env.rst +++ b/doc/source/rllib-env.rst @@ -108,8 +108,8 @@ Vectorized RLlib will auto-vectorize Gym envs for batch evaluation if the ``num_envs_per_worker`` config is set, or you can define a custom environment class that subclasses `VectorEnv `__ to implement ``vector_step()`` and ``vector_reset()``. -Multi-Agent ------------ +Multi-Agent and Hierarchical +---------------------------- .. note:: @@ -162,7 +162,6 @@ If all the agents will be using the same algorithm class to train, then you can "traffic_light" # Traffic lights are always controlled by this policy if agent_id.startswith("traffic_light_") else random.choice(["car1", "car2"]) # Randomly choose from car policies - }, }, }) @@ -203,6 +202,38 @@ Here is a simple `example training script 1``. +Hierarchical Environments +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Hierarchical training can sometimes be implemented as a special case of multi-agent RL. For example, consider a three-level hierarchy of policies, where a top-level policy issues high level actions that are executed at finer timescales by a mid-level and low-level policy. The following timeline shows one step of the top-level policy, which corresponds to two mid-level actions and five low-level actions: + +.. 
code-block:: text + + top_level ---------------------------------------------------------------> top_level ---> + mid_level_0 -------------------------------> mid_level_0 ----------------> mid_level_1 -> + low_level_0 -> low_level_0 -> low_level_0 -> low_level_1 -> low_level_1 -> low_level_2 -> + +This can be implemented as a multi-agent environment with three types of agents. Each higher-level action creates a new lower-level agent instance with a new id (e.g., ``low_level_0``, ``low_level_1``, ``low_level_2`` in the above example). These lower-level agents pop into existence at the start of higher-level steps, and terminate when their higher-level action ends. Their experiences are aggregated by policy, so from RLlib's perspective it's just optimizing three different types of policies. The configuration might look something like this: + +.. code-block:: python + + "multiagent": { + "policy_graphs": { + "top_level": (some_policy_graph, ...), + "mid_level": (some_policy_graph, ...), + "low_level": (some_policy_graph, ...), + }, + "policy_mapping_fn": + lambda agent_id: + "low_level" if agent_id.startswith("low_level_") else + "mid_level" if agent_id.startswith("mid_level_") else "top_level", + "policies_to_train": ["top_level"], + }, + + +In this setup, the appropriate rewards for training lower-level agents must be provided by the multi-agent env implementation. The environment class is also responsible for routing between the agents, e.g., conveying `goals `__ from higher-level agents to lower-level agents as part of the lower-level agent observation. 
+ + Grouping Agents ~~~~~~~~~~~~~~~ diff --git a/doc/source/rllib-stack.svg b/doc/source/rllib-stack.svg index c3c18f0be..c81aa29a5 100644 --- a/doc/source/rllib-stack.svg +++ b/doc/source/rllib-stack.svg @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/doc/source/rllib.rst b/doc/source/rllib.rst index 1bd20ad29..03bbe7f66 100644 --- a/doc/source/rllib.rst +++ b/doc/source/rllib.rst @@ -37,7 +37,7 @@ Environments * `RLlib Environments Overview `__ * `OpenAI Gym `__ * `Vectorized `__ -* `Multi-Agent `__ +* `Multi-Agent and Hierarchical `__ * `Interfacing with External Agents `__ * `Batch Asynchronous `__ diff --git a/python/ray/rllib/agents/agent.py b/python/ray/rllib/agents/agent.py index 26f12b29a..282c7736d 100644 --- a/python/ray/rllib/agents/agent.py +++ b/python/ray/rllib/agents/agent.py @@ -271,6 +271,8 @@ class Agent(Trainable): ev.set_global_vars.remote(self.global_vars) logger.debug("updated global vars: {}".format(self.global_vars)) + result = Trainable.train(self) + if (self.config.get("observation_filter", "NoFilter") != "NoFilter" and hasattr(self, "local_evaluator")): FilterManager.synchronize( @@ -280,12 +282,12 @@ class Agent(Trainable): logger.debug("synchronized filters: {}".format( self.local_evaluator.filters)) - result = Trainable.train(self) if self.config["callbacks"].get("on_train_result"): self.config["callbacks"]["on_train_result"]({ "agent": self, "result": result, }) + return result @override(Trainable)