[rllib] Document env compatibility, Ape-X support for multi-agent (#3147)

This commit is contained in:
Eric Liang
2018-10-31 21:59:34 -07:00
committed by GitHub
parent 2086a57e61
commit cd284bb487
5 changed files with 107 additions and 39 deletions
+46 -1
View File
@@ -5,6 +5,23 @@ RLlib works with several different types of environments, including `OpenAI Gym
.. image:: rllib-envs.svg
**Compatibility matrix**:
============= ================ ================== =========== ==================
Algorithm Discrete Actions Continuous Actions Multi-Agent Recurrent Policies
============= ================ ================== =========== ==================
A2C, A3C **Yes** **Yes** **Yes** **Yes**
PPO **Yes** **Yes** **Yes** **Yes**
PG **Yes** **Yes** **Yes** **Yes**
IMPALA **Yes** No **Yes** **Yes**
DQN, Rainbow **Yes** No **Yes** No
DDPG No **Yes** **Yes** No
APEX-DQN **Yes** No **Yes** No
APEX-DDPG No **Yes** **Yes** No
ES **Yes** **Yes** No No
ARS **Yes** **Yes** No No
============= ================ ================== =========== ==================
In the high-level agent APIs, environments are identified with string names. By default, the string will be interpreted as a gym `environment name <https://gym.openai.com/envs>`__, however you can also register custom environments by name:
.. code-block:: python
@@ -132,7 +149,35 @@ If all the agents will be using the same algorithm class to train, then you can
RLlib will create three distinct policies and route agent decisions to its bound policy. When an agent first appears in the env, ``policy_mapping_fn`` will be called to determine which policy it is bound to. RLlib reports separate training statistics for each policy in the return from ``train()``, along with the combined reward.
Here is a simple `example training script <https://github.com/ray-project/ray/blob/master/python/ray/rllib/examples/multiagent_cartpole.py>`__ in which you can vary the number of agents and policies in the environment. For how to use multiple training methods at once (here DQN and PPO), see the `two-trainer example <https://github.com/ray-project/ray/blob/master/python/ray/rllib/examples/multiagent_two_trainers.py>`__.
Here is a simple `example training script <https://github.com/ray-project/ray/blob/master/python/ray/rllib/examples/multiagent_cartpole.py>`__ in which you can vary the number of agents and policies in the environment. For how to use multiple training methods at once (here DQN and PPO), see the `two-trainer example <https://github.com/ray-project/ray/blob/master/python/ray/rllib/examples/multiagent_two_trainers.py>`__. Metrics are reported for each policy separately, for example:
.. code-block:: bash
:emphasize-lines: 6,14,22
Result for PPO_multi_cartpole_0:
episode_len_mean: 34.025862068965516
episode_reward_max: 159.0
episode_reward_mean: 86.06896551724138
info:
policy_0:
cur_lr: 4.999999873689376e-05
entropy: 0.6833480000495911
kl: 0.010264254175126553
policy_loss: -11.95590591430664
total_loss: 197.7039794921875
vf_explained_var: 0.0010995268821716309
vf_loss: 209.6578826904297
policy_1:
cur_lr: 4.999999873689376e-05
entropy: 0.6827034950256348
kl: 0.01119876280426979
policy_loss: -8.787769317626953
total_loss: 88.26161193847656
vf_explained_var: 0.0005457401275634766
vf_loss: 97.0471420288086
policy_reward_mean:
policy_0: 21.194444444444443
policy_1: 21.798387096774192
To scale to hundreds of agents, MultiAgentEnv batches policy evaluations across multiple agents internally. It can also be auto-vectorized by setting ``num_envs_per_worker > 1``.
+1 -1
View File
@@ -47,7 +47,7 @@ class ApexDDPGAgent(DDPGAgent):
def default_resource_request(cls, config):
cf = merge_dicts(cls._default_config, config)
return Resources(
cpu=1 + cf["optimizer"]["num_replay_buffer_shards"],
cpu=1,
gpu=cf["gpu"] and cf["gpu_fraction"] or 0,
extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"],
extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])
+1 -1
View File
@@ -50,7 +50,7 @@ class ApexAgent(DQNAgent):
def default_resource_request(cls, config):
cf = merge_dicts(cls._default_config, config)
return Resources(
cpu=1 + cf["optimizer"]["num_replay_buffer_shards"],
cpu=1,
gpu=cf["gpu"] and cf["gpu_fraction"] or 0,
extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"],
extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])
@@ -85,7 +85,6 @@ if __name__ == "__main__":
"custom_model": ["model1", "model2"][i % 2],
},
"gamma": random.choice([0.5, 0.8, 0.9, 0.95, 0.99]),
"n_step": random.choice([1, 2, 3, 4, 5]),
}
return (PPOPolicyGraph, obs_space, act_space, config)
@@ -98,12 +97,13 @@ if __name__ == "__main__":
run_experiments({
"test": {
"run": "PG",
"run": "PPO",
"env": "multi_cartpole",
"stop": {
"training_iteration": args.num_iters
},
"config": {
"simple_optimizer": True,
"multiagent": {
"policy_graphs": policy_graphs,
"policy_mapping_fn": tune.function(
@@ -6,6 +6,7 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import os
import random
import time
@@ -15,9 +16,10 @@ import numpy as np
from six.moves import queue
import ray
from ray.rllib.evaluation.sample_batch import SampleBatch, DEFAULT_POLICY_ID, \
MultiAgentBatch
from ray.rllib.optimizers.policy_optimizer import PolicyOptimizer
from ray.rllib.optimizers.replay_buffer import PrioritizedReplayBuffer
from ray.rllib.evaluation.sample_batch import SampleBatch
from ray.rllib.utils.actors import TaskPool, create_colocated
from ray.rllib.utils.timer import TimerStat
from ray.rllib.utils.window_stat import WindowStat
@@ -43,49 +45,61 @@ class ReplayActor(object):
self.prioritized_replay_beta = prioritized_replay_beta
self.prioritized_replay_eps = prioritized_replay_eps
self.replay_buffer = PrioritizedReplayBuffer(
self.buffer_size, alpha=prioritized_replay_alpha)
def new_buffer():
return PrioritizedReplayBuffer(
self.buffer_size, alpha=prioritized_replay_alpha)
self.replay_buffers = collections.defaultdict(new_buffer)
# Metrics
self.add_batch_timer = TimerStat()
self.replay_timer = TimerStat()
self.update_priorities_timer = TimerStat()
self.num_added = 0
def get_host(self):
return os.uname()[1]
def add_batch(self, batch):
PolicyOptimizer._check_not_multiagent(batch)
# Handle everything as if multiagent
if isinstance(batch, SampleBatch):
batch = MultiAgentBatch({DEFAULT_POLICY_ID: batch}, batch.count)
with self.add_batch_timer:
for row in batch.rows():
self.replay_buffer.add(row["obs"], row["actions"],
row["rewards"], row["new_obs"],
row["dones"], row["weights"])
for policy_id, s in batch.policy_batches.items():
for row in s.rows():
self.replay_buffers[policy_id].add(
row["obs"], row["actions"], row["rewards"],
row["new_obs"], row["dones"], row["weights"])
self.num_added += batch.count
def replay(self):
if self.num_added < self.replay_starts:
return None
with self.replay_timer:
if len(self.replay_buffer) < self.replay_starts:
return None
samples = {}
for policy_id, replay_buffer in self.replay_buffers.items():
(obses_t, actions, rewards, obses_tp1, dones, weights,
batch_indexes) = replay_buffer.sample(
self.train_batch_size, beta=self.prioritized_replay_beta)
samples[policy_id] = SampleBatch({
"obs": obses_t,
"actions": actions,
"rewards": rewards,
"new_obs": obses_tp1,
"dones": dones,
"weights": weights,
"batch_indexes": batch_indexes
})
return MultiAgentBatch(samples, self.train_batch_size)
(obses_t, actions, rewards, obses_tp1, dones, weights,
batch_indexes) = self.replay_buffer.sample(
self.train_batch_size, beta=self.prioritized_replay_beta)
batch = SampleBatch({
"obs": obses_t,
"actions": actions,
"rewards": rewards,
"new_obs": obses_tp1,
"dones": dones,
"weights": weights,
"batch_indexes": batch_indexes
})
return batch
def update_priorities(self, batch_indexes, td_errors):
def update_priorities(self, prio_dict):
with self.update_priorities_timer:
new_priorities = (np.abs(td_errors) + self.prioritized_replay_eps)
self.replay_buffer.update_priorities(batch_indexes, new_priorities)
for policy_id, (batch_indexes, td_errors) in prio_dict.items():
new_priorities = (
np.abs(td_errors) + self.prioritized_replay_eps)
self.replay_buffers[policy_id].update_priorities(
batch_indexes, new_priorities)
def stats(self, debug=False):
stat = {
@@ -94,7 +108,10 @@ class ReplayActor(object):
"update_priorities_time_ms": round(
1000 * self.update_priorities_timer.mean, 3),
}
stat.update(self.replay_buffer.stats(debug=debug))
for policy_id, replay_buffer in self.replay_buffers.items():
stat.update({
"policy_{}".format(policy_id): replay_buffer.stats(debug=debug)
})
return stat
@@ -126,10 +143,16 @@ class LearnerThread(threading.Thread):
with self.queue_timer:
ra, replay = self.inqueue.get()
if replay is not None:
prio_dict = {}
with self.grad_timer:
td_error = self.local_evaluator.compute_apply(replay)[
"td_error"]
self.outqueue.put((ra, replay, td_error, replay.count))
grad_out = self.local_evaluator.compute_apply(replay)
for pid, info in grad_out.items():
prio_dict[pid] = (
replay.policy_batches[pid]["batch_indexes"],
info["td_error"])
# send `replay` back also so that it gets released by the original
# thread: https://github.com/ray-project/ray/issues/2610
self.outqueue.put((ra, replay, prio_dict, replay.count))
self.learner_queue_size.push(self.inqueue.qsize())
self.weights_updated = True
@@ -267,8 +290,8 @@ class AsyncReplayOptimizer(PolicyOptimizer):
with self.timers["update_priorities"]:
while not self.learner.outqueue.empty():
ra, replay, td_error, count = self.learner.outqueue.get()
ra.update_priorities.remote(replay["batch_indexes"], td_error)
ra, _, prio_dict, count = self.learner.outqueue.get()
ra.update_priorities.remote(prio_dict)
train_timesteps += count
return sample_timesteps, train_timesteps