diff --git a/doc/source/rllib-api.svg b/doc/source/rllib-api.svg index e157e106f..c5b338250 100644 --- a/doc/source/rllib-api.svg +++ b/doc/source/rllib-api.svg @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/doc/source/rllib-concepts.rst b/doc/source/rllib-concepts.rst index 46d998256..e1f96f949 100644 --- a/doc/source/rllib-concepts.rst +++ b/doc/source/rllib-concepts.rst @@ -103,6 +103,11 @@ The above basic policy, when run, will produce batches of observations with the assert "other_value" in samples.keys() +Policies in Multi-Agent +~~~~~~~~~~~~~~~~~~~~~~~ + +Beyond being agnostic of framework implementation, one of the main reasons to have a Policy abstraction is for use in multi-agent environments. For example, the `rock-paper-scissors example `__ shows how you can leverage the Policy abstraction to evaluate heuristic policies against learned policies. + Building Policies in TensorFlow ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/rllib-env.rst b/doc/source/rllib-env.rst index b04b91c3c..d3b614e80 100644 --- a/doc/source/rllib-env.rst +++ b/doc/source/rllib-env.rst @@ -218,6 +218,15 @@ Here is a simple `example training script 1``. +Rock Paper Scissors Example +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The `rock_paper_scissors_multiagent.py `__ example demonstrates several types of policies competing against each other: heuristic policies of repeating the same move, beating the last opponent move, and learned LSTM and feedforward policies. + +.. figure:: rock-paper-scissors.png + + TensorBoard output of running the rock-paper-scissors example, where a learned policy faces off against a random selection of the same-move and beat-last-move heuristics. Here the performance of heuristic policies vs the learned policy is compared with LSTM enabled (blue) and a plain feed-forward policy (red).
While the feedforward policy can easily beat the same-move heuristic by simply avoiding the last move taken, it takes an LSTM policy to distinguish between and consistently beat both policies. + Hierarchical Environments ~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -254,6 +263,10 @@ See this file for a runnable example: `hierarchical_training.py `__, you can put layers in global variables and straightforwardly share those layer objects between models instead of using variable scopes. + RLlib will create each policy's model in a separate ``tf.variable_scope``. However, variables can still be shared between policies by explicitly entering a globally shared variable scope with ``tf.VariableScope(reuse=tf.AUTO_REUSE)``: .. code-block:: python diff --git a/doc/source/rllib-examples.rst b/doc/source/rllib-examples.rst index b83a522dd..9e47cc18f 100644 --- a/doc/source/rllib-examples.rst +++ b/doc/source/rllib-examples.rst @@ -55,6 +55,8 @@ Serving and Offline Multi-Agent and Hierarchical ---------------------------- +- `Rock-paper-scissors `__: + Example of different heuristic and learned policies competing against each other in rock-paper-scissors. - `Two-step game `__: Example of the two-step game from the `QMIX paper `__. - `Hand-coded policy `__: @@ -72,13 +74,15 @@ Community Examples ------------------ - `CARLA `__: Example of training autonomous vehicles with RLlib and `CARLA `__ simulator. -- `Traffic Flow `__: - Example of optimizing mixed-autonomy traffic simulations with RLlib / multi-agent. +- `GFootball `__: + Example of setting up a multi-agent version of `GFootball `__ with RLlib. +- `NeuroCuts `__: + Example of building packet classification trees using RLlib / multi-agent in a bandit-like setting. - `Roboschool / SageMaker `__: Example of training robotic control policies in SageMaker with RLlib. - `StarCraft2 `__: Example of training in StarCraft2 maps with RLlib / multi-agent.
-- `NeuroCuts `__: - Example of building packet classification trees using RLlib / multi-agent in a bandit-like setting. +- `Traffic Flow `__: + Example of optimizing mixed-autonomy traffic simulations with RLlib / multi-agent. - `Sequential Social Dilemma Games `__: Example of using the multi-agent API to model several `social dilemma games `__. diff --git a/doc/source/rllib-training.rst b/doc/source/rllib-training.rst index 9c365f8fb..196a1659c 100644 --- a/doc/source/rllib-training.rst +++ b/doc/source/rllib-training.rst @@ -372,8 +372,6 @@ TensorFlow Eager While RLlib uses TF graph mode for all computations, you can still leverage TF eager to inspect the intermediate state of computations using `tf.py_function `__. Here's an example of using eager mode in `a custom RLlib model and loss `__. -There is also experimental support for running the entire loss function in eager mode. This can be enabled with ``use_eager: True``, e.g., ``rllib train --env=CartPole-v0 --run=PPO --config='{"use_eager": true, "simple_optimizer": true}'``. However this currently only works for a couple algorithms. 
- Episode Traces ~~~~~~~~~~~~~~ diff --git a/doc/source/rllib.rst b/doc/source/rllib.rst index df2d06dff..6b232023b 100644 --- a/doc/source/rllib.rst +++ b/doc/source/rllib.rst @@ -98,6 +98,8 @@ Concepts and Custom Algorithms ------------------------------ * `Policies `__ + - `Policies in Multi-Agent `__ + - `Building Policies in TensorFlow `__ - `Building Policies in TensorFlow Eager `__ diff --git a/doc/source/rock-paper-scissors.png b/doc/source/rock-paper-scissors.png new file mode 100644 index 000000000..fa8c27779 Binary files /dev/null and b/doc/source/rock-paper-scissors.png differ diff --git a/python/ray/rllib/agents/ppo/ppo.py b/python/ray/rllib/agents/ppo/ppo.py index 75030bf59..31f7864d7 100644 --- a/python/ray/rllib/agents/ppo/ppo.py +++ b/python/ray/rllib/agents/ppo/ppo.py @@ -138,6 +138,10 @@ def validate_config(config): "In multi-agent mode, policies will be optimized sequentially " "by the multi-GPU optimizer. Consider setting " "simple_optimizer=True if this doesn't work for you.") + if config["simple_optimizer"]: + logger.warning( + "Using the simple non-minibatch optimizer. This will greatly " + "reduce performance, consider simple_optimizer=False.") if not config["vf_share_layers"]: logger.warning( "FYI: By default, the value function will not share layers " diff --git a/python/ray/rllib/examples/rock_paper_scissors_multiagent.py b/python/ray/rllib/examples/rock_paper_scissors_multiagent.py new file mode 100644 index 000000000..634f5ea1d --- /dev/null +++ b/python/ray/rllib/examples/rock_paper_scissors_multiagent.py @@ -0,0 +1,215 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +"""A simple multi-agent env with two agents playing rock paper scissors. 
+ +This demonstrates running the following policies in competition: + (1) heuristic policy of repeating the same move + (2) heuristic policy of beating the last opponent move + (3) LSTM/feedforward PG policies + (4) LSTM policy with custom safety loss +""" + +import random +from gym.spaces import Discrete + +from ray import tune +from ray.rllib.agents.pg.pg import PGTrainer +from ray.rllib.agents.pg.pg_policy import PGTFPolicy +from ray.rllib.policy.policy import Policy +from ray.rllib.env.multi_agent_env import MultiAgentEnv +from ray.rllib.utils import try_import_tf + +tf = try_import_tf() + +ROCK = 0 +PAPER = 1 +SCISSORS = 2 + + +class RockPaperScissorsEnv(MultiAgentEnv): + """Two-player environment for rock paper scissors. + + The observation is simply the last opponent action.""" + + def __init__(self, _): + self.action_space = Discrete(3) + self.observation_space = Discrete(3) + self.player1 = "player1" + self.player2 = "player2" + self.last_move = None + self.num_moves = 0 + + def reset(self): + self.last_move = (0, 0) + self.num_moves = 0 + return { + self.player1: self.last_move[1], + self.player2: self.last_move[0], + } + + def step(self, action_dict): + move1 = action_dict[self.player1] + move2 = action_dict[self.player2] + self.last_move = (move1, move2) + obs = { + self.player1: self.last_move[1], + self.player2: self.last_move[0], + } + r1, r2 = { + (ROCK, ROCK): (0, 0), + (ROCK, PAPER): (-1, 1), + (ROCK, SCISSORS): (1, -1), + (PAPER, ROCK): (1, -1), + (PAPER, PAPER): (0, 0), + (PAPER, SCISSORS): (-1, 1), + (SCISSORS, ROCK): (-1, 1), + (SCISSORS, PAPER): (1, -1), + (SCISSORS, SCISSORS): (0, 0), + }[move1, move2] + rew = { + self.player1: r1, + self.player2: r2, + } + self.num_moves += 1 + done = { + "__all__": self.num_moves >= 10, + } + return obs, rew, done, {} + + +class AlwaysSameHeuristic(Policy): + """Pick a random move and stick with it for the entire episode.""" + + def __init__(self, observation_space, action_space, config): + 
Policy.__init__(self, observation_space, action_space, config) + + def get_initial_state(self): + return [random.choice([ROCK, PAPER, SCISSORS])] + + def compute_actions(self, + obs_batch, + state_batches, + prev_action_batch=None, + prev_reward_batch=None, + info_batch=None, + episodes=None, + **kwargs): + return [x for x in state_batches[0]], state_batches, {} + + def learn_on_batch(self, samples): + pass + + def get_weights(self): + pass + + def set_weights(self, weights): + pass + + +class BeatLastHeuristic(Policy): + """Play the move that would beat the last move of the opponent.""" + + def __init__(self, observation_space, action_space, config): + Policy.__init__(self, observation_space, action_space, config) + + def compute_actions(self, + obs_batch, + state_batches, + prev_action_batch=None, + prev_reward_batch=None, + info_batch=None, + episodes=None, + **kwargs): + def successor(x): + if x[ROCK] == 1: + return PAPER + elif x[PAPER] == 1: + return SCISSORS + elif x[SCISSORS] == 1: + return ROCK + + return [successor(x) for x in obs_batch], [], {} + + def learn_on_batch(self, samples): + pass + + def get_weights(self): + pass + + def set_weights(self, weights): + pass + + +def run_same_policy(): + """Use the same policy for both agents (trivial case).""" + + tune.run("PG", config={"env": RockPaperScissorsEnv}) + + +def run_heuristic_vs_learned(use_lstm=False, trainer="PG"): + """Run heuristic policies vs a learned agent. + + The learned agent should eventually reach a reward of ~5 with + use_lstm=False, and ~7 with use_lstm=True. The reason the LSTM policy + can perform better is since it can distinguish between the always_same vs + beat_last heuristics. 
+ """ + + def select_policy(agent_id): + if agent_id == "player1": + return "learned" + else: + return random.choice(["always_same", "beat_last"]) + + tune.run( + trainer, + stop={"timesteps_total": 400000}, + config={ + "env": RockPaperScissorsEnv, + "gamma": 0.9, + "num_workers": 4, + "num_envs_per_worker": 4, + "sample_batch_size": 10, + "train_batch_size": 200, + "multiagent": { + "policies_to_train": ["learned"], + "policies": { + "always_same": (AlwaysSameHeuristic, Discrete(3), + Discrete(3), {}), + "beat_last": (BeatLastHeuristic, Discrete(3), Discrete(3), + {}), + "learned": (None, Discrete(3), Discrete(3), { + "model": { + "use_lstm": use_lstm + } + }), + }, + "policy_mapping_fn": tune.function(select_policy), + }, + }) + + +def run_with_custom_entropy_loss(): + """Example of customizing the loss function of an existing policy. + + This performs about the same as the default loss does.""" + + def entropy_policy_gradient_loss(policy, batch_tensors): + actions = batch_tensors["actions"] + advantages = batch_tensors["advantages"] + return (-0.1 * policy.action_dist.entropy() - tf.reduce_mean( + policy.action_dist.logp(actions) * advantages)) + + EntropyPolicy = PGTFPolicy.with_updates( + loss_fn=entropy_policy_gradient_loss) + EntropyLossPG = PGTrainer.with_updates( + name="EntropyPG", get_policy_class=lambda _: EntropyPolicy) + run_heuristic_vs_learned(use_lstm=True, trainer=EntropyLossPG) + + +if __name__ == "__main__": + # run_same_policy() + # run_heuristic_vs_learned(use_lstm=False) + run_heuristic_vs_learned(use_lstm=False) + # run_with_custom_entropy_loss() diff --git a/python/ray/rllib/models/__init__.py b/python/ray/rllib/models/__init__.py index 70892f120..6f1f6d9cd 100644 --- a/python/ray/rllib/models/__init__.py +++ b/python/ray/rllib/models/__init__.py @@ -2,6 +2,8 @@ from ray.rllib.models.action_dist import ActionDistribution from ray.rllib.models.catalog import ModelCatalog, MODEL_DEFAULTS from ray.rllib.models.model import Model from 
ray.rllib.models.preprocessors import Preprocessor +from ray.rllib.models.tf.fcnet_v1 import FullyConnectedNetwork +from ray.rllib.models.tf.visionnet_v1 import VisionNetwork __all__ = [ "ActionDistribution", @@ -9,4 +11,6 @@ __all__ = [ "Model", "Preprocessor", "MODEL_DEFAULTS", + "FullyConnectedNetwork", # legacy + "VisionNetwork", # legacy ]