[rllib] Add rock paper scissors multi-agent example (#5336)

2026-07-04 07:35:11 +08:00 · 2019-08-01 13:03:59 -07:00
parent bd6dfc994f
commit 20450a4e82
10 changed files with 252 additions and 7 deletions
@@ -138,6 +138,10 @@ def validate_config(config):
            "In multi-agent mode, policies will be optimized sequentially "
            "by the multi-GPU optimizer. Consider setting "
            "simple_optimizer=True if this doesn't work for you.")
+    if config["simple_optimizer"]:
+        logger.warning(
+            "Using the simple non-minibatch optimizer. This will greatly "
+            "reduce performance, consider simple_optimizer=False.")
    if not config["vf_share_layers"]:
        logger.warning(
            "FYI: By default, the value function will not share layers "
@@ -0,0 +1,215 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+"""A simple multi-agent env with two agents playing rock paper scissors.
+
+This demonstrates running the following policies in competition:
+    (1) heuristic policy of repeating the same move
+    (2) heuristic policy of beating the last opponent move
+    (3) LSTM/feedforward PG policies
+    (4) LSTM policy with custom entropy loss
+"""
+
+import random
+from gym.spaces import Discrete
+
+from ray import tune
+from ray.rllib.agents.pg.pg import PGTrainer
+from ray.rllib.agents.pg.pg_policy import PGTFPolicy
+from ray.rllib.policy.policy import Policy
+from ray.rllib.env.multi_agent_env import MultiAgentEnv
+from ray.rllib.utils import try_import_tf
+
+tf = try_import_tf()
+
+# Integer codes for the three moves. They serve both as action values and as
+# the keys of the payoff lookup in RockPaperScissorsEnv.
+ROCK = 0
+PAPER = 1
+SCISSORS = 2
+
+
+class RockPaperScissorsEnv(MultiAgentEnv):
+    """Two-player environment for rock paper scissors.
+
+    The observation is simply the last opponent action."""
+
+    def __init__(self, _):
+        self.action_space = Discrete(3)
+        self.observation_space = Discrete(3)
+        self.player1 = "player1"
+        self.player2 = "player2"
+        self.last_move = None
+        self.num_moves = 0
+
+    def reset(self):
+        self.last_move = (0, 0)
+        self.num_moves = 0
+        return {
+            self.player1: self.last_move[1],
+            self.player2: self.last_move[0],
+        }
+
+    def step(self, action_dict):
+        move1 = action_dict[self.player1]
+        move2 = action_dict[self.player2]
+        self.last_move = (move1, move2)
+        obs = {
+            self.player1: self.last_move[1],
+            self.player2: self.last_move[0],
+        }
+        r1, r2 = {
+            (ROCK, ROCK): (0, 0),
+            (ROCK, PAPER): (-1, 1),
+            (ROCK, SCISSORS): (1, -1),
+            (PAPER, ROCK): (1, -1),
+            (PAPER, PAPER): (0, 0),
+            (PAPER, SCISSORS): (-1, 1),
+            (SCISSORS, ROCK): (-1, 1),
+            (SCISSORS, PAPER): (1, -1),
+            (SCISSORS, SCISSORS): (0, 0),
+        }[move1, move2]
+        rew = {
+            self.player1: r1,
+            self.player2: r2,
+        }
+        self.num_moves += 1
+        done = {
+            "__all__": self.num_moves >= 10,
+        }
+        return obs, rew, done, {}
+
+
+class AlwaysSameHeuristic(Policy):
+    """Pick a random move and stick with it for the entire episode."""
+
+    def __init__(self, observation_space, action_space, config):
+        Policy.__init__(self, observation_space, action_space, config)
+
+    def get_initial_state(self):
+        return [random.choice([ROCK, PAPER, SCISSORS])]
+
+    def compute_actions(self,
+                        obs_batch,
+                        state_batches,
+                        prev_action_batch=None,
+                        prev_reward_batch=None,
+                        info_batch=None,
+                        episodes=None,
+                        **kwargs):
+        return [x for x in state_batches[0]], state_batches, {}
+
+    def learn_on_batch(self, samples):
+        pass
+
+    def get_weights(self):
+        pass
+
+    def set_weights(self, weights):
+        pass
+
+
+class BeatLastHeuristic(Policy):
+    """Play the move that would beat the last move of the opponent."""
+
+    def __init__(self, observation_space, action_space, config):
+        Policy.__init__(self, observation_space, action_space, config)
+
+    def compute_actions(self,
+                        obs_batch,
+                        state_batches,
+                        prev_action_batch=None,
+                        prev_reward_batch=None,
+                        info_batch=None,
+                        episodes=None,
+                        **kwargs):
+        def successor(x):
+            if x[ROCK] == 1:
+                return PAPER
+            elif x[PAPER] == 1:
+                return SCISSORS
+            elif x[SCISSORS] == 1:
+                return ROCK
+
+        return [successor(x) for x in obs_batch], [], {}
+
+    def learn_on_batch(self, samples):
+        pass
+
+    def get_weights(self):
+        pass
+
+    def set_weights(self, weights):
+        pass
+
+
+def run_same_policy():
+    """Use the same policy for both agents (trivial case)."""
+
+    tune.run("PG", config={"env": RockPaperScissorsEnv})
+
+
+def run_heuristic_vs_learned(use_lstm=False, trainer="PG"):
+    """Run heuristic policies vs a learned agent.
+
+    The learned agent should eventually reach a reward of ~5 with
+    use_lstm=False, and ~7 with use_lstm=True. The reason the LSTM policy
+    can perform better is since it can distinguish between the always_same vs
+    beat_last heuristics.
+    """
+
+    def select_policy(agent_id):
+        if agent_id == "player1":
+            return "learned"
+        else:
+            return random.choice(["always_same", "beat_last"])
+
+    tune.run(
+        trainer,
+        stop={"timesteps_total": 400000},
+        config={
+            "env": RockPaperScissorsEnv,
+            "gamma": 0.9,
+            "num_workers": 4,
+            "num_envs_per_worker": 4,
+            "sample_batch_size": 10,
+            "train_batch_size": 200,
+            "multiagent": {
+                "policies_to_train": ["learned"],
+                "policies": {
+                    "always_same": (AlwaysSameHeuristic, Discrete(3),
+                                    Discrete(3), {}),
+                    "beat_last": (BeatLastHeuristic, Discrete(3), Discrete(3),
+                                  {}),
+                    "learned": (None, Discrete(3), Discrete(3), {
+                        "model": {
+                            "use_lstm": use_lstm
+                        }
+                    }),
+                },
+                "policy_mapping_fn": tune.function(select_policy),
+            },
+        })
+
+
+def run_with_custom_entropy_loss():
+    """Example of customizing the loss function of an existing policy.
+
+    This performs about the same as the default loss does."""
+
+    def entropy_policy_gradient_loss(policy, batch_tensors):
+        actions = batch_tensors["actions"]
+        advantages = batch_tensors["advantages"]
+        return (-0.1 * policy.action_dist.entropy() - tf.reduce_mean(
+            policy.action_dist.logp(actions) * advantages))
+
+    EntropyPolicy = PGTFPolicy.with_updates(
+        loss_fn=entropy_policy_gradient_loss)
+    EntropyLossPG = PGTrainer.with_updates(
+        name="EntropyPG", get_policy_class=lambda _: EntropyPolicy)
+    run_heuristic_vs_learned(use_lstm=True, trainer=EntropyLossPG)
+
+
+if __name__ == "__main__":
+    # Choose the example to run by (un)commenting one of the calls below.
+    # run_same_policy()
+    # run_heuristic_vs_learned(use_lstm=True)
+    run_heuristic_vs_learned(use_lstm=False)
+    # run_with_custom_entropy_loss()
@@ -2,6 +2,8 @@ from ray.rllib.models.action_dist import ActionDistribution
 from ray.rllib.models.catalog import ModelCatalog, MODEL_DEFAULTS
 from ray.rllib.models.model import Model
 from ray.rllib.models.preprocessors import Preprocessor
+from ray.rllib.models.tf.fcnet_v1 import FullyConnectedNetwork
+from ray.rllib.models.tf.visionnet_v1 import VisionNetwork

 __all__ = [
    "ActionDistribution",
@@ -9,4 +11,6 @@ __all__ = [
    "Model",
    "Preprocessor",
    "MODEL_DEFAULTS",
+    "FullyConnectedNetwork",  # legacy
+    "VisionNetwork",  # legacy
 ]