Multiagent model using concatenated observations (#1416)

* working multi action distribution and multiagent model * currently working but the splits arent done in the right place * added shared models * added categorical support and mountain car example * now compatible with generalized advantage estimation * working multiagent code with discrete and continuous example * moved reshaper to utils * code review changes made, ppo action placeholder moved to model catalog, all multiagent code moved out of fcnet * added examples in * added PEP8 compliance * examples are mostly pep8 compliant * removed all flake errors * added examples to jenkins tests * fixed custom options bug * added lines to let docker file find multiagent tests * shortened example run length * corrected nits * fixed flake errors
2026-06-29 05:34:49 +08:00 · 2018-01-18 19:51:31 -08:00
parent 215d526e0d
commit 37076a9ff8
16 changed files with 445 additions and 20 deletions
@@ -0,0 +1,5 @@
+# flake8: noqa
+from ray.rllib.examples.multiagent_mountaincar_env \
+    import MultiAgentMountainCarEnv
+from ray.rllib.examples.multiagent_pendulum_env \
+    import MultiAgentPendulumEnv
@@ -0,0 +1,56 @@
+""" Multiagent mountain car. Each agent outputs an action which
+is summed to form the total action. This is a discrete
+multiagent example
+"""
+
+import gym
+from gym.envs.registration import register
+
+import ray
+import ray.rllib.ppo as ppo
+from ray.tune.registry import get_registry, register_env
+
+env_name = "MultiAgentMountainCarEnv"
+
+env_version_num = 0
+env_name = env_name + '-v' + str(env_version_num)
+
+
+def pass_params_to_gym(env_name):
+    global env_version_num
+
+    register(
+      id=env_name,
+      entry_point='ray.rllib.examples:' + "MultiAgentMountainCarEnv",
+      max_episode_steps=200,
+      kwargs={}
+    )
+
+
+def create_env(env_config):
+    pass_params_to_gym(env_name)
+    env = gym.envs.make(env_name)
+    return env
+
+
+if __name__ == '__main__':
+    register_env(env_name, lambda env_config: create_env(env_config))
+    config = ppo.DEFAULT_CONFIG.copy()
+    horizon = 200
+    num_cpus = 2
+    ray.init(num_cpus=num_cpus, redirect_output=False)
+    config["num_workers"] = num_cpus
+    config["timesteps_per_batch"] = 100
+    config["num_sgd_iter"] = 10
+    config["gamma"] = 0.999
+    config["horizon"] = horizon
+    config["use_gae"] = True
+    config["model"].update({"fcnet_hiddens": [256, 256]})
+    options = {"multiagent_obs_shapes": [2, 2],
+               "multiagent_act_shapes": [3, 3],
+               "multiagent_shared_model": False,
+               "multiagent_fcnet_hiddens": [[32, 32]] * 2}
+    config["model"].update({"custom_options": options})
+    alg = ppo.PPOAgent(env=env_name, registry=get_registry(), config=config)
+    for i in range(1):
+        alg.train()
@@ -0,0 +1,52 @@
+import math
+from gym.spaces import Box, Tuple, Discrete
+import numpy as np
+from gym.envs.classic_control.mountain_car import MountainCarEnv
+
+"""
+Multiagent mountain car that sums and then
+averages its actions to produce the velocity
+"""
+
+
+class MultiAgentMountainCarEnv(MountainCarEnv):
+    def __init__(self):
+        self.min_position = -1.2
+        self.max_position = 0.6
+        self.max_speed = 0.07
+        self.goal_position = 0.5
+
+        self.low = np.array([self.min_position, -self.max_speed])
+        self.high = np.array([self.max_position, self.max_speed])
+
+        self.viewer = None
+
+        self.action_space = [Discrete(3) for _ in range(2)]
+        self.observation_space = Tuple(tuple(Box(self.low, self.high)
+                                             for _ in range(2)))
+
+        self._seed()
+        self.reset()
+
+    def _step(self, action):
+        summed_act = 0.5 * np.sum(action)
+
+        position, velocity = self.state
+        velocity += (summed_act - 1) * 0.001
+        velocity += math.cos(3 * position) * (-0.0025)
+        velocity = np.clip(velocity, -self.max_speed, self.max_speed)
+        position += velocity
+        position = np.clip(position, self.min_position, self.max_position)
+        if (position == self.min_position and velocity < 0):
+            velocity = 0
+
+        done = bool(position >= self.goal_position)
+
+        reward = position
+
+        self.state = (position, velocity)
+        return [np.array(self.state) for _ in range(2)], reward, done, {}
+
+    def _reset(self):
+        self.state = np.array([self.np_random.uniform(low=-0.6, high=-0.4), 0])
+        return [np.array(self.state) for _ in range(2)]
@@ -0,0 +1,56 @@
+""" Run script for multiagent pendulum env. Each agent outputs a
+torque which is summed to form the total torque. This is a
+continuous multiagent example
+"""
+
+import gym
+from gym.envs.registration import register
+
+import ray
+import ray.rllib.ppo as ppo
+from ray.tune.registry import get_registry, register_env
+
+env_name = "MultiAgentPendulumEnv"
+
+env_version_num = 0
+env_name = env_name + '-v' + str(env_version_num)
+
+
+def pass_params_to_gym(env_name):
+    global env_version_num
+
+    register(
+      id=env_name,
+      entry_point='ray.rllib.examples:' + "MultiAgentPendulumEnv",
+      max_episode_steps=100,
+      kwargs={}
+    )
+
+
+def create_env(env_config):
+    pass_params_to_gym(env_name)
+    env = gym.envs.make(env_name)
+    return env
+
+
+if __name__ == '__main__':
+    register_env(env_name, lambda env_config: create_env(env_config))
+    config = ppo.DEFAULT_CONFIG.copy()
+    horizon = 100
+    num_cpus = 2
+    ray.init(num_cpus=num_cpus, redirect_output=False)
+    config["num_workers"] = num_cpus
+    config["timesteps_per_batch"] = 100
+    config["num_sgd_iter"] = 10
+    config["gamma"] = 0.999
+    config["horizon"] = horizon
+    config["use_gae"] = True
+    config["model"].update({"fcnet_hiddens": [256, 256]})
+    options = {"multiagent_obs_shapes": [3, 3],
+               "multiagent_act_shapes": [1, 1],
+               "multiagent_shared_model": True,
+               "multiagent_fcnet_hiddens": [[32, 32]] * 2}
+    config["model"].update({"custom_options": options})
+    alg = ppo.PPOAgent(env=env_name, registry=get_registry(), config=config)
+    for i in range(1):
+        alg.train()
@@ -0,0 +1,70 @@
+from gym.spaces import Box, Tuple
+from gym.utils import seeding
+from gym.envs.classic_control.pendulum import PendulumEnv
+import numpy as np
+
+"""
+ Multiagent pendulum that sums its torques to generate an action
+"""
+
+
+class MultiAgentPendulumEnv(PendulumEnv):
+    metadata = {
+      'render.modes': ['human', 'rgb_array'],
+      'video.frames_per_second': 30
+    }
+
+    def __init__(self):
+        self.max_speed = 8
+        self.max_torque = 2.
+        self.dt = .05
+        self.viewer = None
+
+        high = np.array([1., 1., self.max_speed])
+        self.action_space = [Box(low=-self.max_torque / 2,
+                                 high=self.max_torque / 2, shape=(1,))
+                             for _ in range(2)]
+        self.observation_space = Tuple(tuple(Box(low=-high, high=high)
+                                             for _ in range(2)))
+
+        self._seed()
+
+    def _seed(self, seed=None):
+        self.np_random, seed = seeding.np_random(seed)
+        return [seed]
+
+    def _step(self, u):
+        th, thdot = self.state  # th := theta
+
+        summed_u = np.sum(u)
+        g = 10.
+        m = 1.
+        length = 1.
+        dt = self.dt
+
+        summed_u = np.clip(summed_u, -self.max_torque, self.max_torque)
+        self.last_u = summed_u  # for rendering
+        costs = self.angle_normalize(th) ** 2 + .1 * thdot ** 2 + \
+            .001 * (summed_u ** 2)
+
+        newthdot = thdot + (-3 * g / (2 * length) * np.sin(th + np.pi) +
+                            3. / (m * length ** 2) * summed_u) * dt
+        newth = th + newthdot * dt
+        newthdot = np.clip(newthdot, -self.max_speed, self.max_speed)
+
+        self.state = np.array([newth, newthdot])
+        return self._get_obs(), -costs, False, {}
+
+    def _reset(self):
+        high = np.array([np.pi, 1])
+        self.state = self.np_random.uniform(low=-high, high=high)
+        self.last_u = None
+        return self._get_obs()
+
+    def _get_obs(self):
+        theta, thetadot = self.state
+        return [np.array([np.cos(theta), np.sin(theta), thetadot])
+                for _ in range(2)]
+
+    def angle_normalize(self, x):
+        return (((x + np.pi) % (2 * np.pi)) - np.pi)
@@ -5,8 +5,10 @@ from ray.rllib.models.model import Model
 from ray.rllib.models.fcnet import FullyConnectedNetwork
 from ray.rllib.models.convnet import ConvolutionalNetwork
 from ray.rllib.models.lstm import LSTM
+from ray.rllib.models.multiagentfcnet import MultiAgentFullyConnectedNetwork


 __all__ = ["ActionDistribution", "ActionDistribution", "Categorical",
           "DiagGaussian", "Deterministic", "ModelCatalog", "Model",
-           "FullyConnectedNetwork", "ConvolutionalNetwork", "LSTM"]
+           "FullyConnectedNetwork", "ConvolutionalNetwork", "LSTM",
+           "MultiAgentFullyConnectedNetwork"]
@@ -4,6 +4,7 @@ from __future__ import print_function

 import tensorflow as tf
 import numpy as np
+from ray.rllib.utils.reshaper import Reshaper


 class ActionDistribution(object):
@@ -109,3 +110,49 @@ class Deterministic(ActionDistribution):

    def sample(self):
        return self.inputs
+
+
+class MultiActionDistribution(ActionDistribution):
+    """Action distribution that operates for list of actions.
+
+    Args:
+        inputs (Tensor list): A list of tensors from which to compute samples.
+    """
+    def __init__(self, inputs, action_space, child_distributions):
+        # you actually have to instantiate the child distributions
+        self.reshaper = Reshaper(action_space)
+        split_inputs = self.reshaper.split_tensor(inputs)
+        child_list = []
+        for i, distribution in enumerate(child_distributions):
+            child_list.append(distribution(split_inputs[i]))
+        self.child_distributions = child_list
+
+    def logp(self, x):
+        """The log-likelihood of the action distribution."""
+        split_list = self.reshaper.split_tensor(x)
+        for i, distribution in enumerate(self.child_distributions):
+            # Remove extra categorical dimension
+            if isinstance(distribution, Categorical):
+                split_list[i] = tf.squeeze(split_list[i], axis=-1)
+        log_list = np.asarray([distribution.logp(split_x) for
+                              distribution, split_x in
+                               zip(self.child_distributions, split_list)])
+        return np.sum(log_list)
+
+    def kl(self, other):
+        """The KL-divergence between two action distributions."""
+        kl_list = np.asarray([distribution.kl(other_distribution) for
+                              distribution, other_distribution in
+                              zip(self.child_distributions,
+                                  other.child_distributions)])
+        return np.sum(kl_list)
+
+    def entropy(self):
+        """The entropy of the action distribution."""
+        entropy_list = np.array([s.entropy() for s in
+                                 self.child_distributions])
+        return np.sum(entropy_list)
+
+    def sample(self):
+        """Draw a sample from the action distribution."""
+        return [[s.sample() for s in self.child_distributions]]
@@ -3,15 +3,19 @@ from __future__ import division
 from __future__ import print_function

 import gym
+import numpy as np
+import tensorflow as tf
+from functools import partial

 from ray.tune.registry import RLLIB_MODEL, RLLIB_PREPROCESSOR, \
    _default_registry

 from ray.rllib.models.action_dist import (
-    Categorical, Deterministic, DiagGaussian)
+    Categorical, Deterministic, DiagGaussian, MultiActionDistribution)
 from ray.rllib.models.preprocessors import get_preprocessor
 from ray.rllib.models.fcnet import FullyConnectedNetwork
 from ray.rllib.models.visionnet import VisionNetwork
+from ray.rllib.models.multiagentfcnet import MultiAgentFullyConnectedNetwork


 MODEL_CONFIGS = [
@@ -66,10 +70,49 @@ class ModelCatalog(object):
                return Deterministic, action_space.shape[0]
        elif isinstance(action_space, gym.spaces.Discrete):
            return Categorical, action_space.n
+        elif isinstance(action_space, list):
+            size = 0
+            child_dist = []
+            for action in action_space:
+                dist, action_size = ModelCatalog.get_action_dist(action)
+                child_dist.append(dist)
+                size += action_size
+            return partial(MultiActionDistribution,
+                           child_distributions=child_dist,
+                           action_space=action_space), size

        raise NotImplementedError(
            "Unsupported args: {} {}".format(action_space, dist_type))

+    @staticmethod
+    def get_action_placeholder(action_space):
+        """Returns an action placeholder that is consistent with the action space
+
+        Args:
+            action_space (Space): Action space of the target gym env.
+        Returns:
+            action_placeholder (Tensor): A placeholder for the actions
+        """
+
+        if isinstance(action_space, gym.spaces.Box):
+            return tf.placeholder(
+                tf.float32, shape=(None, action_space.shape[0]))
+        elif isinstance(action_space, gym.spaces.Discrete):
+            return tf.placeholder(tf.int64, shape=(None,))
+        elif isinstance(action_space, list):
+            size = 0
+            for i in range(len(action_space)):
+                size += np.product(action_space[i].shape)
+            # TODO(ev) this obviously won't work for mixed spaces
+            if isinstance(action_space[0], gym.spaces.Discrete):
+                return tf.placeholder(tf.int64, shape=(None,
+                                                       len(action_space)))
+            elif isinstance(action_space[0], gym.spaces.Box):
+                return tf.placeholder(tf.float32, shape=(None, size))
+        else:
+            raise NotImplementedError("action space {}"
+                                      " not supported".format(action_space))
+
    @staticmethod
    def get_model(registry, inputs, num_outputs, options=dict()):
        """Returns a suitable model conforming to given input and output specs.
@@ -92,6 +135,12 @@ class ModelCatalog(object):

        obs_rank = len(inputs.shape) - 1

+        # num_outputs > 1 used to avoid hitting this with the value function
+        if isinstance(options.get("custom_options", {}).get(
+          "multiagent_fcnet_hiddens", 1), list) and num_outputs > 1:
+            return MultiAgentFullyConnectedNetwork(inputs,
+                                                   num_outputs, options)
+
        if obs_rank > 1:
            return VisionNetwork(inputs, num_outputs, options)

@@ -141,7 +190,6 @@ class ModelCatalog(object):
        Returns:
            preprocessor (Preprocessor): Preprocessor for the env observations.
        """
-
        for k in options.keys():
            if k not in MODEL_CONFIGS:
                raise Exception(
@@ -14,6 +14,7 @@ class FullyConnectedNetwork(Model):

    def _init(self, inputs, num_outputs, options):
        hiddens = options.get("fcnet_hiddens", [256, 256])
+
        fcnet_activation = options.get("fcnet_activation", "tanh")
        if fcnet_activation == "tanh":
            activation = tf.nn.tanh
@@ -25,14 +26,16 @@ class FullyConnectedNetwork(Model):
            i = 1
            last_layer = inputs
            for size in hiddens:
+                label = "fc{}".format(i)
                last_layer = slim.fully_connected(
                    last_layer, size,
                    weights_initializer=normc_initializer(1.0),
                    activation_fn=activation,
-                    scope="fc{}".format(i))
+                    scope=label)
                i += 1
+            label = "fc_out"
            output = slim.fully_connected(
                last_layer, num_outputs,
                weights_initializer=normc_initializer(0.01),
-                activation_fn=None, scope="fc_out")
+                activation_fn=None, scope=label)
            return output, last_layer
@@ -0,0 +1,43 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+from ray.rllib.models.model import Model
+from ray.rllib.models.fcnet import FullyConnectedNetwork
+from ray.rllib.models.action_dist import Reshaper
+
+
+class MultiAgentFullyConnectedNetwork(Model):
+    """Multiagent fully connected network."""
+
+    def _init(self, inputs, num_outputs, options):
+
+        # Split the input and output tensors
+        input_shapes = options["custom_options"]["multiagent_obs_shapes"]
+        output_shapes = options["custom_options"]["multiagent_act_shapes"]
+        input_reshaper = Reshaper(input_shapes)
+        output_reshaper = Reshaper(output_shapes)
+        split_inputs = input_reshaper.split_tensor(inputs)
+        num_actions = output_reshaper.split_number(num_outputs)
+
+        custom_options = options["custom_options"]
+        hiddens = custom_options.get("multiagent_fcnet_hiddens",
+                                     [[256, 256]]*1)
+
+        # check for a shared model
+        shared_model = custom_options.get("multiagent_shared_model", 0)
+        reuse = tf.AUTO_REUSE if shared_model else False
+        outputs = []
+        for i in range(len(hiddens)):
+            with tf.variable_scope("multi{}".format(i), reuse=reuse):
+                sub_options = options.copy()
+                sub_options.update({"fcnet_hiddens": hiddens[i]})
+                # TODO(ev) make this support arbitrary networks
+                fcnet = FullyConnectedNetwork(
+                    split_inputs[i], int(num_actions[i]), sub_options)
+                output = fcnet.outputs
+                outputs.append(output)
+        overall_output = tf.concat(outputs, axis=1)
+        return overall_output, outputs
@@ -2,7 +2,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

-import gym.spaces
 import tensorflow as tf

 from ray.rllib.models import ModelCatalog
@@ -18,8 +17,6 @@ class ProximalPolicyLoss(object):
            observations, value_targets, advantages, actions,
            prev_logits, prev_vf_preds, logit_dim,
            kl_coeff, distribution_class, config, sess, registry):
-        assert (isinstance(action_space, gym.spaces.Discrete) or
-                isinstance(action_space, gym.spaces.Box))
        self.prev_dist = distribution_class(prev_logits)

        # Saved so that we can compute actions given different observations
@@ -220,7 +220,6 @@ class PPOAgent(Agent):
            self.local_evaluator.filters, self.remote_evaluators)
        res = self._fetch_metrics_from_remote_evaluators()
        res = res._replace(info=info)
-
        return res

    def _fetch_metrics_from_remote_evaluators(self):
@@ -2,7 +2,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

-import gym.spaces
 import pickle
 import tensorflow as tf
 import os
@@ -68,16 +67,7 @@ class PPOEvaluator(Evaluator):
        self.advantages = tf.placeholder(tf.float32, shape=(None,))

        action_space = self.env.action_space
-        # TODO(rliaw): pull this into model_catalog
-        if isinstance(action_space, gym.spaces.Box):
-            self.actions = tf.placeholder(
-                tf.float32, shape=(None, action_space.shape[0]))
-        elif isinstance(action_space, gym.spaces.Discrete):
-            self.actions = tf.placeholder(tf.int64, shape=(None,))
-        else:
-            raise NotImplemented(
-                "action space" + str(type(action_space)) +
-                "currently not supported")
+        self.actions = ModelCatalog.get_action_placeholder(action_space)
        self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist(
            action_space)
        # Log probabilities from the policy before the policy update.
@@ -0,0 +1,46 @@
+import numpy as np
+import tensorflow as tf
+
+
+class Reshaper(object):
+    """
+    This class keeps track of where in the flattened observation space
+    we should be slicing and what the new shapes should be
+    """
+    def __init__(self, env_space):
+        self.shapes = []
+        self.slice_positions = []
+        self.env_space = env_space
+        if isinstance(env_space, list):
+            for space in env_space:
+                # Handle both gym arrays and just lists of inputs length
+                if hasattr(space, "shape"):
+                    arr_shape = np.asarray(space.shape)
+                else:
+                    arr_shape = space
+                self.shapes.append(arr_shape)
+                if len(self.slice_positions) == 0:
+                    self.slice_positions.append(np.product(arr_shape))
+                else:
+                    self.slice_positions.append(np.product(arr_shape) +
+                                                self.slice_positions[-1])
+        else:
+            self.shapes.append(np.asarray(env_space.shape))
+            self.slice_positions.append(np.product(env_space.shape))
+
+    def get_slice_lengths(self):
+        diffed_list = np.diff(self.slice_positions).tolist()
+        diffed_list.insert(0, self.slice_positions[0])
+        return np.asarray(diffed_list).astype(int)
+
+    def split_tensor(self, tensor, axis=-1):
+        # FIXME (ev) This won't work for mixed action distributions like
+        # one agent Gaussian one agent discrete
+        slice_rescale = int(tensor.shape.as_list()[axis] /
+                            int(np.sum(self.get_slice_lengths())))
+        return tf.split(tensor, slice_rescale*self.get_slice_lengths(),
+                        axis=axis)
+
+    def split_number(self, number):
+        slice_rescale = int(number / int(np.sum(self.get_slice_lengths())))
+        return slice_rescale*self.get_slice_lengths()
@@ -5,6 +5,7 @@ from __future__ import print_function
 import six.moves.queue as queue
 import threading
 from collections import namedtuple
+import numpy as np


 class PartialRollout(object):
@@ -226,6 +227,10 @@ def _env_runner(env, policy, num_local_steps, horizon, obs_filter):
            if length >= horizon:
                terminal = True

+            # Concatenate multiagent actions
+            if isinstance(action, list):
+                action = np.concatenate(action, axis=0).flatten()
+
            # Collect the experience.
            rollout.add(observations=last_observation,
                        actions=action,