Multiagent model using concatenated observations (#1416)

* working multi action distribution and multiagent model

* currently working but the splits arent done in the right place

* added shared models

* added categorical support and mountain car example

* now compatible with generalized advantage estimation

* working multiagent code with discrete and continuous example

* moved reshaper to utils

* code review changes made, ppo action placeholder moved to model catalog, all multiagent code moved out of fcnet

* added examples in

* added PEP8 compliance

* examples are mostly pep8 compliant

* removed all flake errors

* added examples to jenkins tests

* fixed custom options bug

* added lines to let docker file find multiagent tests

* shortened example run length

* corrected nits

* fixed flake errors
This commit is contained in:
eugenevinitsky
2018-01-18 19:51:31 -08:00
committed by Eric Liang
parent 215d526e0d
commit 37076a9ff8
16 changed files with 445 additions and 20 deletions
+5
View File
@@ -0,0 +1,5 @@
# flake8: noqa
from ray.rllib.examples.multiagent_mountaincar_env \
import MultiAgentMountainCarEnv
from ray.rllib.examples.multiagent_pendulum_env \
import MultiAgentPendulumEnv
@@ -0,0 +1,56 @@
""" Multiagent mountain car. Each agent outputs an action which
is summed to form the total action. This is a discrete
multiagent example
"""
import gym
from gym.envs.registration import register
import ray
import ray.rllib.ppo as ppo
from ray.tune.registry import get_registry, register_env
env_name = "MultiAgentMountainCarEnv"
env_version_num = 0
env_name = env_name + '-v' + str(env_version_num)
def pass_params_to_gym(env_name):
global env_version_num
register(
id=env_name,
entry_point='ray.rllib.examples:' + "MultiAgentMountainCarEnv",
max_episode_steps=200,
kwargs={}
)
def create_env(env_config):
pass_params_to_gym(env_name)
env = gym.envs.make(env_name)
return env
if __name__ == '__main__':
register_env(env_name, lambda env_config: create_env(env_config))
config = ppo.DEFAULT_CONFIG.copy()
horizon = 200
num_cpus = 2
ray.init(num_cpus=num_cpus, redirect_output=False)
config["num_workers"] = num_cpus
config["timesteps_per_batch"] = 100
config["num_sgd_iter"] = 10
config["gamma"] = 0.999
config["horizon"] = horizon
config["use_gae"] = True
config["model"].update({"fcnet_hiddens": [256, 256]})
options = {"multiagent_obs_shapes": [2, 2],
"multiagent_act_shapes": [3, 3],
"multiagent_shared_model": False,
"multiagent_fcnet_hiddens": [[32, 32]] * 2}
config["model"].update({"custom_options": options})
alg = ppo.PPOAgent(env=env_name, registry=get_registry(), config=config)
for i in range(1):
alg.train()
@@ -0,0 +1,52 @@
import math
from gym.spaces import Box, Tuple, Discrete
import numpy as np
from gym.envs.classic_control.mountain_car import MountainCarEnv
"""
Multiagent mountain car that sums and then
averages its actions to produce the velocity
"""
class MultiAgentMountainCarEnv(MountainCarEnv):
def __init__(self):
self.min_position = -1.2
self.max_position = 0.6
self.max_speed = 0.07
self.goal_position = 0.5
self.low = np.array([self.min_position, -self.max_speed])
self.high = np.array([self.max_position, self.max_speed])
self.viewer = None
self.action_space = [Discrete(3) for _ in range(2)]
self.observation_space = Tuple(tuple(Box(self.low, self.high)
for _ in range(2)))
self._seed()
self.reset()
def _step(self, action):
summed_act = 0.5 * np.sum(action)
position, velocity = self.state
velocity += (summed_act - 1) * 0.001
velocity += math.cos(3 * position) * (-0.0025)
velocity = np.clip(velocity, -self.max_speed, self.max_speed)
position += velocity
position = np.clip(position, self.min_position, self.max_position)
if (position == self.min_position and velocity < 0):
velocity = 0
done = bool(position >= self.goal_position)
reward = position
self.state = (position, velocity)
return [np.array(self.state) for _ in range(2)], reward, done, {}
def _reset(self):
self.state = np.array([self.np_random.uniform(low=-0.6, high=-0.4), 0])
return [np.array(self.state) for _ in range(2)]
@@ -0,0 +1,56 @@
""" Run script for multiagent pendulum env. Each agent outputs a
torque which is summed to form the total torque. This is a
continuous multiagent example
"""
import gym
from gym.envs.registration import register
import ray
import ray.rllib.ppo as ppo
from ray.tune.registry import get_registry, register_env
env_name = "MultiAgentPendulumEnv"
env_version_num = 0
env_name = env_name + '-v' + str(env_version_num)
def pass_params_to_gym(env_name):
global env_version_num
register(
id=env_name,
entry_point='ray.rllib.examples:' + "MultiAgentPendulumEnv",
max_episode_steps=100,
kwargs={}
)
def create_env(env_config):
pass_params_to_gym(env_name)
env = gym.envs.make(env_name)
return env
if __name__ == '__main__':
register_env(env_name, lambda env_config: create_env(env_config))
config = ppo.DEFAULT_CONFIG.copy()
horizon = 100
num_cpus = 2
ray.init(num_cpus=num_cpus, redirect_output=False)
config["num_workers"] = num_cpus
config["timesteps_per_batch"] = 100
config["num_sgd_iter"] = 10
config["gamma"] = 0.999
config["horizon"] = horizon
config["use_gae"] = True
config["model"].update({"fcnet_hiddens": [256, 256]})
options = {"multiagent_obs_shapes": [3, 3],
"multiagent_act_shapes": [1, 1],
"multiagent_shared_model": True,
"multiagent_fcnet_hiddens": [[32, 32]] * 2}
config["model"].update({"custom_options": options})
alg = ppo.PPOAgent(env=env_name, registry=get_registry(), config=config)
for i in range(1):
alg.train()
@@ -0,0 +1,70 @@
from gym.spaces import Box, Tuple
from gym.utils import seeding
from gym.envs.classic_control.pendulum import PendulumEnv
import numpy as np
"""
Multiagent pendulum that sums its torques to generate an action
"""
class MultiAgentPendulumEnv(PendulumEnv):
metadata = {
'render.modes': ['human', 'rgb_array'],
'video.frames_per_second': 30
}
def __init__(self):
self.max_speed = 8
self.max_torque = 2.
self.dt = .05
self.viewer = None
high = np.array([1., 1., self.max_speed])
self.action_space = [Box(low=-self.max_torque / 2,
high=self.max_torque / 2, shape=(1,))
for _ in range(2)]
self.observation_space = Tuple(tuple(Box(low=-high, high=high)
for _ in range(2)))
self._seed()
def _seed(self, seed=None):
self.np_random, seed = seeding.np_random(seed)
return [seed]
def _step(self, u):
th, thdot = self.state # th := theta
summed_u = np.sum(u)
g = 10.
m = 1.
length = 1.
dt = self.dt
summed_u = np.clip(summed_u, -self.max_torque, self.max_torque)
self.last_u = summed_u # for rendering
costs = self.angle_normalize(th) ** 2 + .1 * thdot ** 2 + \
.001 * (summed_u ** 2)
newthdot = thdot + (-3 * g / (2 * length) * np.sin(th + np.pi) +
3. / (m * length ** 2) * summed_u) * dt
newth = th + newthdot * dt
newthdot = np.clip(newthdot, -self.max_speed, self.max_speed)
self.state = np.array([newth, newthdot])
return self._get_obs(), -costs, False, {}
def _reset(self):
high = np.array([np.pi, 1])
self.state = self.np_random.uniform(low=-high, high=high)
self.last_u = None
return self._get_obs()
def _get_obs(self):
theta, thetadot = self.state
return [np.array([np.cos(theta), np.sin(theta), thetadot])
for _ in range(2)]
def angle_normalize(self, x):
return (((x + np.pi) % (2 * np.pi)) - np.pi)
+3 -1
View File
@@ -5,8 +5,10 @@ from ray.rllib.models.model import Model
from ray.rllib.models.fcnet import FullyConnectedNetwork
from ray.rllib.models.convnet import ConvolutionalNetwork
from ray.rllib.models.lstm import LSTM
from ray.rllib.models.multiagentfcnet import MultiAgentFullyConnectedNetwork
__all__ = ["ActionDistribution", "ActionDistribution", "Categorical",
"DiagGaussian", "Deterministic", "ModelCatalog", "Model",
"FullyConnectedNetwork", "ConvolutionalNetwork", "LSTM"]
"FullyConnectedNetwork", "ConvolutionalNetwork", "LSTM",
"MultiAgentFullyConnectedNetwork"]
+47
View File
@@ -4,6 +4,7 @@ from __future__ import print_function
import tensorflow as tf
import numpy as np
from ray.rllib.utils.reshaper import Reshaper
class ActionDistribution(object):
@@ -109,3 +110,49 @@ class Deterministic(ActionDistribution):
def sample(self):
return self.inputs
class MultiActionDistribution(ActionDistribution):
"""Action distribution that operates for list of actions.
Args:
inputs (Tensor list): A list of tensors from which to compute samples.
"""
def __init__(self, inputs, action_space, child_distributions):
# you actually have to instantiate the child distributions
self.reshaper = Reshaper(action_space)
split_inputs = self.reshaper.split_tensor(inputs)
child_list = []
for i, distribution in enumerate(child_distributions):
child_list.append(distribution(split_inputs[i]))
self.child_distributions = child_list
def logp(self, x):
"""The log-likelihood of the action distribution."""
split_list = self.reshaper.split_tensor(x)
for i, distribution in enumerate(self.child_distributions):
# Remove extra categorical dimension
if isinstance(distribution, Categorical):
split_list[i] = tf.squeeze(split_list[i], axis=-1)
log_list = np.asarray([distribution.logp(split_x) for
distribution, split_x in
zip(self.child_distributions, split_list)])
return np.sum(log_list)
def kl(self, other):
"""The KL-divergence between two action distributions."""
kl_list = np.asarray([distribution.kl(other_distribution) for
distribution, other_distribution in
zip(self.child_distributions,
other.child_distributions)])
return np.sum(kl_list)
def entropy(self):
"""The entropy of the action distribution."""
entropy_list = np.array([s.entropy() for s in
self.child_distributions])
return np.sum(entropy_list)
def sample(self):
"""Draw a sample from the action distribution."""
return [[s.sample() for s in self.child_distributions]]
+50 -2
View File
@@ -3,15 +3,19 @@ from __future__ import division
from __future__ import print_function
import gym
import numpy as np
import tensorflow as tf
from functools import partial
from ray.tune.registry import RLLIB_MODEL, RLLIB_PREPROCESSOR, \
_default_registry
from ray.rllib.models.action_dist import (
Categorical, Deterministic, DiagGaussian)
Categorical, Deterministic, DiagGaussian, MultiActionDistribution)
from ray.rllib.models.preprocessors import get_preprocessor
from ray.rllib.models.fcnet import FullyConnectedNetwork
from ray.rllib.models.visionnet import VisionNetwork
from ray.rllib.models.multiagentfcnet import MultiAgentFullyConnectedNetwork
MODEL_CONFIGS = [
@@ -66,10 +70,49 @@ class ModelCatalog(object):
return Deterministic, action_space.shape[0]
elif isinstance(action_space, gym.spaces.Discrete):
return Categorical, action_space.n
elif isinstance(action_space, list):
size = 0
child_dist = []
for action in action_space:
dist, action_size = ModelCatalog.get_action_dist(action)
child_dist.append(dist)
size += action_size
return partial(MultiActionDistribution,
child_distributions=child_dist,
action_space=action_space), size
raise NotImplementedError(
"Unsupported args: {} {}".format(action_space, dist_type))
@staticmethod
def get_action_placeholder(action_space):
"""Returns an action placeholder that is consistent with the action space
Args:
action_space (Space): Action space of the target gym env.
Returns:
action_placeholder (Tensor): A placeholder for the actions
"""
if isinstance(action_space, gym.spaces.Box):
return tf.placeholder(
tf.float32, shape=(None, action_space.shape[0]))
elif isinstance(action_space, gym.spaces.Discrete):
return tf.placeholder(tf.int64, shape=(None,))
elif isinstance(action_space, list):
size = 0
for i in range(len(action_space)):
size += np.product(action_space[i].shape)
# TODO(ev) this obviously won't work for mixed spaces
if isinstance(action_space[0], gym.spaces.Discrete):
return tf.placeholder(tf.int64, shape=(None,
len(action_space)))
elif isinstance(action_space[0], gym.spaces.Box):
return tf.placeholder(tf.float32, shape=(None, size))
else:
raise NotImplementedError("action space {}"
" not supported".format(action_space))
@staticmethod
def get_model(registry, inputs, num_outputs, options=dict()):
"""Returns a suitable model conforming to given input and output specs.
@@ -92,6 +135,12 @@ class ModelCatalog(object):
obs_rank = len(inputs.shape) - 1
# num_outputs > 1 used to avoid hitting this with the value function
if isinstance(options.get("custom_options", {}).get(
"multiagent_fcnet_hiddens", 1), list) and num_outputs > 1:
return MultiAgentFullyConnectedNetwork(inputs,
num_outputs, options)
if obs_rank > 1:
return VisionNetwork(inputs, num_outputs, options)
@@ -141,7 +190,6 @@ class ModelCatalog(object):
Returns:
preprocessor (Preprocessor): Preprocessor for the env observations.
"""
for k in options.keys():
if k not in MODEL_CONFIGS:
raise Exception(
+5 -2
View File
@@ -14,6 +14,7 @@ class FullyConnectedNetwork(Model):
def _init(self, inputs, num_outputs, options):
hiddens = options.get("fcnet_hiddens", [256, 256])
fcnet_activation = options.get("fcnet_activation", "tanh")
if fcnet_activation == "tanh":
activation = tf.nn.tanh
@@ -25,14 +26,16 @@ class FullyConnectedNetwork(Model):
i = 1
last_layer = inputs
for size in hiddens:
label = "fc{}".format(i)
last_layer = slim.fully_connected(
last_layer, size,
weights_initializer=normc_initializer(1.0),
activation_fn=activation,
scope="fc{}".format(i))
scope=label)
i += 1
label = "fc_out"
output = slim.fully_connected(
last_layer, num_outputs,
weights_initializer=normc_initializer(0.01),
activation_fn=None, scope="fc_out")
activation_fn=None, scope=label)
return output, last_layer
@@ -0,0 +1,43 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from ray.rllib.models.model import Model
from ray.rllib.models.fcnet import FullyConnectedNetwork
from ray.rllib.models.action_dist import Reshaper
class MultiAgentFullyConnectedNetwork(Model):
"""Multiagent fully connected network."""
def _init(self, inputs, num_outputs, options):
# Split the input and output tensors
input_shapes = options["custom_options"]["multiagent_obs_shapes"]
output_shapes = options["custom_options"]["multiagent_act_shapes"]
input_reshaper = Reshaper(input_shapes)
output_reshaper = Reshaper(output_shapes)
split_inputs = input_reshaper.split_tensor(inputs)
num_actions = output_reshaper.split_number(num_outputs)
custom_options = options["custom_options"]
hiddens = custom_options.get("multiagent_fcnet_hiddens",
[[256, 256]]*1)
# check for a shared model
shared_model = custom_options.get("multiagent_shared_model", 0)
reuse = tf.AUTO_REUSE if shared_model else False
outputs = []
for i in range(len(hiddens)):
with tf.variable_scope("multi{}".format(i), reuse=reuse):
sub_options = options.copy()
sub_options.update({"fcnet_hiddens": hiddens[i]})
# TODO(ev) make this support arbitrary networks
fcnet = FullyConnectedNetwork(
split_inputs[i], int(num_actions[i]), sub_options)
output = fcnet.outputs
outputs.append(output)
overall_output = tf.concat(outputs, axis=1)
return overall_output, outputs
-3
View File
@@ -2,7 +2,6 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import gym.spaces
import tensorflow as tf
from ray.rllib.models import ModelCatalog
@@ -18,8 +17,6 @@ class ProximalPolicyLoss(object):
observations, value_targets, advantages, actions,
prev_logits, prev_vf_preds, logit_dim,
kl_coeff, distribution_class, config, sess, registry):
assert (isinstance(action_space, gym.spaces.Discrete) or
isinstance(action_space, gym.spaces.Box))
self.prev_dist = distribution_class(prev_logits)
# Saved so that we can compute actions given different observations
-1
View File
@@ -220,7 +220,6 @@ class PPOAgent(Agent):
self.local_evaluator.filters, self.remote_evaluators)
res = self._fetch_metrics_from_remote_evaluators()
res = res._replace(info=info)
return res
def _fetch_metrics_from_remote_evaluators(self):
+1 -11
View File
@@ -2,7 +2,6 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import gym.spaces
import pickle
import tensorflow as tf
import os
@@ -68,16 +67,7 @@ class PPOEvaluator(Evaluator):
self.advantages = tf.placeholder(tf.float32, shape=(None,))
action_space = self.env.action_space
# TODO(rliaw): pull this into model_catalog
if isinstance(action_space, gym.spaces.Box):
self.actions = tf.placeholder(
tf.float32, shape=(None, action_space.shape[0]))
elif isinstance(action_space, gym.spaces.Discrete):
self.actions = tf.placeholder(tf.int64, shape=(None,))
else:
raise NotImplemented(
"action space" + str(type(action_space)) +
"currently not supported")
self.actions = ModelCatalog.get_action_placeholder(action_space)
self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist(
action_space)
# Log probabilities from the policy before the policy update.
+46
View File
@@ -0,0 +1,46 @@
import numpy as np
import tensorflow as tf
class Reshaper(object):
"""
This class keeps track of where in the flattened observation space
we should be slicing and what the new shapes should be
"""
def __init__(self, env_space):
self.shapes = []
self.slice_positions = []
self.env_space = env_space
if isinstance(env_space, list):
for space in env_space:
# Handle both gym arrays and just lists of inputs length
if hasattr(space, "shape"):
arr_shape = np.asarray(space.shape)
else:
arr_shape = space
self.shapes.append(arr_shape)
if len(self.slice_positions) == 0:
self.slice_positions.append(np.product(arr_shape))
else:
self.slice_positions.append(np.product(arr_shape) +
self.slice_positions[-1])
else:
self.shapes.append(np.asarray(env_space.shape))
self.slice_positions.append(np.product(env_space.shape))
def get_slice_lengths(self):
diffed_list = np.diff(self.slice_positions).tolist()
diffed_list.insert(0, self.slice_positions[0])
return np.asarray(diffed_list).astype(int)
def split_tensor(self, tensor, axis=-1):
# FIXME (ev) This won't work for mixed action distributions like
# one agent Gaussian one agent discrete
slice_rescale = int(tensor.shape.as_list()[axis] /
int(np.sum(self.get_slice_lengths())))
return tf.split(tensor, slice_rescale*self.get_slice_lengths(),
axis=axis)
def split_number(self, number):
slice_rescale = int(number / int(np.sum(self.get_slice_lengths())))
return slice_rescale*self.get_slice_lengths()
+5
View File
@@ -5,6 +5,7 @@ from __future__ import print_function
import six.moves.queue as queue
import threading
from collections import namedtuple
import numpy as np
class PartialRollout(object):
@@ -226,6 +227,10 @@ def _env_runner(env, policy, num_local_steps, horizon, obs_filter):
if length >= horizon:
terminal = True
# Concatenate multiagent actions
if isinstance(action, list):
action = np.concatenate(action, axis=0).flatten()
# Collect the experience.
rollout.add(observations=last_observation,
actions=action,