mirror of
https://github.com/wassname/ray.git
synced 2026-06-29 05:34:49 +08:00
Multiagent model using concatenated observations (#1416)
* working multi action distribution and multiagent model * currently working but the splits arent done in the right place * added shared models * added categorical support and mountain car example * now compatible with generalized advantage estimation * working multiagent code with discrete and continuous example * moved reshaper to utils * code review changes made, ppo action placeholder moved to model catalog, all multiagent code moved out of fcnet * added examples in * added PEP8 compliance * examples are mostly pep8 compliant * removed all flake errors * added examples to jenkins tests * fixed custom options bug * added lines to let docker file find multiagent tests * shortened example run length * corrected nits * fixed flake errors
This commit is contained in:
committed by
Eric Liang
parent
215d526e0d
commit
37076a9ff8
@@ -0,0 +1,5 @@
|
||||
# flake8: noqa
|
||||
from ray.rllib.examples.multiagent_mountaincar_env \
|
||||
import MultiAgentMountainCarEnv
|
||||
from ray.rllib.examples.multiagent_pendulum_env \
|
||||
import MultiAgentPendulumEnv
|
||||
@@ -0,0 +1,56 @@
|
||||
""" Multiagent mountain car. Each agent outputs an action which
|
||||
is summed to form the total action. This is a discrete
|
||||
multiagent example
|
||||
"""
|
||||
|
||||
import gym
|
||||
from gym.envs.registration import register
|
||||
|
||||
import ray
|
||||
import ray.rllib.ppo as ppo
|
||||
from ray.tune.registry import get_registry, register_env
|
||||
|
||||
env_name = "MultiAgentMountainCarEnv"
|
||||
|
||||
env_version_num = 0
|
||||
env_name = env_name + '-v' + str(env_version_num)
|
||||
|
||||
|
||||
def pass_params_to_gym(env_name):
|
||||
global env_version_num
|
||||
|
||||
register(
|
||||
id=env_name,
|
||||
entry_point='ray.rllib.examples:' + "MultiAgentMountainCarEnv",
|
||||
max_episode_steps=200,
|
||||
kwargs={}
|
||||
)
|
||||
|
||||
|
||||
def create_env(env_config):
|
||||
pass_params_to_gym(env_name)
|
||||
env = gym.envs.make(env_name)
|
||||
return env
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
register_env(env_name, lambda env_config: create_env(env_config))
|
||||
config = ppo.DEFAULT_CONFIG.copy()
|
||||
horizon = 200
|
||||
num_cpus = 2
|
||||
ray.init(num_cpus=num_cpus, redirect_output=False)
|
||||
config["num_workers"] = num_cpus
|
||||
config["timesteps_per_batch"] = 100
|
||||
config["num_sgd_iter"] = 10
|
||||
config["gamma"] = 0.999
|
||||
config["horizon"] = horizon
|
||||
config["use_gae"] = True
|
||||
config["model"].update({"fcnet_hiddens": [256, 256]})
|
||||
options = {"multiagent_obs_shapes": [2, 2],
|
||||
"multiagent_act_shapes": [3, 3],
|
||||
"multiagent_shared_model": False,
|
||||
"multiagent_fcnet_hiddens": [[32, 32]] * 2}
|
||||
config["model"].update({"custom_options": options})
|
||||
alg = ppo.PPOAgent(env=env_name, registry=get_registry(), config=config)
|
||||
for i in range(1):
|
||||
alg.train()
|
||||
@@ -0,0 +1,52 @@
|
||||
import math
|
||||
from gym.spaces import Box, Tuple, Discrete
|
||||
import numpy as np
|
||||
from gym.envs.classic_control.mountain_car import MountainCarEnv
|
||||
|
||||
"""
|
||||
Multiagent mountain car that sums and then
|
||||
averages its actions to produce the velocity
|
||||
"""
|
||||
|
||||
|
||||
class MultiAgentMountainCarEnv(MountainCarEnv):
|
||||
def __init__(self):
|
||||
self.min_position = -1.2
|
||||
self.max_position = 0.6
|
||||
self.max_speed = 0.07
|
||||
self.goal_position = 0.5
|
||||
|
||||
self.low = np.array([self.min_position, -self.max_speed])
|
||||
self.high = np.array([self.max_position, self.max_speed])
|
||||
|
||||
self.viewer = None
|
||||
|
||||
self.action_space = [Discrete(3) for _ in range(2)]
|
||||
self.observation_space = Tuple(tuple(Box(self.low, self.high)
|
||||
for _ in range(2)))
|
||||
|
||||
self._seed()
|
||||
self.reset()
|
||||
|
||||
def _step(self, action):
|
||||
summed_act = 0.5 * np.sum(action)
|
||||
|
||||
position, velocity = self.state
|
||||
velocity += (summed_act - 1) * 0.001
|
||||
velocity += math.cos(3 * position) * (-0.0025)
|
||||
velocity = np.clip(velocity, -self.max_speed, self.max_speed)
|
||||
position += velocity
|
||||
position = np.clip(position, self.min_position, self.max_position)
|
||||
if (position == self.min_position and velocity < 0):
|
||||
velocity = 0
|
||||
|
||||
done = bool(position >= self.goal_position)
|
||||
|
||||
reward = position
|
||||
|
||||
self.state = (position, velocity)
|
||||
return [np.array(self.state) for _ in range(2)], reward, done, {}
|
||||
|
||||
def _reset(self):
|
||||
self.state = np.array([self.np_random.uniform(low=-0.6, high=-0.4), 0])
|
||||
return [np.array(self.state) for _ in range(2)]
|
||||
@@ -0,0 +1,56 @@
|
||||
""" Run script for multiagent pendulum env. Each agent outputs a
|
||||
torque which is summed to form the total torque. This is a
|
||||
continuous multiagent example
|
||||
"""
|
||||
|
||||
import gym
|
||||
from gym.envs.registration import register
|
||||
|
||||
import ray
|
||||
import ray.rllib.ppo as ppo
|
||||
from ray.tune.registry import get_registry, register_env
|
||||
|
||||
env_name = "MultiAgentPendulumEnv"
|
||||
|
||||
env_version_num = 0
|
||||
env_name = env_name + '-v' + str(env_version_num)
|
||||
|
||||
|
||||
def pass_params_to_gym(env_name):
|
||||
global env_version_num
|
||||
|
||||
register(
|
||||
id=env_name,
|
||||
entry_point='ray.rllib.examples:' + "MultiAgentPendulumEnv",
|
||||
max_episode_steps=100,
|
||||
kwargs={}
|
||||
)
|
||||
|
||||
|
||||
def create_env(env_config):
|
||||
pass_params_to_gym(env_name)
|
||||
env = gym.envs.make(env_name)
|
||||
return env
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
register_env(env_name, lambda env_config: create_env(env_config))
|
||||
config = ppo.DEFAULT_CONFIG.copy()
|
||||
horizon = 100
|
||||
num_cpus = 2
|
||||
ray.init(num_cpus=num_cpus, redirect_output=False)
|
||||
config["num_workers"] = num_cpus
|
||||
config["timesteps_per_batch"] = 100
|
||||
config["num_sgd_iter"] = 10
|
||||
config["gamma"] = 0.999
|
||||
config["horizon"] = horizon
|
||||
config["use_gae"] = True
|
||||
config["model"].update({"fcnet_hiddens": [256, 256]})
|
||||
options = {"multiagent_obs_shapes": [3, 3],
|
||||
"multiagent_act_shapes": [1, 1],
|
||||
"multiagent_shared_model": True,
|
||||
"multiagent_fcnet_hiddens": [[32, 32]] * 2}
|
||||
config["model"].update({"custom_options": options})
|
||||
alg = ppo.PPOAgent(env=env_name, registry=get_registry(), config=config)
|
||||
for i in range(1):
|
||||
alg.train()
|
||||
@@ -0,0 +1,70 @@
|
||||
from gym.spaces import Box, Tuple
|
||||
from gym.utils import seeding
|
||||
from gym.envs.classic_control.pendulum import PendulumEnv
|
||||
import numpy as np
|
||||
|
||||
"""
|
||||
Multiagent pendulum that sums its torques to generate an action
|
||||
"""
|
||||
|
||||
|
||||
class MultiAgentPendulumEnv(PendulumEnv):
|
||||
metadata = {
|
||||
'render.modes': ['human', 'rgb_array'],
|
||||
'video.frames_per_second': 30
|
||||
}
|
||||
|
||||
def __init__(self):
|
||||
self.max_speed = 8
|
||||
self.max_torque = 2.
|
||||
self.dt = .05
|
||||
self.viewer = None
|
||||
|
||||
high = np.array([1., 1., self.max_speed])
|
||||
self.action_space = [Box(low=-self.max_torque / 2,
|
||||
high=self.max_torque / 2, shape=(1,))
|
||||
for _ in range(2)]
|
||||
self.observation_space = Tuple(tuple(Box(low=-high, high=high)
|
||||
for _ in range(2)))
|
||||
|
||||
self._seed()
|
||||
|
||||
def _seed(self, seed=None):
|
||||
self.np_random, seed = seeding.np_random(seed)
|
||||
return [seed]
|
||||
|
||||
def _step(self, u):
|
||||
th, thdot = self.state # th := theta
|
||||
|
||||
summed_u = np.sum(u)
|
||||
g = 10.
|
||||
m = 1.
|
||||
length = 1.
|
||||
dt = self.dt
|
||||
|
||||
summed_u = np.clip(summed_u, -self.max_torque, self.max_torque)
|
||||
self.last_u = summed_u # for rendering
|
||||
costs = self.angle_normalize(th) ** 2 + .1 * thdot ** 2 + \
|
||||
.001 * (summed_u ** 2)
|
||||
|
||||
newthdot = thdot + (-3 * g / (2 * length) * np.sin(th + np.pi) +
|
||||
3. / (m * length ** 2) * summed_u) * dt
|
||||
newth = th + newthdot * dt
|
||||
newthdot = np.clip(newthdot, -self.max_speed, self.max_speed)
|
||||
|
||||
self.state = np.array([newth, newthdot])
|
||||
return self._get_obs(), -costs, False, {}
|
||||
|
||||
def _reset(self):
|
||||
high = np.array([np.pi, 1])
|
||||
self.state = self.np_random.uniform(low=-high, high=high)
|
||||
self.last_u = None
|
||||
return self._get_obs()
|
||||
|
||||
def _get_obs(self):
|
||||
theta, thetadot = self.state
|
||||
return [np.array([np.cos(theta), np.sin(theta), thetadot])
|
||||
for _ in range(2)]
|
||||
|
||||
def angle_normalize(self, x):
|
||||
return (((x + np.pi) % (2 * np.pi)) - np.pi)
|
||||
@@ -5,8 +5,10 @@ from ray.rllib.models.model import Model
|
||||
from ray.rllib.models.fcnet import FullyConnectedNetwork
|
||||
from ray.rllib.models.convnet import ConvolutionalNetwork
|
||||
from ray.rllib.models.lstm import LSTM
|
||||
from ray.rllib.models.multiagentfcnet import MultiAgentFullyConnectedNetwork
|
||||
|
||||
|
||||
__all__ = ["ActionDistribution", "ActionDistribution", "Categorical",
|
||||
"DiagGaussian", "Deterministic", "ModelCatalog", "Model",
|
||||
"FullyConnectedNetwork", "ConvolutionalNetwork", "LSTM"]
|
||||
"FullyConnectedNetwork", "ConvolutionalNetwork", "LSTM",
|
||||
"MultiAgentFullyConnectedNetwork"]
|
||||
|
||||
@@ -4,6 +4,7 @@ from __future__ import print_function
|
||||
|
||||
import tensorflow as tf
|
||||
import numpy as np
|
||||
from ray.rllib.utils.reshaper import Reshaper
|
||||
|
||||
|
||||
class ActionDistribution(object):
|
||||
@@ -109,3 +110,49 @@ class Deterministic(ActionDistribution):
|
||||
|
||||
def sample(self):
|
||||
return self.inputs
|
||||
|
||||
|
||||
class MultiActionDistribution(ActionDistribution):
|
||||
"""Action distribution that operates for list of actions.
|
||||
|
||||
Args:
|
||||
inputs (Tensor list): A list of tensors from which to compute samples.
|
||||
"""
|
||||
def __init__(self, inputs, action_space, child_distributions):
|
||||
# you actually have to instantiate the child distributions
|
||||
self.reshaper = Reshaper(action_space)
|
||||
split_inputs = self.reshaper.split_tensor(inputs)
|
||||
child_list = []
|
||||
for i, distribution in enumerate(child_distributions):
|
||||
child_list.append(distribution(split_inputs[i]))
|
||||
self.child_distributions = child_list
|
||||
|
||||
def logp(self, x):
|
||||
"""The log-likelihood of the action distribution."""
|
||||
split_list = self.reshaper.split_tensor(x)
|
||||
for i, distribution in enumerate(self.child_distributions):
|
||||
# Remove extra categorical dimension
|
||||
if isinstance(distribution, Categorical):
|
||||
split_list[i] = tf.squeeze(split_list[i], axis=-1)
|
||||
log_list = np.asarray([distribution.logp(split_x) for
|
||||
distribution, split_x in
|
||||
zip(self.child_distributions, split_list)])
|
||||
return np.sum(log_list)
|
||||
|
||||
def kl(self, other):
|
||||
"""The KL-divergence between two action distributions."""
|
||||
kl_list = np.asarray([distribution.kl(other_distribution) for
|
||||
distribution, other_distribution in
|
||||
zip(self.child_distributions,
|
||||
other.child_distributions)])
|
||||
return np.sum(kl_list)
|
||||
|
||||
def entropy(self):
|
||||
"""The entropy of the action distribution."""
|
||||
entropy_list = np.array([s.entropy() for s in
|
||||
self.child_distributions])
|
||||
return np.sum(entropy_list)
|
||||
|
||||
def sample(self):
|
||||
"""Draw a sample from the action distribution."""
|
||||
return [[s.sample() for s in self.child_distributions]]
|
||||
|
||||
@@ -3,15 +3,19 @@ from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import gym
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
from functools import partial
|
||||
|
||||
from ray.tune.registry import RLLIB_MODEL, RLLIB_PREPROCESSOR, \
|
||||
_default_registry
|
||||
|
||||
from ray.rllib.models.action_dist import (
|
||||
Categorical, Deterministic, DiagGaussian)
|
||||
Categorical, Deterministic, DiagGaussian, MultiActionDistribution)
|
||||
from ray.rllib.models.preprocessors import get_preprocessor
|
||||
from ray.rllib.models.fcnet import FullyConnectedNetwork
|
||||
from ray.rllib.models.visionnet import VisionNetwork
|
||||
from ray.rllib.models.multiagentfcnet import MultiAgentFullyConnectedNetwork
|
||||
|
||||
|
||||
MODEL_CONFIGS = [
|
||||
@@ -66,10 +70,49 @@ class ModelCatalog(object):
|
||||
return Deterministic, action_space.shape[0]
|
||||
elif isinstance(action_space, gym.spaces.Discrete):
|
||||
return Categorical, action_space.n
|
||||
elif isinstance(action_space, list):
|
||||
size = 0
|
||||
child_dist = []
|
||||
for action in action_space:
|
||||
dist, action_size = ModelCatalog.get_action_dist(action)
|
||||
child_dist.append(dist)
|
||||
size += action_size
|
||||
return partial(MultiActionDistribution,
|
||||
child_distributions=child_dist,
|
||||
action_space=action_space), size
|
||||
|
||||
raise NotImplementedError(
|
||||
"Unsupported args: {} {}".format(action_space, dist_type))
|
||||
|
||||
@staticmethod
|
||||
def get_action_placeholder(action_space):
|
||||
"""Returns an action placeholder that is consistent with the action space
|
||||
|
||||
Args:
|
||||
action_space (Space): Action space of the target gym env.
|
||||
Returns:
|
||||
action_placeholder (Tensor): A placeholder for the actions
|
||||
"""
|
||||
|
||||
if isinstance(action_space, gym.spaces.Box):
|
||||
return tf.placeholder(
|
||||
tf.float32, shape=(None, action_space.shape[0]))
|
||||
elif isinstance(action_space, gym.spaces.Discrete):
|
||||
return tf.placeholder(tf.int64, shape=(None,))
|
||||
elif isinstance(action_space, list):
|
||||
size = 0
|
||||
for i in range(len(action_space)):
|
||||
size += np.product(action_space[i].shape)
|
||||
# TODO(ev) this obviously won't work for mixed spaces
|
||||
if isinstance(action_space[0], gym.spaces.Discrete):
|
||||
return tf.placeholder(tf.int64, shape=(None,
|
||||
len(action_space)))
|
||||
elif isinstance(action_space[0], gym.spaces.Box):
|
||||
return tf.placeholder(tf.float32, shape=(None, size))
|
||||
else:
|
||||
raise NotImplementedError("action space {}"
|
||||
" not supported".format(action_space))
|
||||
|
||||
@staticmethod
|
||||
def get_model(registry, inputs, num_outputs, options=dict()):
|
||||
"""Returns a suitable model conforming to given input and output specs.
|
||||
@@ -92,6 +135,12 @@ class ModelCatalog(object):
|
||||
|
||||
obs_rank = len(inputs.shape) - 1
|
||||
|
||||
# num_outputs > 1 used to avoid hitting this with the value function
|
||||
if isinstance(options.get("custom_options", {}).get(
|
||||
"multiagent_fcnet_hiddens", 1), list) and num_outputs > 1:
|
||||
return MultiAgentFullyConnectedNetwork(inputs,
|
||||
num_outputs, options)
|
||||
|
||||
if obs_rank > 1:
|
||||
return VisionNetwork(inputs, num_outputs, options)
|
||||
|
||||
@@ -141,7 +190,6 @@ class ModelCatalog(object):
|
||||
Returns:
|
||||
preprocessor (Preprocessor): Preprocessor for the env observations.
|
||||
"""
|
||||
|
||||
for k in options.keys():
|
||||
if k not in MODEL_CONFIGS:
|
||||
raise Exception(
|
||||
|
||||
@@ -14,6 +14,7 @@ class FullyConnectedNetwork(Model):
|
||||
|
||||
def _init(self, inputs, num_outputs, options):
|
||||
hiddens = options.get("fcnet_hiddens", [256, 256])
|
||||
|
||||
fcnet_activation = options.get("fcnet_activation", "tanh")
|
||||
if fcnet_activation == "tanh":
|
||||
activation = tf.nn.tanh
|
||||
@@ -25,14 +26,16 @@ class FullyConnectedNetwork(Model):
|
||||
i = 1
|
||||
last_layer = inputs
|
||||
for size in hiddens:
|
||||
label = "fc{}".format(i)
|
||||
last_layer = slim.fully_connected(
|
||||
last_layer, size,
|
||||
weights_initializer=normc_initializer(1.0),
|
||||
activation_fn=activation,
|
||||
scope="fc{}".format(i))
|
||||
scope=label)
|
||||
i += 1
|
||||
label = "fc_out"
|
||||
output = slim.fully_connected(
|
||||
last_layer, num_outputs,
|
||||
weights_initializer=normc_initializer(0.01),
|
||||
activation_fn=None, scope="fc_out")
|
||||
activation_fn=None, scope=label)
|
||||
return output, last_layer
|
||||
|
||||
@@ -0,0 +1,43 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
from ray.rllib.models.model import Model
|
||||
from ray.rllib.models.fcnet import FullyConnectedNetwork
|
||||
from ray.rllib.models.action_dist import Reshaper
|
||||
|
||||
|
||||
class MultiAgentFullyConnectedNetwork(Model):
|
||||
"""Multiagent fully connected network."""
|
||||
|
||||
def _init(self, inputs, num_outputs, options):
|
||||
|
||||
# Split the input and output tensors
|
||||
input_shapes = options["custom_options"]["multiagent_obs_shapes"]
|
||||
output_shapes = options["custom_options"]["multiagent_act_shapes"]
|
||||
input_reshaper = Reshaper(input_shapes)
|
||||
output_reshaper = Reshaper(output_shapes)
|
||||
split_inputs = input_reshaper.split_tensor(inputs)
|
||||
num_actions = output_reshaper.split_number(num_outputs)
|
||||
|
||||
custom_options = options["custom_options"]
|
||||
hiddens = custom_options.get("multiagent_fcnet_hiddens",
|
||||
[[256, 256]]*1)
|
||||
|
||||
# check for a shared model
|
||||
shared_model = custom_options.get("multiagent_shared_model", 0)
|
||||
reuse = tf.AUTO_REUSE if shared_model else False
|
||||
outputs = []
|
||||
for i in range(len(hiddens)):
|
||||
with tf.variable_scope("multi{}".format(i), reuse=reuse):
|
||||
sub_options = options.copy()
|
||||
sub_options.update({"fcnet_hiddens": hiddens[i]})
|
||||
# TODO(ev) make this support arbitrary networks
|
||||
fcnet = FullyConnectedNetwork(
|
||||
split_inputs[i], int(num_actions[i]), sub_options)
|
||||
output = fcnet.outputs
|
||||
outputs.append(output)
|
||||
overall_output = tf.concat(outputs, axis=1)
|
||||
return overall_output, outputs
|
||||
@@ -2,7 +2,6 @@ from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import gym.spaces
|
||||
import tensorflow as tf
|
||||
|
||||
from ray.rllib.models import ModelCatalog
|
||||
@@ -18,8 +17,6 @@ class ProximalPolicyLoss(object):
|
||||
observations, value_targets, advantages, actions,
|
||||
prev_logits, prev_vf_preds, logit_dim,
|
||||
kl_coeff, distribution_class, config, sess, registry):
|
||||
assert (isinstance(action_space, gym.spaces.Discrete) or
|
||||
isinstance(action_space, gym.spaces.Box))
|
||||
self.prev_dist = distribution_class(prev_logits)
|
||||
|
||||
# Saved so that we can compute actions given different observations
|
||||
|
||||
@@ -220,7 +220,6 @@ class PPOAgent(Agent):
|
||||
self.local_evaluator.filters, self.remote_evaluators)
|
||||
res = self._fetch_metrics_from_remote_evaluators()
|
||||
res = res._replace(info=info)
|
||||
|
||||
return res
|
||||
|
||||
def _fetch_metrics_from_remote_evaluators(self):
|
||||
|
||||
@@ -2,7 +2,6 @@ from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import gym.spaces
|
||||
import pickle
|
||||
import tensorflow as tf
|
||||
import os
|
||||
@@ -68,16 +67,7 @@ class PPOEvaluator(Evaluator):
|
||||
self.advantages = tf.placeholder(tf.float32, shape=(None,))
|
||||
|
||||
action_space = self.env.action_space
|
||||
# TODO(rliaw): pull this into model_catalog
|
||||
if isinstance(action_space, gym.spaces.Box):
|
||||
self.actions = tf.placeholder(
|
||||
tf.float32, shape=(None, action_space.shape[0]))
|
||||
elif isinstance(action_space, gym.spaces.Discrete):
|
||||
self.actions = tf.placeholder(tf.int64, shape=(None,))
|
||||
else:
|
||||
raise NotImplemented(
|
||||
"action space" + str(type(action_space)) +
|
||||
"currently not supported")
|
||||
self.actions = ModelCatalog.get_action_placeholder(action_space)
|
||||
self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist(
|
||||
action_space)
|
||||
# Log probabilities from the policy before the policy update.
|
||||
|
||||
@@ -0,0 +1,46 @@
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
class Reshaper(object):
|
||||
"""
|
||||
This class keeps track of where in the flattened observation space
|
||||
we should be slicing and what the new shapes should be
|
||||
"""
|
||||
def __init__(self, env_space):
|
||||
self.shapes = []
|
||||
self.slice_positions = []
|
||||
self.env_space = env_space
|
||||
if isinstance(env_space, list):
|
||||
for space in env_space:
|
||||
# Handle both gym arrays and just lists of inputs length
|
||||
if hasattr(space, "shape"):
|
||||
arr_shape = np.asarray(space.shape)
|
||||
else:
|
||||
arr_shape = space
|
||||
self.shapes.append(arr_shape)
|
||||
if len(self.slice_positions) == 0:
|
||||
self.slice_positions.append(np.product(arr_shape))
|
||||
else:
|
||||
self.slice_positions.append(np.product(arr_shape) +
|
||||
self.slice_positions[-1])
|
||||
else:
|
||||
self.shapes.append(np.asarray(env_space.shape))
|
||||
self.slice_positions.append(np.product(env_space.shape))
|
||||
|
||||
def get_slice_lengths(self):
|
||||
diffed_list = np.diff(self.slice_positions).tolist()
|
||||
diffed_list.insert(0, self.slice_positions[0])
|
||||
return np.asarray(diffed_list).astype(int)
|
||||
|
||||
def split_tensor(self, tensor, axis=-1):
|
||||
# FIXME (ev) This won't work for mixed action distributions like
|
||||
# one agent Gaussian one agent discrete
|
||||
slice_rescale = int(tensor.shape.as_list()[axis] /
|
||||
int(np.sum(self.get_slice_lengths())))
|
||||
return tf.split(tensor, slice_rescale*self.get_slice_lengths(),
|
||||
axis=axis)
|
||||
|
||||
def split_number(self, number):
|
||||
slice_rescale = int(number / int(np.sum(self.get_slice_lengths())))
|
||||
return slice_rescale*self.get_slice_lengths()
|
||||
@@ -5,6 +5,7 @@ from __future__ import print_function
|
||||
import six.moves.queue as queue
|
||||
import threading
|
||||
from collections import namedtuple
|
||||
import numpy as np
|
||||
|
||||
|
||||
class PartialRollout(object):
|
||||
@@ -226,6 +227,10 @@ def _env_runner(env, policy, num_local_steps, horizon, obs_filter):
|
||||
if length >= horizon:
|
||||
terminal = True
|
||||
|
||||
# Concatenate multiagent actions
|
||||
if isinstance(action, list):
|
||||
action = np.concatenate(action, axis=0).flatten()
|
||||
|
||||
# Collect the experience.
|
||||
rollout.add(observations=last_observation,
|
||||
actions=action,
|
||||
|
||||
Reference in New Issue
Block a user