diff --git a/python/ray/rllib/examples/__init__.py b/python/ray/rllib/examples/__init__.py new file mode 100644 index 000000000..bcedb9af0 --- /dev/null +++ b/python/ray/rllib/examples/__init__.py @@ -0,0 +1,5 @@ +# flake8: noqa +from ray.rllib.examples.multiagent_mountaincar_env \ + import MultiAgentMountainCarEnv +from ray.rllib.examples.multiagent_pendulum_env \ + import MultiAgentPendulumEnv diff --git a/python/ray/rllib/examples/multiagent_mountaincar.py b/python/ray/rllib/examples/multiagent_mountaincar.py new file mode 100644 index 000000000..f585dde1f --- /dev/null +++ b/python/ray/rllib/examples/multiagent_mountaincar.py @@ -0,0 +1,56 @@ +""" Multiagent mountain car. Each agent outputs an action which +is summed to form the total action. This is a discrete +multiagent example +""" + +import gym +from gym.envs.registration import register + +import ray +import ray.rllib.ppo as ppo +from ray.tune.registry import get_registry, register_env + +env_name = "MultiAgentMountainCarEnv" + +env_version_num = 0 +env_name = env_name + '-v' + str(env_version_num) + + +def pass_params_to_gym(env_name): + global env_version_num + + register( + id=env_name, + entry_point='ray.rllib.examples:' + "MultiAgentMountainCarEnv", + max_episode_steps=200, + kwargs={} + ) + + +def create_env(env_config): + pass_params_to_gym(env_name) + env = gym.envs.make(env_name) + return env + + +if __name__ == '__main__': + register_env(env_name, lambda env_config: create_env(env_config)) + config = ppo.DEFAULT_CONFIG.copy() + horizon = 200 + num_cpus = 2 + ray.init(num_cpus=num_cpus, redirect_output=False) + config["num_workers"] = num_cpus + config["timesteps_per_batch"] = 100 + config["num_sgd_iter"] = 10 + config["gamma"] = 0.999 + config["horizon"] = horizon + config["use_gae"] = True + config["model"].update({"fcnet_hiddens": [256, 256]}) + options = {"multiagent_obs_shapes": [2, 2], + "multiagent_act_shapes": [3, 3], + "multiagent_shared_model": False, + "multiagent_fcnet_hiddens": [[32, 32]] * 2} + config["model"].update({"custom_options": options}) + alg = ppo.PPOAgent(env=env_name, registry=get_registry(), config=config) + for i in range(1): + alg.train() diff --git a/python/ray/rllib/examples/multiagent_mountaincar_env.py b/python/ray/rllib/examples/multiagent_mountaincar_env.py new file mode 100644 index 000000000..3018aa7f3 --- /dev/null +++ b/python/ray/rllib/examples/multiagent_mountaincar_env.py @@ -0,0 +1,52 @@ +import math +from gym.spaces import Box, Tuple, Discrete +import numpy as np +from gym.envs.classic_control.mountain_car import MountainCarEnv + +""" +Multiagent mountain car that sums and then +averages its actions to produce the velocity +""" + + +class MultiAgentMountainCarEnv(MountainCarEnv): + def __init__(self): + self.min_position = -1.2 + self.max_position = 0.6 + self.max_speed = 0.07 + self.goal_position = 0.5 + + self.low = np.array([self.min_position, -self.max_speed]) + self.high = np.array([self.max_position, self.max_speed]) + + self.viewer = None + + self.action_space = [Discrete(3) for _ in range(2)] + self.observation_space = Tuple(tuple(Box(self.low, self.high) + for _ in range(2))) + + self._seed() + self.reset() + + def _step(self, action): + summed_act = 0.5 * np.sum(action) + + position, velocity = self.state + velocity += (summed_act - 1) * 0.001 + velocity += math.cos(3 * position) * (-0.0025) + velocity = np.clip(velocity, -self.max_speed, self.max_speed) + position += velocity + position = np.clip(position, self.min_position, self.max_position) + if (position == self.min_position and velocity < 0): + velocity = 0 + + done = bool(position >= self.goal_position) + + reward = position + + self.state = (position, velocity) + return [np.array(self.state) for _ in range(2)], reward, done, {} + + def _reset(self): + self.state = np.array([self.np_random.uniform(low=-0.6, high=-0.4), 0]) + return [np.array(self.state) for _ in range(2)] diff --git a/python/ray/rllib/examples/multiagent_pendulum.py b/python/ray/rllib/examples/multiagent_pendulum.py new file mode 100644 index 000000000..9e629ed07 --- /dev/null +++ b/python/ray/rllib/examples/multiagent_pendulum.py @@ -0,0 +1,56 @@ +""" Run script for multiagent pendulum env. Each agent outputs a +torque which is summed to form the total torque. This is a +continuous multiagent example +""" + +import gym +from gym.envs.registration import register + +import ray +import ray.rllib.ppo as ppo +from ray.tune.registry import get_registry, register_env + +env_name = "MultiAgentPendulumEnv" + +env_version_num = 0 +env_name = env_name + '-v' + str(env_version_num) + + +def pass_params_to_gym(env_name): + global env_version_num + + register( + id=env_name, + entry_point='ray.rllib.examples:' + "MultiAgentPendulumEnv", + max_episode_steps=100, + kwargs={} + ) + + +def create_env(env_config): + pass_params_to_gym(env_name) + env = gym.envs.make(env_name) + return env + + +if __name__ == '__main__': + register_env(env_name, lambda env_config: create_env(env_config)) + config = ppo.DEFAULT_CONFIG.copy() + horizon = 100 + num_cpus = 2 + ray.init(num_cpus=num_cpus, redirect_output=False) + config["num_workers"] = num_cpus + config["timesteps_per_batch"] = 100 + config["num_sgd_iter"] = 10 + config["gamma"] = 0.999 + config["horizon"] = horizon + config["use_gae"] = True + config["model"].update({"fcnet_hiddens": [256, 256]}) + options = {"multiagent_obs_shapes": [3, 3], + "multiagent_act_shapes": [1, 1], + "multiagent_shared_model": True, + "multiagent_fcnet_hiddens": [[32, 32]] * 2} + config["model"].update({"custom_options": options}) + alg = ppo.PPOAgent(env=env_name, registry=get_registry(), config=config) + for i in range(1): + alg.train() diff --git a/python/ray/rllib/examples/multiagent_pendulum_env.py b/python/ray/rllib/examples/multiagent_pendulum_env.py new file mode 100644 index 000000000..da727bcaa --- /dev/null +++ b/python/ray/rllib/examples/multiagent_pendulum_env.py @@ -0,0 +1,70 @@ +from gym.spaces import Box, Tuple +from gym.utils import seeding +from gym.envs.classic_control.pendulum import PendulumEnv +import numpy as np + +""" + Multiagent pendulum that sums its torques to generate an action +""" + + +class MultiAgentPendulumEnv(PendulumEnv): + metadata = { + 'render.modes': ['human', 'rgb_array'], + 'video.frames_per_second': 30 + } + + def __init__(self): + self.max_speed = 8 + self.max_torque = 2. + self.dt = .05 + self.viewer = None + + high = np.array([1., 1., self.max_speed]) + self.action_space = [Box(low=-self.max_torque / 2, + high=self.max_torque / 2, shape=(1,)) + for _ in range(2)] + self.observation_space = Tuple(tuple(Box(low=-high, high=high) + for _ in range(2))) + + self._seed() + + def _seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def _step(self, u): + th, thdot = self.state # th := theta + + summed_u = np.sum(u) + g = 10. + m = 1. + length = 1. + dt = self.dt + + summed_u = np.clip(summed_u, -self.max_torque, self.max_torque) + self.last_u = summed_u # for rendering + costs = self.angle_normalize(th) ** 2 + .1 * thdot ** 2 + \ + .001 * (summed_u ** 2) + + newthdot = thdot + (-3 * g / (2 * length) * np.sin(th + np.pi) + + 3. / (m * length ** 2) * summed_u) * dt + newth = th + newthdot * dt + newthdot = np.clip(newthdot, -self.max_speed, self.max_speed) + + self.state = np.array([newth, newthdot]) + return self._get_obs(), -costs, False, {} + + def _reset(self): + high = np.array([np.pi, 1]) + self.state = self.np_random.uniform(low=-high, high=high) + self.last_u = None + return self._get_obs() + + def _get_obs(self): + theta, thetadot = self.state + return [np.array([np.cos(theta), np.sin(theta), thetadot]) + for _ in range(2)] + + def angle_normalize(self, x): + return (((x + np.pi) % (2 * np.pi)) - np.pi) diff --git a/python/ray/rllib/models/__init__.py b/python/ray/rllib/models/__init__.py index 61e554823..af3ac81dc 100644 --- a/python/ray/rllib/models/__init__.py +++ b/python/ray/rllib/models/__init__.py @@ -5,8 +5,10 @@ from ray.rllib.models.model import Model from ray.rllib.models.fcnet import FullyConnectedNetwork from ray.rllib.models.convnet import ConvolutionalNetwork from ray.rllib.models.lstm import LSTM +from ray.rllib.models.multiagentfcnet import MultiAgentFullyConnectedNetwork __all__ = ["ActionDistribution", "ActionDistribution", "Categorical", "DiagGaussian", "Deterministic", "ModelCatalog", "Model", - "FullyConnectedNetwork", "ConvolutionalNetwork", "LSTM"] + "FullyConnectedNetwork", "ConvolutionalNetwork", "LSTM", + "MultiAgentFullyConnectedNetwork"] diff --git a/python/ray/rllib/models/action_dist.py b/python/ray/rllib/models/action_dist.py index ac6fc671e..459226b94 100644 --- a/python/ray/rllib/models/action_dist.py +++ b/python/ray/rllib/models/action_dist.py @@ -4,6 +4,7 @@ from __future__ import print_function import tensorflow as tf import numpy as np +from ray.rllib.utils.reshaper import Reshaper class ActionDistribution(object): @@ -109,3 +110,49 @@ class Deterministic(ActionDistribution): def sample(self): return self.inputs + + +class MultiActionDistribution(ActionDistribution): + """Action distribution that operates for list of actions. + + Args: + inputs (Tensor list): A list of tensors from which to compute samples. + """ + def __init__(self, inputs, action_space, child_distributions): + # you actually have to instantiate the child distributions + self.reshaper = Reshaper(action_space) + split_inputs = self.reshaper.split_tensor(inputs) + child_list = [] + for i, distribution in enumerate(child_distributions): + child_list.append(distribution(split_inputs[i])) + self.child_distributions = child_list + + def logp(self, x): + """The log-likelihood of the action distribution.""" + split_list = self.reshaper.split_tensor(x) + for i, distribution in enumerate(self.child_distributions): + # Remove extra categorical dimension + if isinstance(distribution, Categorical): + split_list[i] = tf.squeeze(split_list[i], axis=-1) + log_list = np.asarray([distribution.logp(split_x) for + distribution, split_x in + zip(self.child_distributions, split_list)]) + return np.sum(log_list) + + def kl(self, other): + """The KL-divergence between two action distributions.""" + kl_list = np.asarray([distribution.kl(other_distribution) for + distribution, other_distribution in + zip(self.child_distributions, + other.child_distributions)]) + return np.sum(kl_list) + + def entropy(self): + """The entropy of the action distribution.""" + entropy_list = np.array([s.entropy() for s in + self.child_distributions]) + return np.sum(entropy_list) + + def sample(self): + """Draw a sample from the action distribution.""" + return [[s.sample() for s in self.child_distributions]] diff --git a/python/ray/rllib/models/catalog.py b/python/ray/rllib/models/catalog.py index bd490cc0b..4d4f2f3d3 100644 --- a/python/ray/rllib/models/catalog.py +++ b/python/ray/rllib/models/catalog.py @@ -3,15 +3,19 @@ from __future__ import division from __future__ import print_function import gym +import numpy as np +import tensorflow as tf +from functools import partial from ray.tune.registry import RLLIB_MODEL, RLLIB_PREPROCESSOR, \ _default_registry from ray.rllib.models.action_dist import ( - Categorical, Deterministic, DiagGaussian) + Categorical, Deterministic, DiagGaussian, MultiActionDistribution) from ray.rllib.models.preprocessors import get_preprocessor from ray.rllib.models.fcnet import FullyConnectedNetwork from ray.rllib.models.visionnet import VisionNetwork +from ray.rllib.models.multiagentfcnet import MultiAgentFullyConnectedNetwork MODEL_CONFIGS = [ @@ -66,10 +70,49 @@ class ModelCatalog(object): return Deterministic, action_space.shape[0] elif isinstance(action_space, gym.spaces.Discrete): return Categorical, action_space.n + elif isinstance(action_space, list): + size = 0 + child_dist = [] + for action in action_space: + dist, action_size = ModelCatalog.get_action_dist(action) + child_dist.append(dist) + size += action_size + return partial(MultiActionDistribution, + child_distributions=child_dist, + action_space=action_space), size raise NotImplementedError( "Unsupported args: {} {}".format(action_space, dist_type)) + @staticmethod + def get_action_placeholder(action_space): + """Returns an action placeholder that is consistent with the action space + + Args: + action_space (Space): Action space of the target gym env. + Returns: + action_placeholder (Tensor): A placeholder for the actions + """ + + if isinstance(action_space, gym.spaces.Box): + return tf.placeholder( + tf.float32, shape=(None, action_space.shape[0])) + elif isinstance(action_space, gym.spaces.Discrete): + return tf.placeholder(tf.int64, shape=(None,)) + elif isinstance(action_space, list): + size = 0 + for i in range(len(action_space)): + size += np.product(action_space[i].shape) + # TODO(ev) this obviously won't work for mixed spaces + if isinstance(action_space[0], gym.spaces.Discrete): + return tf.placeholder(tf.int64, shape=(None, + len(action_space))) + elif isinstance(action_space[0], gym.spaces.Box): + return tf.placeholder(tf.float32, shape=(None, size)) + else: + raise NotImplementedError("action space {}" + " not supported".format(action_space)) + @staticmethod def get_model(registry, inputs, num_outputs, options=dict()): """Returns a suitable model conforming to given input and output specs. @@ -92,6 +135,12 @@ class ModelCatalog(object): obs_rank = len(inputs.shape) - 1 + # num_outputs > 1 used to avoid hitting this with the value function + if isinstance(options.get("custom_options", {}).get( + "multiagent_fcnet_hiddens", 1), list) and num_outputs > 1: + return MultiAgentFullyConnectedNetwork(inputs, + num_outputs, options) + if obs_rank > 1: return VisionNetwork(inputs, num_outputs, options) @@ -141,7 +190,6 @@ class ModelCatalog(object): Returns: preprocessor (Preprocessor): Preprocessor for the env observations. """ - for k in options.keys(): if k not in MODEL_CONFIGS: raise Exception( diff --git a/python/ray/rllib/models/fcnet.py b/python/ray/rllib/models/fcnet.py index 43a1ab031..0bcbd68b0 100644 --- a/python/ray/rllib/models/fcnet.py +++ b/python/ray/rllib/models/fcnet.py @@ -14,6 +14,7 @@ class FullyConnectedNetwork(Model): def _init(self, inputs, num_outputs, options): hiddens = options.get("fcnet_hiddens", [256, 256]) + fcnet_activation = options.get("fcnet_activation", "tanh") if fcnet_activation == "tanh": activation = tf.nn.tanh @@ -25,14 +26,16 @@ class FullyConnectedNetwork(Model): i = 1 last_layer = inputs for size in hiddens: + label = "fc{}".format(i) last_layer = slim.fully_connected( last_layer, size, weights_initializer=normc_initializer(1.0), activation_fn=activation, - scope="fc{}".format(i)) + scope=label) i += 1 + label = "fc_out" output = slim.fully_connected( last_layer, num_outputs, weights_initializer=normc_initializer(0.01), - activation_fn=None, scope="fc_out") + activation_fn=None, scope=label) return output, last_layer diff --git a/python/ray/rllib/models/multiagentfcnet.py b/python/ray/rllib/models/multiagentfcnet.py new file mode 100644 index 000000000..6f205d5ee --- /dev/null +++ b/python/ray/rllib/models/multiagentfcnet.py @@ -0,0 +1,43 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf + +from ray.rllib.models.model import Model +from ray.rllib.models.fcnet import FullyConnectedNetwork +from ray.rllib.models.action_dist import Reshaper + + +class MultiAgentFullyConnectedNetwork(Model): + """Multiagent fully connected network.""" + + def _init(self, inputs, num_outputs, options): + + # Split the input and output tensors + input_shapes = options["custom_options"]["multiagent_obs_shapes"] + output_shapes = options["custom_options"]["multiagent_act_shapes"] + input_reshaper = Reshaper(input_shapes) + output_reshaper = Reshaper(output_shapes) + split_inputs = input_reshaper.split_tensor(inputs) + num_actions = output_reshaper.split_number(num_outputs) + + custom_options = options["custom_options"] + hiddens = custom_options.get("multiagent_fcnet_hiddens", + [[256, 256]]*1) + + # check for a shared model + shared_model = custom_options.get("multiagent_shared_model", 0) + reuse = tf.AUTO_REUSE if shared_model else False + outputs = [] + for i in range(len(hiddens)): + with tf.variable_scope("multi{}".format(i), reuse=reuse): + sub_options = options.copy() + sub_options.update({"fcnet_hiddens": hiddens[i]}) + # TODO(ev) make this support arbitrary networks + fcnet = FullyConnectedNetwork( + split_inputs[i], int(num_actions[i]), sub_options) + output = fcnet.outputs + outputs.append(output) + overall_output = tf.concat(outputs, axis=1) + return overall_output, outputs diff --git a/python/ray/rllib/ppo/loss.py b/python/ray/rllib/ppo/loss.py index d5dfc7ca9..3f69ff711 100644 --- a/python/ray/rllib/ppo/loss.py +++ b/python/ray/rllib/ppo/loss.py @@ -2,7 +2,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import gym.spaces import tensorflow as tf from ray.rllib.models import ModelCatalog @@ -18,8 +17,6 @@ class ProximalPolicyLoss(object): observations, value_targets, advantages, actions, prev_logits, prev_vf_preds, logit_dim, kl_coeff, distribution_class, config, sess, registry): - assert (isinstance(action_space, gym.spaces.Discrete) or - isinstance(action_space, gym.spaces.Box)) self.prev_dist = distribution_class(prev_logits) # Saved so that we can compute actions given different observations diff --git a/python/ray/rllib/ppo/ppo.py b/python/ray/rllib/ppo/ppo.py index 841386d2b..3491d6cca 100644 --- a/python/ray/rllib/ppo/ppo.py +++ b/python/ray/rllib/ppo/ppo.py @@ -220,7 +220,6 @@ class PPOAgent(Agent): self.local_evaluator.filters, self.remote_evaluators) res = self._fetch_metrics_from_remote_evaluators() res = res._replace(info=info) - return res def _fetch_metrics_from_remote_evaluators(self): diff --git a/python/ray/rllib/ppo/ppo_evaluator.py b/python/ray/rllib/ppo/ppo_evaluator.py index 7eb0291e2..c28e38404 100644 --- a/python/ray/rllib/ppo/ppo_evaluator.py +++ b/python/ray/rllib/ppo/ppo_evaluator.py @@ -2,7 +2,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import gym.spaces import pickle import tensorflow as tf import os @@ -68,16 +67,7 @@ class PPOEvaluator(Evaluator): self.advantages = tf.placeholder(tf.float32, shape=(None,)) action_space = self.env.action_space - # TODO(rliaw): pull this into model_catalog - if isinstance(action_space, gym.spaces.Box): - self.actions = tf.placeholder( - tf.float32, shape=(None, action_space.shape[0])) - elif isinstance(action_space, gym.spaces.Discrete): - self.actions = tf.placeholder(tf.int64, shape=(None,)) - else: - raise NotImplemented( - "action space" + str(type(action_space)) + - "currently not supported") + self.actions = ModelCatalog.get_action_placeholder(action_space) self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist( action_space) # Log probabilities from the policy before the policy update. diff --git a/python/ray/rllib/utils/reshaper.py b/python/ray/rllib/utils/reshaper.py new file mode 100644 index 000000000..cdc523c03 --- /dev/null +++ b/python/ray/rllib/utils/reshaper.py @@ -0,0 +1,46 @@ +import numpy as np +import tensorflow as tf + + +class Reshaper(object): + """ + This class keeps track of where in the flattened observation space + we should be slicing and what the new shapes should be + """ + def __init__(self, env_space): + self.shapes = [] + self.slice_positions = [] + self.env_space = env_space + if isinstance(env_space, list): + for space in env_space: + # Handle both gym arrays and just lists of inputs length + if hasattr(space, "shape"): + arr_shape = np.asarray(space.shape) + else: + arr_shape = space + self.shapes.append(arr_shape) + if len(self.slice_positions) == 0: + self.slice_positions.append(np.product(arr_shape)) + else: + self.slice_positions.append(np.product(arr_shape) + + self.slice_positions[-1]) + else: + self.shapes.append(np.asarray(env_space.shape)) + self.slice_positions.append(np.product(env_space.shape)) + + def get_slice_lengths(self): + diffed_list = np.diff(self.slice_positions).tolist() + diffed_list.insert(0, self.slice_positions[0]) + return np.asarray(diffed_list).astype(int) + + def split_tensor(self, tensor, axis=-1): + # FIXME (ev) This won't work for mixed action distributions like + # one agent Gaussian one agent discrete + slice_rescale = int(tensor.shape.as_list()[axis] / + int(np.sum(self.get_slice_lengths()))) + return tf.split(tensor, slice_rescale*self.get_slice_lengths(), + axis=axis) + + def split_number(self, number): + slice_rescale = int(number / int(np.sum(self.get_slice_lengths()))) + return slice_rescale*self.get_slice_lengths() diff --git a/python/ray/rllib/utils/sampler.py b/python/ray/rllib/utils/sampler.py index d846ca5ac..92bf3cdea 100644 --- a/python/ray/rllib/utils/sampler.py +++ b/python/ray/rllib/utils/sampler.py @@ -5,6 +5,7 @@ from __future__ import print_function import six.moves.queue as queue import threading from collections import namedtuple +import numpy as np class PartialRollout(object): @@ -226,6 +227,10 @@ def _env_runner(env, policy, num_local_steps, horizon, obs_filter): if length >= horizon: terminal = True + # Concatenate multiagent actions + if isinstance(action, list): + action = np.concatenate(action, axis=0).flatten() + # Collect the experience. rollout.add(observations=last_observation, actions=action, diff --git a/test/jenkins_tests/run_multi_node_tests.sh b/test/jenkins_tests/run_multi_node_tests.sh index 0e3d691c1..255ba0f92 100755 --- a/test/jenkins_tests/run_multi_node_tests.sh +++ b/test/jenkins_tests/run_multi_node_tests.sh @@ -162,3 +162,9 @@ docker run --rm --shm-size=10G --memory=10G $DOCKER_SHA \ docker run --rm --shm-size=10G --memory=10G $DOCKER_SHA \ python /ray/python/ray/tune/examples/tune_mnist_ray.py \ --fast + +docker run --rm --shm-size=10G --memory=10G $DOCKER_SHA \ + python /ray/python/ray/rllib/examples/multiagent_mountaincar.py + +docker run --rm --shm-size=10G --memory=10G $DOCKER_SHA \ + python /ray/python/ray/rllib/examples/multiagent_pendulum.py \ No newline at end of file