[rllib] Modularize Torch and TF policy graphs (#2294)

* wip

* cls

* re

* wip

* wip

* a3c working

* torch support

* pg works

* lint

* rm v2

* consumer id

* clean up pg

* clean up more

* fix python 2.7

* tf session management

* docs

* dqn wip

* fix compile

* dqn

* apex runs

* up

* imports

* ddpg

* quotes

* fix tests

* fix last r

* fix tests

* lint

* pass checkpoint restore

* kwar

* nits

* policy graph

* fix yapf

* com

* class

* pyt

* vectorization

* update

* test cpe

* unit test

* fix ddpg2

* changes

* wip

* args

* faster test

* common

* fix

* add alg option

* batch mode and policy serving

* multi serving test

* todo

* wip

* serving test

* doc async env

* num envs

* comments

* thread

* remove init hook

* update

* fix ppo

* comments1

* fix

* updates

* add jenkins tests

* fix

* fix pytorch

* fix

* fixes

* fix a3c policy

* fix squeeze

* fix trunc on apex

* fix squeezing for real

* update

* remove horizon test for now

* multiagent wip

* update

* fix race condition

* fix ma

* t

* doc

* st

* wip

* example

* wip

* working

* cartpole

* wip

* batch wip

* fix bug

* make other_batches None default

* working

* debug

* nit

* warn

* comments

* fix ppo

* fix obs filter

* update

* wip

* tf

* update

* fix

* cleanup

* cleanup

* spacing

* model

* fix

* dqn

* fix ddpg

* doc

* keep names

* update

* fix

* com

* docs

* clarify model outputs

* Update torch_policy_graph.py

* fix obs filter

* pass thru worker index

* fix

* rename

* vlad torch comments

* fix log action

* debug name

* fix lstm

* remove unused ddpg net

* remove conv net

* revert lstm

* cast

* clean up

* fix lstm check

* move to end

* fix sphinx

* fix cmd

* remove bad doc

* clarify

* copy

* async sa

* fix
This commit is contained in:
Eric Liang
2018-06-26 13:17:15 -07:00
committed by GitHub
parent a9a26b7560
commit 1251abf0d1
31 changed files with 687 additions and 792 deletions
+8 -4
View File
@@ -11,7 +11,6 @@ from ray.rllib.optimizers import AsyncOptimizer
from ray.rllib.utils import FilterManager
from ray.rllib.utils.common_policy_evaluator import CommonPolicyEvaluator, \
collect_metrics
from ray.rllib.a3c.common import get_policy_cls
from ray.tune.trial import Resources
DEFAULT_CONFIG = {
@@ -21,8 +20,6 @@ DEFAULT_CONFIG = {
"num_envs": 1,
# Size of rollout batch
"batch_size": 10,
# Use LSTM model - only applicable for image states
"use_lstm": False,
# Use PyTorch as backend - no LSTM support
"use_pytorch": False,
# Which observation filter to apply to the observation
@@ -47,6 +44,8 @@ DEFAULT_CONFIG = {
"summarize": False,
# Model and preprocessor options
"model": {
# Use LSTM model - only applicable for image states. Requires TF.
"use_lstm": False,
# (Image statespace) - Converts image to Channels = 1
"grayscale": True,
# (Image statespace) - Each pixel
@@ -86,7 +85,12 @@ class A3CAgent(Agent):
extra_gpu=cf["use_gpu_for_workers"] and cf["num_workers"] or 0)
def _init(self):
self.policy_cls = get_policy_cls(self.config)
if self.config["use_pytorch"]:
from ray.rllib.a3c.a3c_torch_policy import A3CTorchPolicyGraph
self.policy_cls = A3CTorchPolicyGraph
else:
from ray.rllib.a3c.a3c_tf_policy import A3CPolicyGraph
self.policy_cls = A3CPolicyGraph
if self.config["use_pytorch"]:
session_creator = None
+85 -51
View File
@@ -7,90 +7,124 @@ import gym
import ray
from ray.rllib.utils.error import UnsupportedSpaceException
from ray.rllib.utils.process_rollout import compute_advantages
from ray.rllib.utils.postprocessing import compute_advantages
from ray.rllib.utils.tf_policy_graph import TFPolicyGraph
from ray.rllib.models.misc import linear, normc_initializer
from ray.rllib.models.catalog import ModelCatalog
class A3CTFPolicyGraph(TFPolicyGraph):
"""The TF policy base class."""
class A3CLoss(object):
def __init__(
self, action_dist, actions, advantages, v_target, vf,
vf_loss_coeff=0.5, entropy_coeff=-0.01):
log_prob = action_dist.logp(actions)
def __init__(self, ob_space, action_space, config):
# The "policy gradients" loss
self.pi_loss = - tf.reduce_sum(log_prob * advantages)
delta = vf - v_target
self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta))
self.entropy = tf.reduce_sum(action_dist.entropy())
self.total_loss = (self.pi_loss +
self.vf_loss * vf_loss_coeff +
self.entropy * entropy_coeff)
class A3CPolicyGraph(TFPolicyGraph):
def __init__(self, observation_space, action_space, config):
config = dict(ray.rllib.a3c.a3c.DEFAULT_CONFIG, **config)
self.local_steps = 0
self.config = config
self.summarize = config.get("summarize")
self._setup_graph(ob_space, action_space)
assert all(hasattr(self, attr)
for attr in ["vf", "logits", "x", "var_list"])
print("Setting up loss")
self.setup_loss(action_space)
self.is_training = tf.placeholder_with_default(True, ())
self.sess = tf.get_default_session()
TFPolicyGraph.__init__(
self, ob_space, action_space, self.sess, obs_input=self.x,
action_sampler=self.action_dist.sample(), loss=self.loss,
loss_inputs=self.loss_in, is_training=self.is_training,
state_inputs=self.state_in, state_outputs=self.state_out)
# Setup the policy
self.observations = tf.placeholder(
tf.float32, [None] + list(observation_space.shape))
dist_class, logit_dim = ModelCatalog.get_action_dist(
action_space, self.config["model"])
self.model = ModelCatalog.get_model(
self.observations, logit_dim, self.config["model"])
action_dist = dist_class(self.model.outputs)
self.vf = tf.reshape(
linear(self.model.last_layer, 1, "value", normc_initializer(1.0)),
[-1])
self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
tf.get_variable_scope().name)
is_training = tf.placeholder_with_default(True, ())
self.sess.run(tf.global_variables_initializer())
if self.summarize:
bs = tf.to_float(tf.shape(self.x)[0])
tf.summary.scalar("model/policy_graph", self.pi_loss / bs)
tf.summary.scalar("model/value_loss", self.vf_loss / bs)
tf.summary.scalar("model/entropy", self.entropy / bs)
tf.summary.scalar("model/grad_gnorm", tf.global_norm(self._grads))
tf.summary.scalar("model/var_gnorm", tf.global_norm(self.var_list))
self.summary_op = tf.summary.merge_all()
def _setup_graph(self, ob_space, ac_space):
raise NotImplementedError
def setup_loss(self, action_space):
# Setup the policy loss
if isinstance(action_space, gym.spaces.Box):
ac_size = action_space.shape[0]
self.ac = tf.placeholder(tf.float32, [None, ac_size], name="ac")
actions = tf.placeholder(tf.float32, [None, ac_size], name="ac")
elif isinstance(action_space, gym.spaces.Discrete):
self.ac = tf.placeholder(tf.int64, [None], name="ac")
actions = tf.placeholder(tf.int64, [None], name="ac")
else:
raise UnsupportedSpaceException(
"Action space {} is not supported for A3C.".format(
action_space))
self.adv = tf.placeholder(tf.float32, [None], name="adv")
self.r = tf.placeholder(tf.float32, [None], name="r")
advantages = tf.placeholder(tf.float32, [None], name="advantages")
v_target = tf.placeholder(tf.float32, [None], name="v_target")
self.loss = A3CLoss(
action_dist, actions, advantages, v_target, self.vf,
self.config["vf_loss_coeff"], self.config["entropy_coeff"])
log_prob = self.action_dist.logp(self.ac)
# Initialize TFPolicyGraph
loss_in = [
("obs", self.observations),
("actions", actions),
("advantages", advantages),
("value_targets", v_target),
]
for i, ph in enumerate(self.model.state_in):
loss_in.append(("state_in_{}".format(i), ph))
self.state_in = self.model.state_in
self.state_out = self.model.state_out
TFPolicyGraph.__init__(
self, observation_space, action_space, self.sess,
obs_input=self.observations, action_sampler=action_dist.sample(),
loss=self.loss.total_loss, loss_inputs=loss_in,
is_training=is_training, state_inputs=self.state_in,
state_outputs=self.state_out)
# The "policy gradients" loss: its derivative is precisely the policy
# gradient. Notice that self.ac is a placeholder that is provided
# externally. adv will contain the advantages, as calculated in
# compute_advantages.
self.pi_loss = - tf.reduce_sum(log_prob * self.adv)
if self.config.get("summarize"):
bs = tf.to_float(tf.shape(self.observations)[0])
tf.summary.scalar("model/policy_graph", self.loss.pi_loss / bs)
tf.summary.scalar("model/value_loss", self.loss.vf_loss / bs)
tf.summary.scalar("model/entropy", self.loss.entropy / bs)
tf.summary.scalar("model/grad_gnorm", tf.global_norm(self._grads))
tf.summary.scalar("model/var_gnorm", tf.global_norm(self.var_list))
self.summary_op = tf.summary.merge_all()
delta = self.vf - self.r
self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta))
self.entropy = tf.reduce_sum(self.action_dist.entropy())
self.loss = (self.pi_loss +
self.vf_loss * self.config["vf_loss_coeff"] +
self.entropy * self.config["entropy_coeff"])
self.sess.run(tf.global_variables_initializer())
def extra_compute_action_fetches(self):
return {"vf_preds": self.vf}
def value(self, ob, *args):
feed_dict = {self.observations: [ob]}
assert len(args) == len(self.state_in), (args, self.state_in)
for k, v in zip(self.state_in, args):
feed_dict[k] = v
vf = self.sess.run(self.vf, feed_dict)
return vf[0]
def optimizer(self):
return tf.train.AdamOptimizer(self.config["lr"])
def gradients(self, optimizer):
grads = tf.gradients(self.loss, self.var_list)
grads = tf.gradients(self.loss.total_loss, self.var_list)
self.grads, _ = tf.clip_by_global_norm(grads, self.config["grad_clip"])
clipped_grads = list(zip(self.grads, self.var_list))
return clipped_grads
def extra_compute_grad_fetches(self):
if self.summarize:
if self.config.get("summarize"):
return {"summary": self.summary_op}
else:
return {}
def get_initial_state(self):
return self.model.state_init
def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
completed = sample_batch["dones"][-1]
if completed:
+51 -87
View File
@@ -2,114 +2,78 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from threading import Lock
import torch
import torch.nn.functional as F
from torch import nn
import ray
from ray.rllib.models.pytorch.misc import var_to_np, convert_batch
from ray.rllib.models.pytorch.misc import var_to_np
from ray.rllib.models.catalog import ModelCatalog
from ray.rllib.utils.process_rollout import compute_advantages
from ray.rllib.utils.policy_graph import PolicyGraph
from ray.rllib.utils.postprocessing import compute_advantages
from ray.rllib.utils.torch_policy_graph import TorchPolicyGraph
class SharedTorchPolicy(PolicyGraph):
"""A simple, non-recurrent PyTorch policy example."""
class A3CLoss(nn.Module):
def __init__(self, policy_model, vf_loss_coeff=0.5, entropy_coeff=-0.01):
nn.Module.__init__(self)
self.policy_model = policy_model
self.vf_loss_coeff = vf_loss_coeff
self.entropy_coeff = entropy_coeff
def __init__(self, obs_space, action_space, config):
config = dict(ray.rllib.a3c.a3c.DEFAULT_CONFIG, **config)
PolicyGraph.__init__(self, obs_space, action_space, config)
self.local_steps = 0
self.config = config
self.summarize = config.get("summarize")
self.setup_graph(obs_space, action_space)
torch.set_num_threads(2)
self.lock = Lock()
def setup_graph(self, obs_space, action_space):
_, self.logit_dim = ModelCatalog.get_action_dist(
action_space, self.config["model"])
self._model = ModelCatalog.get_torch_model(
obs_space.shape, self.logit_dim, self.config["model"])
self.optimizer = torch.optim.Adam(
self._model.parameters(), lr=self.config["lr"])
def compute_actions(self, obs, state, is_training=False):
assert not state, "RNN not supported"
with self.lock:
ob = torch.from_numpy(np.array(obs)).float()
logits, values = self._model(ob)
samples = F.softmax(logits, dim=1).multinomial(1).squeeze(0)
return var_to_np(samples), [], {"vf_preds": var_to_np(values)}
def compute_gradients(self, samples):
with self.lock:
self.backward(samples)
# Note that return values are just references;
# calling zero_grad will modify the values
return [p.grad.data.numpy() for p in self._model.parameters()], {}
def apply_gradients(self, grads):
self.optimizer.zero_grad()
for g, p in zip(grads, self._model.parameters()):
p.grad = torch.from_numpy(g)
self.optimizer.step()
return {}
def get_weights(self):
# !! This only returns references to the data.
return self._model.state_dict()
def set_weights(self, weights):
with self.lock:
self._model.load_state_dict(weights)
def value(self, obs):
with self.lock:
obs = torch.from_numpy(obs).float().unsqueeze(0)
res = self._model.hidden_layers(obs)
res = self._model.value_branch(res)
res = res.squeeze()
return var_to_np(res)
def forward(self, obs_batch, actions):
logits, values = self._model(obs_batch)
def forward(self, observations, actions, advantages, value_targets):
logits, values = self.policy_model(observations)
log_probs = F.log_softmax(logits, dim=1)
probs = F.softmax(logits, dim=1)
action_log_probs = log_probs.gather(1, actions.view(-1, 1))
entropy = -(log_probs * probs).sum(-1).sum()
return values, action_log_probs, entropy
def backward(self, sample_batch):
"""Loss is encoded here.
Defining a new loss function would start by rewriting this function.
"""
states, actions, advs, rs = convert_batch(sample_batch)
values, action_log_probs, entropy = self.forward(states, actions)
pi_err = -advs.dot(action_log_probs.reshape(-1))
value_err = F.mse_loss(values.reshape(-1), rs)
self.optimizer.zero_grad()
pi_err = -advantages.dot(action_log_probs.reshape(-1))
value_err = F.mse_loss(values.reshape(-1), value_targets)
overall_err = sum([
pi_err,
self.config["vf_loss_coeff"] * value_err,
self.config["entropy_coeff"] * entropy,
self.vf_loss_coeff * value_err,
self.entropy_coeff * entropy,
])
return overall_err
overall_err.backward()
torch.nn.utils.clip_grad_norm_(self._model.parameters(),
self.config["grad_clip"])
class A3CTorchPolicyGraph(TorchPolicyGraph):
"""A simple, non-recurrent PyTorch policy example."""
def __init__(self, obs_space, action_space, config):
config = dict(ray.rllib.a3c.a3c.DEFAULT_CONFIG, **config)
self.config = config
_, self.logit_dim = ModelCatalog.get_action_dist(
action_space, self.config["model"])
self.model = ModelCatalog.get_torch_model(
obs_space.shape, self.logit_dim, self.config["model"])
loss = A3CLoss(
self.model, self.config["vf_loss_coeff"],
self.config["entropy_coeff"])
TorchPolicyGraph.__init__(
self, obs_space, action_space, self.model, loss,
loss_inputs=[
"obs", "actions", "advantages", "value_targets"])
def extra_action_out(self, model_out):
return {"vf_preds": var_to_np(model_out[1])}
def optimizer(self):
return torch.optim.Adam(
self.model.parameters(), lr=self.config["lr"])
def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
completed = sample_batch["dones"][-1]
if completed:
last_r = 0.0
else:
last_r = self.value(sample_batch["new_obs"][-1])
last_r = self._value(sample_batch["new_obs"][-1])
return compute_advantages(
sample_batch, last_r, self.config["gamma"], self.config["lambda"])
def _value(self, obs):
with self.lock:
obs = torch.from_numpy(obs).float().unsqueeze(0)
res = self.model.hidden_layers(obs)
res = self.model.value_branch(res)
res = res.squeeze()
return var_to_np(res)
-16
View File
@@ -1,16 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
def get_policy_cls(config):
    """Pick the A3C policy class requested by ``config``.

    ``config["use_lstm"]`` is consulted first and ``config["use_pytorch"]``
    second; when neither is truthy the plain ``SharedModel`` is returned.
    Each candidate class is imported only inside the branch that selects it.
    """
    if config["use_lstm"]:
        from ray.rllib.a3c.shared_model_lstm import SharedModelLSTM
        return SharedModelLSTM

    if config["use_pytorch"]:
        from ray.rllib.a3c.a3c_torch_policy import SharedTorchPolicy
        return SharedTorchPolicy

    from ray.rllib.a3c.shared_model import SharedModel
    return SharedModel
-53
View File
@@ -1,53 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from ray.rllib.models.misc import linear, normc_initializer
from ray.rllib.a3c.a3c_tf_policy import A3CTFPolicyGraph
from ray.rllib.models.catalog import ModelCatalog
class SharedModel(A3CTFPolicyGraph):
    """A3C TF policy graph without recurrent state.

    A single catalog model feeds both the action logits and a one-unit
    "value" layer; ``state_in`` / ``state_out`` are left empty.
    """

    def __init__(self, ob_space, ac_space, config, **kwargs):
        # Straight pass-through to the A3C TF base class.
        super(SharedModel, self).__init__(
            ob_space, ac_space, config, **kwargs)

    def _setup_graph(self, ob_space, ac_space):
        """Create the observation placeholder, model, action distribution
        and value output, and record the trainable variables."""
        obs_shape = [None] + list(ob_space.shape)
        self.x = tf.placeholder(tf.float32, obs_shape)

        model_options = self.config["model"]
        dist_class, logit_dim = ModelCatalog.get_action_dist(
            ac_space, model_options)
        self.logit_dim = logit_dim
        self._model = ModelCatalog.get_model(
            self.x, logit_dim, model_options)
        self.logits = self._model.outputs
        self.action_dist = dist_class(self.logits)

        # One-unit "value" layer on top of the model's last layer, flattened
        # to rank 1.
        value_out = linear(
            self._model.last_layer, 1, "value", normc_initializer(1.0))
        self.vf = tf.reshape(value_out, [-1])

        self.sample = self.action_dist.sample()
        scope_name = tf.get_variable_scope().name
        self.var_list = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope_name)
        self.global_step = tf.get_variable(
            "global_step", [], tf.int32,
            initializer=tf.constant_initializer(0, dtype=tf.int32),
            trainable=False)
        # No recurrent state for this model.
        self.state_in, self.state_out = [], []

    def setup_loss(self, action_space):
        """Build the base-class loss, then list the named loss inputs."""
        A3CTFPolicyGraph.setup_loss(self, action_space)
        names = ("obs", "actions", "advantages", "value_targets")
        tensors = (self.x, self.ac, self.adv, self.r)
        self.loss_in = list(zip(names, tensors))

    def extra_compute_action_fetches(self):
        """Fetch the value output alongside the sampled actions."""
        fetches = {"vf_preds": self.vf}
        return fetches

    def value(self, ob, *args):
        """Evaluate the value output for a single observation.

        Extra positional arguments are accepted but not used.
        """
        feed = {self.x: [ob]}
        return self.sess.run(self.vf, feed)[0]
-63
View File
@@ -1,63 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from ray.rllib.models.misc import linear, normc_initializer
from ray.rllib.models.catalog import ModelCatalog
from ray.rllib.a3c.a3c_tf_policy import A3CTFPolicyGraph
from ray.rllib.models.lstm import LSTM
class SharedModelLSTM(A3CTFPolicyGraph):
    """A3C TF policy graph built on the ``LSTM`` model.

    The model's ``state_in`` / ``state_out`` tensors are exposed, and the
    first two state inputs are part of the loss inputs.
    """

    def __init__(self, ob_space, ac_space, config, **kwargs):
        # Straight pass-through to the A3C TF base class.
        super(SharedModelLSTM, self).__init__(
            ob_space, ac_space, config, **kwargs)

    def _setup_graph(self, ob_space, ac_space):
        """Create the observation placeholder, LSTM model, action
        distribution and value output, and record the trainable variables."""
        obs_shape = [None] + list(ob_space.shape)
        self.x = tf.placeholder(tf.float32, obs_shape)

        dist_class, logit_dim = ModelCatalog.get_action_dist(
            ac_space, self.config["model"])
        self.logit_dim = logit_dim
        # The LSTM is constructed with an empty options dict.
        self._model = LSTM(self.x, logit_dim, {})

        self.state_in = self._model.state_in
        self.state_out = self._model.state_out

        self.logits = self._model.outputs
        self.action_dist = dist_class(self.logits)

        # One-unit "value" layer on top of the model's last layer, flattened
        # to rank 1.
        value_out = linear(
            self._model.last_layer, 1, "value", normc_initializer(1.0))
        self.vf = tf.reshape(value_out, [-1])

        self.sample = self.action_dist.sample()
        scope_name = tf.get_variable_scope().name
        self.var_list = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope_name)
        self.global_step = tf.get_variable(
            "global_step", [], tf.int32,
            initializer=tf.constant_initializer(0, dtype=tf.int32),
            trainable=False)

    def get_initial_state(self):
        """Return the model's initial recurrent state."""
        return self._model.state_init

    def setup_loss(self, action_space):
        """Build the base-class loss, then list the named loss inputs,
        including the two recurrent state inputs."""
        A3CTFPolicyGraph.setup_loss(self, action_space)
        loss_inputs = [
            ("obs", self.x),
            ("actions", self.ac),
            ("advantages", self.adv),
            ("value_targets", self.r),
        ]
        loss_inputs.append(("state_in_0", self.state_in[0]))
        loss_inputs.append(("state_in_1", self.state_in[1]))
        self.loss_in = loss_inputs

    def extra_compute_action_fetches(self):
        """Fetch the value output alongside the sampled actions."""
        fetches = {"vf_preds": self.vf}
        return fetches

    def value(self, ob, c, h):
        """Evaluate the value output for one observation, feeding ``c`` and
        ``h`` into the first and second state inputs respectively."""
        feed = {self.x: [ob]}
        feed[self.state_in[0]] = c
        feed[self.state_in[1]] = h
        return self.sess.run(self.vf, feed)[0]
+117 -93
View File
@@ -22,62 +22,88 @@ Q_SCOPE = "q_func"
Q_TARGET_SCOPE = "target_q_func"
def _build_p_network(inputs, dim_actions, config):
"""
map an observation (i.e., state) to an action where
each entry takes value from (0, 1) due to the sigmoid function
"""
frontend = ModelCatalog.get_model(inputs, 1, config["model"])
class PNetwork(object):
"""Maps an observations (i.e., state) to an action where each entry takes
value from (0, 1) due to the sigmoid function."""
hiddens = config["actor_hiddens"]
action_out = frontend.last_layer
for hidden in hiddens:
action_out = layers.fully_connected(
action_out, num_outputs=hidden, activation_fn=tf.nn.relu)
# Use sigmoid layer to bound values within (0, 1)
# shape of action_scores is [batch_size, dim_actions]
action_scores = layers.fully_connected(
action_out, num_outputs=dim_actions, activation_fn=tf.nn.sigmoid)
return action_scores
def __init__(self, model, dim_actions, hiddens=[64, 64]):
action_out = model.last_layer
for hidden in hiddens:
action_out = layers.fully_connected(
action_out, num_outputs=hidden, activation_fn=tf.nn.relu)
# Use sigmoid layer to bound values within (0, 1)
# shape of action_scores is [batch_size, dim_actions]
self.action_scores = layers.fully_connected(
action_out, num_outputs=dim_actions, activation_fn=tf.nn.sigmoid)
# As a stochastic policy for inference, but a deterministic policy for training
# thus ignore batch_size issue when constructing a stochastic action
def _build_action_network(p_values, low_action, high_action, stochastic, eps,
theta, sigma):
# shape is [None, dim_action]
deterministic_actions = (high_action - low_action) * p_values + low_action
class ActionNetwork(object):
"""Acts as a stochastic policy for inference, but a deterministic policy
for training, thus ignoring the batch_size issue when constructing a
stochastic action."""
exploration_sample = tf.get_variable(
name="ornstein_uhlenbeck",
dtype=tf.float32,
initializer=low_action.size * [.0],
trainable=False)
normal_sample = tf.random_normal(
shape=[low_action.size], mean=0.0, stddev=1.0)
exploration_value = tf.assign_add(
exploration_sample,
theta * (.0 - exploration_sample) + sigma * normal_sample)
stochastic_actions = deterministic_actions + eps * (
high_action - low_action) * exploration_value
def __init__(
self, p_values, low_action, high_action, stochastic, eps,
theta=0.15, sigma=0.2):
return tf.cond(stochastic, lambda: stochastic_actions,
lambda: deterministic_actions)
# shape is [None, dim_action]
deterministic_actions = (
(high_action - low_action) * p_values + low_action)
exploration_sample = tf.get_variable(
name="ornstein_uhlenbeck",
dtype=tf.float32,
initializer=low_action.size * [.0],
trainable=False)
normal_sample = tf.random_normal(
shape=[low_action.size], mean=0.0, stddev=1.0)
exploration_value = tf.assign_add(
exploration_sample,
theta * (.0 - exploration_sample) + sigma * normal_sample)
stochastic_actions = deterministic_actions + eps * (
high_action - low_action) * exploration_value
self.actions = tf.cond(
stochastic, lambda: stochastic_actions,
lambda: deterministic_actions)
def _build_q_network(inputs, action_inputs, config):
frontend = ModelCatalog.get_model(inputs, 1, config["model"])
class QNetwork(object):
    """Critic head: scores an (observation-features, action) pair.

    The model's last layer is concatenated with ``action_inputs`` along
    axis 1, passed through one ReLU fully connected layer per entry of
    ``hiddens``, and reduced to a single linear output stored in
    ``self.value``.

    Args:
        model: object exposing a ``last_layer`` tensor.
        action_inputs: action tensor concatenated to ``model.last_layer``.
        hiddens: iterable of hidden layer sizes. The default is an immutable
            tuple so it cannot be shared and mutated across calls.
    """

    def __init__(self, model, action_inputs, hiddens=(64, 64)):
        q_out = tf.concat([model.last_layer, action_inputs], axis=1)
        for hidden in hiddens:
            q_out = layers.fully_connected(
                q_out, num_outputs=hidden, activation_fn=tf.nn.relu)
        # Linear output layer (no activation) with one unit.
        self.value = layers.fully_connected(
            q_out, num_outputs=1, activation_fn=None)
hiddens = config["critic_hiddens"]
q_out = tf.concat([frontend.last_layer, action_inputs], axis=1)
for hidden in hiddens:
q_out = layers.fully_connected(
q_out, num_outputs=hidden, activation_fn=tf.nn.relu)
q_scores = layers.fully_connected(q_out, num_outputs=1, activation_fn=None)
class ActorCriticLoss(object):
def __init__(
self, q_t, q_tp1, q_tp0, importance_weights, rewards, done_mask,
gamma=0.99, n_step=1, use_huber=False, huber_threshold=1.0):
return q_scores
q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1)
q_tp1_best = tf.squeeze(
input=q_tp1, axis=len(q_tp1.shape) - 1)
q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best
# compute RHS of bellman equation
q_t_selected_target = rewards + gamma**n_step * q_tp1_best_masked
# compute the error (potentially clipped)
self.td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
if use_huber:
errors = _huber_loss(self.td_error, huber_threshold)
else:
errors = 0.5 * tf.square(self.td_error)
self.critic_loss = tf.reduce_mean(importance_weights * errors)
# for policy gradient
self.actor_loss = -1.0 * tf.reduce_mean(q_tp0)
self.total_loss = self.actor_loss + self.critic_loss
class DDPGPolicyGraph(TFPolicyGraph):
@@ -98,6 +124,28 @@ class DDPGPolicyGraph(TFPolicyGraph):
self.critic_optimizer = tf.train.AdamOptimizer(
learning_rate=config["critic_lr"])
def _build_q_network(obs, actions):
return QNetwork(
ModelCatalog.get_model(obs, 1, config["model"]),
actions,
config["critic_hiddens"]).value
def _build_p_network(obs):
return PNetwork(
ModelCatalog.get_model(obs, 1, config["model"]),
dim_actions,
config["actor_hiddens"]).action_scores
def _build_action_network(p_values, stochastic, eps):
return ActionNetwork(
p_values,
low_action,
high_action,
stochastic,
eps,
config["exploration_theta"],
config["exploration_sigma"]).actions
# Action inputs
self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
self.eps = tf.placeholder(tf.float32, (), name="eps")
@@ -106,15 +154,13 @@ class DDPGPolicyGraph(TFPolicyGraph):
# Actor: P (policy) network
with tf.variable_scope(P_SCOPE) as scope:
p_values = _build_p_network(self.cur_observations,
dim_actions, config)
p_values = _build_p_network(self.cur_observations)
self.p_func_vars = _scope_vars(scope.name)
# Action outputs
with tf.variable_scope(A_SCOPE):
self.output_actions = _build_action_network(
p_values, low_action, high_action, self.stochastic, self.eps,
config["exploration_theta"], config["exploration_sigma"])
p_values, self.stochastic, self.eps)
with tf.variable_scope(A_SCOPE, reuse=True):
exploration_sample = tf.get_variable(name="ornstein_uhlenbeck")
@@ -137,11 +183,11 @@ class DDPGPolicyGraph(TFPolicyGraph):
# p network evaluation
with tf.variable_scope(P_SCOPE, reuse=True) as scope:
self.p_t = _build_p_network(self.obs_t, dim_actions, config)
self.p_t = _build_p_network(self.obs_t)
# target p network evaluation
with tf.variable_scope(P_TARGET_SCOPE) as scope:
p_tp1 = _build_p_network(self.obs_tp1, dim_actions, config)
p_tp1 = _build_p_network(self.obs_tp1)
target_p_func_vars = _scope_vars(scope.name)
# Action outputs
@@ -149,59 +195,37 @@ class DDPGPolicyGraph(TFPolicyGraph):
deterministic_flag = tf.constant(value=False, dtype=tf.bool)
zero_eps = tf.constant(value=.0, dtype=tf.float32)
output_actions = _build_action_network(
self.p_t, low_action, high_action, deterministic_flag,
zero_eps, config["exploration_theta"],
config["exploration_sigma"])
self.p_t, deterministic_flag, zero_eps)
output_actions_estimated = _build_action_network(
p_tp1, low_action, high_action, deterministic_flag,
zero_eps, config["exploration_theta"],
config["exploration_sigma"])
p_tp1, deterministic_flag, zero_eps)
# q network evaluation
with tf.variable_scope(Q_SCOPE) as scope:
q_t = _build_q_network(self.obs_t, self.act_t, config)
q_t = _build_q_network(self.obs_t, self.act_t)
self.q_func_vars = _scope_vars(scope.name)
with tf.variable_scope(Q_SCOPE, reuse=True):
q_tp0 = _build_q_network(self.obs_t, output_actions, config)
q_tp0 = _build_q_network(self.obs_t, output_actions)
# target q network evalution
with tf.variable_scope(Q_TARGET_SCOPE) as scope:
q_tp1 = _build_q_network(
self.obs_tp1, output_actions_estimated, config)
q_tp1 = _build_q_network(self.obs_tp1, output_actions_estimated)
target_q_func_vars = _scope_vars(scope.name)
q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1)
q_tp1_best = tf.squeeze(
input=q_tp1, axis=len(q_tp1.shape) - 1)
q_tp1_best_masked = (1.0 - self.done_mask) * q_tp1_best
# compute RHS of bellman equation
q_t_selected_target = (
self.rew_t + config["gamma"]**config["n_step"] * q_tp1_best_masked)
# compute the error (potentially clipped)
self.td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
if config.get("use_huber"):
errors = _huber_loss(self.td_error, config.get("huber_threshold"))
else:
errors = 0.5 * tf.square(self.td_error)
self.loss = tf.reduce_mean(self.importance_weights * errors)
# for policy gradient
self.actor_loss = -1.0 * tf.reduce_mean(q_tp0)
self.loss = ActorCriticLoss(
q_t, q_tp1, q_tp0, self.importance_weights, self.rew_t,
self.done_mask, config["gamma"], config["n_step"],
config["use_huber"], config["huber_threshold"])
if config["l2_reg"] is not None:
for var in self.p_func_vars:
if "bias" not in var.name:
self.actor_loss += (
self.loss.actor_loss += (
config["l2_reg"] * 0.5 * tf.nn.l2_loss(var))
for var in self.q_func_vars:
if "bias" not in var.name:
self.loss += config["l2_reg"] * 0.5 * tf.nn.l2_loss(
var)
self.loss.critic_loss += (
config["l2_reg"] * 0.5 * tf.nn.l2_loss(var))
# update_target_fn will be called periodically to copy Q network to
# target Q network
@@ -235,7 +259,7 @@ class DDPGPolicyGraph(TFPolicyGraph):
TFPolicyGraph.__init__(
self, observation_space, action_space, self.sess,
obs_input=self.cur_observations,
action_sampler=self.output_actions, loss=self.loss,
action_sampler=self.output_actions, loss=self.loss.total_loss,
loss_inputs=self.loss_inputs, is_training=self.is_training)
self.sess.run(tf.global_variables_initializer())
@@ -251,19 +275,19 @@ class DDPGPolicyGraph(TFPolicyGraph):
if self.config["grad_norm_clipping"] is not None:
actor_grads_and_vars = _minimize_and_clip(
self.actor_optimizer,
self.actor_loss,
self.loss.actor_loss,
var_list=self.p_func_vars,
clip_val=self.config["grad_norm_clipping"])
critic_grads_and_vars = _minimize_and_clip(
self.critic_optimizer,
self.loss,
self.loss.critic_loss,
var_list=self.q_func_vars,
clip_val=self.config["grad_norm_clipping"])
else:
actor_grads_and_vars = self.actor_optimizer.compute_gradients(
self.actor_loss, var_list=self.p_func_vars)
self.loss.actor_loss, var_list=self.p_func_vars)
critic_grads_and_vars = self.critic_optimizer.compute_gradients(
self.loss, var_list=self.q_func_vars)
self.loss.critic_loss, var_list=self.q_func_vars)
actor_grads_and_vars = [
(g, v) for (g, v) in actor_grads_and_vars if g is not None]
critic_grads_and_vars = [
@@ -279,7 +303,7 @@ class DDPGPolicyGraph(TFPolicyGraph):
def extra_compute_grad_fetches(self):
return {
"td_error": self.td_error,
"td_error": self.loss.td_error,
}
def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
@@ -288,7 +312,7 @@ class DDPGPolicyGraph(TFPolicyGraph):
def compute_td_error(self, obs_t, act_t, rew_t, obs_tp1, done_mask,
importance_weights):
td_err = self.sess.run(
self.td_error,
self.loss.td_error,
feed_dict={
self.obs_t: [np.array(ob) for ob in obs_t],
self.act_t: act_t,
+218 -208
View File
@@ -18,6 +18,224 @@ Q_SCOPE = "q_func"
Q_TARGET_SCOPE = "target_q_func"
class QNetwork(object):
    """Q-value head on top of a model's last layer, optionally dueling.

    The result is stored in ``self.value``.

    Args:
        model: object exposing a ``last_layer`` tensor.
        num_actions: number of outputs of the action-score layer.
        dueling: when true, a separate one-unit state-value stream is built
            and added to the mean-centered action scores.
        hiddens: iterable of hidden layer sizes, used for both streams. The
            default is an immutable tuple so it cannot be shared and mutated
            across calls.
    """

    def __init__(self, model, num_actions, dueling=False, hiddens=(256,)):
        with tf.variable_scope("action_value"):
            action_out = model.last_layer
            for hidden in hiddens:
                action_out = layers.fully_connected(
                    action_out, num_outputs=hidden, activation_fn=tf.nn.relu)
            action_scores = layers.fully_connected(
                action_out, num_outputs=num_actions, activation_fn=None)

        if dueling:
            with tf.variable_scope("state_value"):
                state_out = model.last_layer
                for hidden in hiddens:
                    state_out = layers.fully_connected(
                        state_out, num_outputs=hidden,
                        activation_fn=tf.nn.relu)
                state_score = layers.fully_connected(
                    state_out, num_outputs=1, activation_fn=None)
            # Center the action scores by subtracting their mean over axis 1
            # before adding the state score.
            action_scores_mean = tf.reduce_mean(action_scores, 1)
            action_scores_centered = action_scores - tf.expand_dims(
                action_scores_mean, 1)
            self.value = state_score + action_scores_centered
        else:
            self.value = action_scores
class QValuePolicy(object):
    """Epsilon-greedy action choice over per-row Q-values.

    ``self.action`` holds the argmax action when ``stochastic`` is false;
    otherwise each row is replaced by a uniformly drawn action whenever a
    uniform draw for that row falls below ``eps``.
    """

    def __init__(self, q_values, observations, num_actions, stochastic, eps):
        greedy_actions = tf.argmax(q_values, axis=1)
        batch_size = tf.shape(observations)[0]

        # One random action id per row of the batch.
        uniform_actions = tf.random_uniform(
            tf.stack([batch_size]),
            minval=0,
            maxval=num_actions,
            dtype=tf.int64)
        # Per-row draw compared against eps to decide whether to explore.
        explore_draw = tf.random_uniform(
            tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32)
        explore_mask = explore_draw < eps
        epsilon_greedy_actions = tf.where(
            explore_mask, uniform_actions, greedy_actions)

        def _explore():
            return epsilon_greedy_actions

        def _exploit():
            return greedy_actions

        self.action = tf.cond(stochastic, _explore, _exploit)
class QLoss(object):
    """Importance-weighted Huber loss on the n-step TD error.

    Exposes ``self.td_error`` (the selected Q-value minus the gradient-stopped
    target) and ``self.loss`` (the mean of the weighted Huber errors).
    """

    def __init__(
            self, q_t_selected, q_tp1_best, importance_weights, rewards,
            done_mask, gamma=0.99, n_step=1):
        # Zero out the bootstrap value wherever done_mask is 1.
        not_done = 1.0 - done_mask
        bootstrap_value = not_done * q_tp1_best

        # Right-hand side of the Bellman equation.
        discount = gamma ** n_step
        td_target = rewards + discount * bootstrap_value

        # The target is treated as a constant when differentiating.
        self.td_error = q_t_selected - tf.stop_gradient(td_target)
        weighted_errors = importance_weights * _huber_loss(self.td_error)
        self.loss = tf.reduce_mean(weighted_errors)
class DQNPolicyGraph(TFPolicyGraph):
    """DQN policy graph: online/target Q networks, sampler and TD loss."""

    def __init__(self, observation_space, action_space, config):
        """Build the DQN graph and initialize the TFPolicyGraph base.

        Arguments:
            observation_space: Its `shape` sizes the observation
                placeholders.
            action_space: Must be `Discrete`; `n` is the action count.
            config (dict): Overrides layered on top of the DQN
                DEFAULT_CONFIG.

        Raises:
            UnsupportedSpaceException: If `action_space` is not Discrete.
        """
        config = dict(ray.rllib.dqn.dqn.DEFAULT_CONFIG, **config)
        if not isinstance(action_space, Discrete):
            raise UnsupportedSpaceException(
                "Action space {} is not supported for DQN.".format(
                    action_space))

        self.config = config
        self.cur_epsilon = 1.0
        num_actions = action_space.n

        def q_values_for(obs):
            # Builds a QNetwork for `obs` inside whatever variable scope
            # the caller has opened.
            frontend = ModelCatalog.get_model(obs, 1, config["model"])
            head = QNetwork(
                frontend, num_actions, config["dueling"], config["hiddens"])
            return head.value

        # Inputs used when sampling actions
        self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
        self.eps = tf.placeholder(tf.float32, (), name="eps")
        obs_shape = (None,) + observation_space.shape
        self.cur_observations = tf.placeholder(tf.float32, shape=obs_shape)

        # Online Q network on the sampling observations
        with tf.variable_scope(Q_SCOPE) as online_scope:
            sampling_q = q_values_for(self.cur_observations)
            self.q_func_vars = _scope_vars(online_scope.name)

        # Action output
        sampler = QValuePolicy(
            sampling_q, self.cur_observations, num_actions,
            self.stochastic, self.eps)
        self.output_actions = sampler.action

        # Replay batch inputs
        self.obs_t = tf.placeholder(tf.float32, shape=obs_shape)
        self.act_t = tf.placeholder(tf.int32, [None], name="action")
        self.rew_t = tf.placeholder(tf.float32, [None], name="reward")
        self.obs_tp1 = tf.placeholder(tf.float32, shape=obs_shape)
        self.done_mask = tf.placeholder(tf.float32, [None], name="done")
        self.importance_weights = tf.placeholder(
            tf.float32, [None], name="weight")

        # Online Q network evaluation (variables are reused)
        with tf.variable_scope(Q_SCOPE, reuse=True):
            q_t = q_values_for(self.obs_t)

        # Target Q network evaluation
        with tf.variable_scope(Q_TARGET_SCOPE) as target_scope:
            q_tp1 = q_values_for(self.obs_tp1)
            self.target_q_func_vars = _scope_vars(target_scope.name)

        # Q scores of the actions that were actually taken
        taken_mask = tf.one_hot(self.act_t, num_actions)
        q_t_selected = tf.reduce_sum(q_t * taken_mask, 1)

        # Estimate of the best value reachable from the state at t + 1
        if config["double_q"]:
            # The online network picks the action, the target network
            # supplies its value.
            with tf.variable_scope(Q_SCOPE, reuse=True):
                online_q_tp1 = q_values_for(self.obs_tp1)
            best_next_action = tf.argmax(online_q_tp1, 1)
            best_mask = tf.one_hot(best_next_action, num_actions)
            q_tp1_best = tf.reduce_sum(q_tp1 * best_mask, 1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)

        self.loss = QLoss(
            q_t_selected, q_tp1_best, self.importance_weights,
            self.rew_t, self.done_mask, config["gamma"], config["n_step"])

        # update_target_expr is run periodically to copy the online Q
        # network into the target Q network; variables are paired up by
        # sorted name.
        online_sorted = sorted(self.q_func_vars, key=lambda v: v.name)
        target_sorted = sorted(
            self.target_q_func_vars, key=lambda v: v.name)
        copy_ops = [
            dst.assign(src) for src, dst in zip(online_sorted, target_sorted)
        ]
        self.update_target_expr = tf.group(*copy_ops)

        # Initialize TFPolicyGraph
        self.sess = tf.get_default_session()
        self.loss_inputs = [
            ("obs", self.obs_t),
            ("actions", self.act_t),
            ("rewards", self.rew_t),
            ("new_obs", self.obs_tp1),
            ("dones", self.done_mask),
            ("weights", self.importance_weights),
        ]
        self.is_training = tf.placeholder_with_default(True, ())
        TFPolicyGraph.__init__(
            self,
            observation_space,
            action_space,
            self.sess,
            obs_input=self.cur_observations,
            action_sampler=self.output_actions,
            loss=self.loss.loss,
            loss_inputs=self.loss_inputs,
            is_training=self.is_training)
        self.sess.run(tf.global_variables_initializer())

    def optimizer(self):
        """Return an Adam optimizer using the configured `lr`."""
        return tf.train.AdamOptimizer(learning_rate=self.config["lr"])

    def gradients(self, optimizer):
        """Return (grad, var) pairs for the online Q variables.

        Gradients are clipped when `grad_norm_clipping` is configured;
        pairs whose gradient is None are dropped.
        """
        clip_val = self.config["grad_norm_clipping"]
        if clip_val is None:
            pairs = optimizer.compute_gradients(
                self.loss.loss, var_list=self.q_func_vars)
        else:
            pairs = _minimize_and_clip(
                optimizer, self.loss.loss, var_list=self.q_func_vars,
                clip_val=self.config["grad_norm_clipping"])
        return [(g, v) for (g, v) in pairs if g is not None]

    def extra_compute_action_feed_dict(self):
        """Feed stochastic sampling and the current epsilon."""
        feed = {self.stochastic: True}
        feed[self.eps] = self.cur_epsilon
        return feed

    def extra_compute_grad_fetches(self):
        """Also fetch the TD error when computing gradients."""
        return {"td_error": self.loss.td_error}

    def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
        """Delegate to `_postprocess_dqn`; other agents are ignored."""
        return _postprocess_dqn(self, sample_batch)

    def compute_td_error(
            self, obs_t, act_t, rew_t, obs_tp1, done_mask,
            importance_weights):
        """Evaluate the TD error tensor for the given replay batch."""
        feed = {
            self.obs_t: [np.array(ob) for ob in obs_t],
            self.act_t: act_t,
            self.rew_t: rew_t,
            self.obs_tp1: [np.array(ob) for ob in obs_tp1],
            self.done_mask: done_mask,
            self.importance_weights: importance_weights
        }
        return self.sess.run(self.loss.td_error, feed_dict=feed)

    def update_target(self):
        """Run the op that copies online variables into the target."""
        return self.sess.run(self.update_target_expr)

    def set_epsilon(self, epsilon):
        """Set the epsilon fed to the action sampler."""
        self.cur_epsilon = epsilon

    def get_state(self):
        """Return `[base state, current epsilon]`."""
        base_state = TFPolicyGraph.get_state(self)
        return [base_state, self.cur_epsilon]

    def set_state(self, state):
        """Restore a state produced by `get_state`."""
        base_state, epsilon = state[0], state[1]
        TFPolicyGraph.set_state(self, base_state)
        self.set_epsilon(epsilon)
def adjust_nstep(n_step, gamma, obs, actions, rewards, new_obs, dones):
"""Rewrites the given trajectory fragments to encode n-step rewards.
@@ -46,169 +264,6 @@ def adjust_nstep(n_step, gamma, obs, actions, rewards, new_obs, dones):
del arr[new_len:]
class DQNPolicyGraph(TFPolicyGraph):
    """DQN policy graph with the TD loss built inline."""

    def __init__(self, observation_space, action_space, config):
        """Build the DQN graph and initialize the TFPolicyGraph base.

        Arguments:
            observation_space: Its `shape` sizes the observation
                placeholders.
            action_space: Must be `Discrete`; `n` is the action count.
            config (dict): Overrides layered on top of the DQN
                DEFAULT_CONFIG.

        Raises:
            UnsupportedSpaceException: If `action_space` is not Discrete.
        """
        config = dict(ray.rllib.dqn.dqn.DEFAULT_CONFIG, **config)
        if not isinstance(action_space, Discrete):
            raise UnsupportedSpaceException(
                "Action space {} is not supported for DQN.".format(
                    action_space))

        self.config = config
        self.cur_epsilon = 1.0
        num_actions = action_space.n

        # Inputs used when sampling actions
        self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
        self.eps = tf.placeholder(tf.float32, (), name="eps")
        obs_shape = (None,) + observation_space.shape
        self.cur_observations = tf.placeholder(tf.float32, shape=obs_shape)

        # Online Q network on the sampling observations
        with tf.variable_scope(Q_SCOPE) as online_scope:
            sampling_q = _build_q_network(
                self.cur_observations, num_actions, config)
            self.q_func_vars = _scope_vars(online_scope.name)

        # Action output
        self.output_actions = _build_action_network(
            sampling_q, self.cur_observations, num_actions,
            self.stochastic, self.eps)

        # Replay batch inputs
        self.obs_t = tf.placeholder(tf.float32, shape=obs_shape)
        self.act_t = tf.placeholder(tf.int32, [None], name="action")
        self.rew_t = tf.placeholder(tf.float32, [None], name="reward")
        self.obs_tp1 = tf.placeholder(tf.float32, shape=obs_shape)
        self.done_mask = tf.placeholder(tf.float32, [None], name="done")
        self.importance_weights = tf.placeholder(
            tf.float32, [None], name="weight")

        # Online Q network evaluation (variables are reused)
        with tf.variable_scope(Q_SCOPE, reuse=True):
            q_t = _build_q_network(self.obs_t, num_actions, config)

        # Target Q network evaluation
        with tf.variable_scope(Q_TARGET_SCOPE) as target_scope:
            q_tp1 = _build_q_network(self.obs_tp1, num_actions, config)
            self.target_q_func_vars = _scope_vars(target_scope.name)

        # Q scores of the actions that were actually taken
        taken_mask = tf.one_hot(self.act_t, num_actions)
        q_t_selected = tf.reduce_sum(q_t * taken_mask, 1)

        # Estimate of the best value reachable from the state at t + 1
        if config["double_q"]:
            # The online network picks the action, the target network
            # supplies its value.
            with tf.variable_scope(Q_SCOPE, reuse=True):
                online_q_tp1 = _build_q_network(
                    self.obs_tp1, num_actions, config)
            best_next_action = tf.argmax(online_q_tp1, 1)
            best_mask = tf.one_hot(best_next_action, num_actions)
            q_tp1_best = tf.reduce_sum(q_tp1 * best_mask, 1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)

        # Zero the bootstrap value wherever done_mask is set
        not_done = 1.0 - self.done_mask
        bootstrap = not_done * q_tp1_best

        # Right-hand side of the Bellman equation
        discount = config["gamma"] ** config["n_step"]
        target = self.rew_t + discount * bootstrap

        # TD error; no gradient flows into the target
        self.td_error = q_t_selected - tf.stop_gradient(target)
        weighted = self.importance_weights * _huber_loss(self.td_error)
        self.loss = tf.reduce_mean(weighted)

        # update_target_expr is run periodically to copy the online Q
        # network into the target Q network; variables are paired up by
        # sorted name.
        online_sorted = sorted(self.q_func_vars, key=lambda v: v.name)
        target_sorted = sorted(
            self.target_q_func_vars, key=lambda v: v.name)
        copy_ops = [
            dst.assign(src) for src, dst in zip(online_sorted, target_sorted)
        ]
        self.update_target_expr = tf.group(*copy_ops)

        # Initialize TFPolicyGraph
        self.sess = tf.get_default_session()
        self.loss_inputs = [
            ("obs", self.obs_t),
            ("actions", self.act_t),
            ("rewards", self.rew_t),
            ("new_obs", self.obs_tp1),
            ("dones", self.done_mask),
            ("weights", self.importance_weights),
        ]
        self.is_training = tf.placeholder_with_default(True, ())
        TFPolicyGraph.__init__(
            self,
            observation_space,
            action_space,
            self.sess,
            obs_input=self.cur_observations,
            action_sampler=self.output_actions,
            loss=self.loss,
            loss_inputs=self.loss_inputs,
            is_training=self.is_training)
        self.sess.run(tf.global_variables_initializer())

    def optimizer(self):
        """Return an Adam optimizer using the configured `lr`."""
        return tf.train.AdamOptimizer(learning_rate=self.config["lr"])

    def gradients(self, optimizer):
        """Return (grad, var) pairs for the online Q variables.

        Gradients are clipped when `grad_norm_clipping` is configured;
        pairs whose gradient is None are dropped.
        """
        clip_val = self.config["grad_norm_clipping"]
        if clip_val is None:
            pairs = optimizer.compute_gradients(
                self.loss, var_list=self.q_func_vars)
        else:
            pairs = _minimize_and_clip(
                optimizer, self.loss, var_list=self.q_func_vars,
                clip_val=self.config["grad_norm_clipping"])
        return [(g, v) for (g, v) in pairs if g is not None]

    def extra_compute_action_feed_dict(self):
        """Feed stochastic sampling and the current epsilon."""
        feed = {self.stochastic: True}
        feed[self.eps] = self.cur_epsilon
        return feed

    def extra_compute_grad_fetches(self):
        """Also fetch the TD error when computing gradients."""
        return {"td_error": self.td_error}

    def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
        """Delegate to `_postprocess_dqn`; other agents are ignored."""
        return _postprocess_dqn(self, sample_batch)

    def compute_td_error(
            self, obs_t, act_t, rew_t, obs_tp1, done_mask,
            importance_weights):
        """Evaluate the TD error tensor for the given replay batch."""
        feed = {
            self.obs_t: [np.array(ob) for ob in obs_t],
            self.act_t: act_t,
            self.rew_t: rew_t,
            self.obs_tp1: [np.array(ob) for ob in obs_tp1],
            self.done_mask: done_mask,
            self.importance_weights: importance_weights
        }
        return self.sess.run(self.td_error, feed_dict=feed)

    def update_target(self):
        """Run the op that copies online variables into the target."""
        return self.sess.run(self.update_target_expr)

    def set_epsilon(self, epsilon):
        """Set the epsilon fed to the action sampler."""
        self.cur_epsilon = epsilon

    def get_state(self):
        """Return `[base state, current epsilon]`."""
        base_state = TFPolicyGraph.get_state(self)
        return [base_state, self.cur_epsilon]

    def set_state(self, state):
        """Restore a state produced by `get_state`."""
        base_state, epsilon = state[0], state[1]
        TFPolicyGraph.set_state(self, base_state)
        self.set_epsilon(epsilon)
def _postprocess_dqn(policy_graph, sample_batch):
obs, actions, rewards, new_obs, dones = [
list(x) for x in sample_batch.columns(
@@ -237,51 +292,6 @@ def _postprocess_dqn(policy_graph, sample_batch):
return batch
def _build_q_network(inputs, num_actions, config):
    """Build Q-value scores for `inputs`.

    Arguments:
        inputs (Tensor): Fed to the model obtained from ModelCatalog.
        num_actions (int): Number of outputs of the action-score layer.
        config (dict): Reads the "dueling", "hiddens" and "model" keys.

    Returns:
        Tensor: The action scores, or with "dueling" the state score plus
            the mean-centered action scores.
    """
    use_dueling = config["dueling"]
    hidden_sizes = config["hiddens"]
    features = ModelCatalog.get_model(inputs, 1, config["model"]).last_layer

    def _stream(scope_name, num_outputs):
        # ReLU hidden stack followed by a linear output layer, all created
        # inside the named variable scope.
        with tf.variable_scope(scope_name):
            out = features
            for size in hidden_sizes:
                out = layers.fully_connected(
                    out, num_outputs=size, activation_fn=tf.nn.relu)
            return layers.fully_connected(
                out, num_outputs=num_outputs, activation_fn=None)

    action_scores = _stream("action_value", num_actions)
    if not use_dueling:
        return action_scores

    state_score = _stream("state_value", 1)
    per_row_mean = tf.reduce_mean(action_scores, 1)
    centered = action_scores - tf.expand_dims(per_row_mean, 1)
    return state_score + centered
def _build_action_network(
        q_values, observations, num_actions, stochastic, eps):
    """Return an epsilon-greedy action tensor over a batch of Q-values.

    Arguments:
        q_values (Tensor): Q-values; the argmax is taken over axis 1.
        observations (Tensor): Only its leading dimension is read, to size
            the random draws.
        num_actions (int): Exclusive upper bound for random actions.
        stochastic (Tensor): Boolean predicate passed to `tf.cond`.
        eps (Tensor): Threshold the uniform [0, 1) draw is compared to.

    Returns:
        Tensor: Greedy actions when `stochastic` is False, otherwise the
            epsilon-greedy mix.
    """
    greedy = tf.argmax(q_values, axis=1)
    num_rows = tf.shape(observations)[0]

    # One uniformly drawn action per row.
    uniform_actions = tf.random_uniform(
        tf.stack([num_rows]),
        minval=0,
        maxval=num_actions,
        dtype=tf.int64)

    # One coin flip per row; rows whose draw is below eps explore.
    coin = tf.random_uniform(
        tf.stack([num_rows]), minval=0, maxval=1, dtype=tf.float32)
    explore = coin < eps

    eps_greedy = tf.where(explore, uniform_actions, greedy)
    return tf.cond(stochastic, lambda: eps_greedy, lambda: greedy)
def _huber_loss(x, delta=1.0):
"""Reference: https://en.wikipedia.org/wiki/Huber_loss"""
return tf.where(
+1 -3
View File
@@ -3,12 +3,10 @@ from ray.rllib.models.action_dist import (ActionDistribution, Categorical,
DiagGaussian, Deterministic)
from ray.rllib.models.model import Model
from ray.rllib.models.fcnet import FullyConnectedNetwork
from ray.rllib.models.convnet import ConvolutionalNetwork
from ray.rllib.models.lstm import LSTM
from ray.rllib.models.multiagentfcnet import MultiAgentFullyConnectedNetwork
__all__ = ["ActionDistribution", "ActionDistribution", "Categorical",
"DiagGaussian", "Deterministic", "ModelCatalog", "Model",
"FullyConnectedNetwork", "ConvolutionalNetwork", "LSTM",
"MultiAgentFullyConnectedNetwork"]
"FullyConnectedNetwork", "LSTM", "MultiAgentFullyConnectedNetwork"]
+5
View File
@@ -16,6 +16,7 @@ from ray.rllib.models.action_dist import (
from ray.rllib.models.preprocessors import get_preprocessor
from ray.rllib.models.fcnet import FullyConnectedNetwork
from ray.rllib.models.visionnet import VisionNetwork
from ray.rllib.models.lstm import LSTM
from ray.rllib.models.multiagentfcnet import MultiAgentFullyConnectedNetwork
@@ -31,6 +32,7 @@ MODEL_CONFIGS = [
"free_log_std", # Documented in ray.rllib.models.Model
"channel_major", # Pytorch conv requires images to be channel-major
"squash_to_range", # Whether to squash the action output to space range
"use_lstm", # Whether to use a LSTM model
# === Options for custom models ===
"custom_preprocessor", # Name of a custom preprocessor to use
@@ -148,6 +150,9 @@ class ModelCatalog(object):
return _global_registry.get(RLLIB_MODEL, model)(
inputs, num_outputs, options)
if options.get("use_lstm"):
return LSTM(inputs, num_outputs, options)
obs_rank = len(inputs.shape) - 1
# num_outputs > 1 used to avoid hitting this with the value function
-23
View File
@@ -1,23 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from ray.rllib.models.model import Model
from ray.rllib.models.misc import normc_initializer, conv2d, linear
class ConvolutionalNetwork(Model):
    """Generic convolutional network."""

    # TODO(rliaw): converge on one generic ConvNet model
    def _init(self, inputs, num_outputs, options):
        """Build four ELU conv layers and two linear layers.

        Arguments:
            inputs (Tensor): Network input; dimensions 1 and 2 of the conv
                output must be statically known for the flatten.
            num_outputs (int): Size of the `fc2` layer.
            options (dict): Unused here.

        Returns:
            tuple: `(fc2, fc1)`.
        """
        net = inputs
        with tf.name_scope("convnet"):
            # Layers are named "l1" .. "l4".
            for layer_idx in range(1, 5):
                layer_name = "l{}".format(layer_idx)
                net = tf.nn.elu(
                    conv2d(net, 32, layer_name, [3, 3], [2, 2]))

            rows = net.shape[1].value
            cols = net.shape[2].value
            flat = tf.reshape(net, [-1, rows * cols * 32])

            fc1 = linear(flat, 256, "fc1")
            # NOTE(review): fc2 reads the flattened conv features, not
            # fc1 -- confirm this is intended.
            fc2 = linear(flat, num_outputs, "fc2", normc_initializer(0.01))
            return fc2, fc1
-49
View File
@@ -1,49 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import tensorflow.contrib.slim as slim
from ray.rllib.models.model import Model
class DDPGActor(Model):
    """Actor network for DDPG."""

    def _init(self, inputs, num_outputs, options):
        """Build a 400-300 ReLU stack with a scaled tanh output.

        Arguments:
            inputs (Tensor): Input to the first layer.
            num_outputs (int): Size of the tanh output layer.
            options (dict): Must contain "action_bound", the factor the
                tanh output is multiplied by.

        Returns:
            tuple: `(scaled output, last hidden layer)`.
        """
        hidden_init = tf.truncated_normal_initializer()
        output_init = tf.random_uniform_initializer(
            minval=-0.003, maxval=0.003)
        bound = options["action_bound"]

        hidden1 = slim.fully_connected(
            inputs,
            400,
            activation_fn=tf.nn.relu,
            weights_initializer=hidden_init)
        hidden2 = slim.fully_connected(
            hidden1,
            300,
            activation_fn=tf.nn.relu,
            weights_initializer=hidden_init)
        squashed = slim.fully_connected(
            hidden2,
            num_outputs,
            activation_fn=tf.nn.tanh,
            weights_initializer=output_init)

        return tf.multiply(squashed, bound), hidden2
class DDPGCritic(Model):
    """Critic network for DDPG."""

    def _init(self, inputs, num_outputs, options):
        """Build the critic from an (observation, action) pair.

        Arguments:
            inputs: Two-item sequence unpacked as `(obs, action)`.
            num_outputs (int): Unused; the output layer has one unit.
            options (dict): Unused here.

        Returns:
            tuple: `(single-unit output, merged hidden layer)`.
        """
        observation, act = inputs
        hidden_init = tf.truncated_normal_initializer()
        output_init = tf.random_uniform_initializer(
            minval=-0.0003, maxval=0.0003)

        obs_hidden = slim.fully_connected(
            observation,
            400,
            activation_fn=tf.nn.relu,
            weights_initializer=hidden_init)

        # Two linear 300-unit projections, one per branch; the observation
        # branch is created without a bias.
        obs_branch = slim.fully_connected(
            obs_hidden,
            300,
            activation_fn=None,
            biases_initializer=None,
            weights_initializer=hidden_init)
        act_branch = slim.fully_connected(
            act,
            300,
            activation_fn=None,
            weights_initializer=hidden_init)

        merged = tf.nn.relu(tf.add(obs_branch, act_branch))
        q_out = slim.fully_connected(
            merged, 1, activation_fn=None, weights_initializer=output_init)
        return q_out, merged
+6
View File
@@ -27,9 +27,15 @@ class Model(object):
inputs (Tensor): The input placeholder for this model.
outputs (Tensor): The output vector of this model.
last_layer (Tensor): The network layer right before the model output.
state_init (list): List of initial recurrent state tensors (if any).
state_in (list): List of input recurrent state tensors (if any).
state_out (list): List of output recurrent state tensors (if any).
"""
def __init__(self, inputs, num_outputs, options):
self.state_init = []
self.state_in = []
self.state_out = []
self.inputs = inputs
if options.get("free_log_std", False):
assert num_outputs % 2 == 0
+1 -11
View File
@@ -7,18 +7,8 @@ import numpy as np
import torch
def convert_batch(trajectory):
    """Convert trajectory columns from numpy arrays to torch tensors.

    Arguments:
        trajectory: Mapping with numpy arrays under "obs", "actions",
            "advantages" and "rewards".

    Returns:
        tuple: `(states, acs, advs, rs)` where states, advs and rs are
            float tensors (advs and rs flattened to 1-D) and acs keeps the
            dtype of the "actions" array.
    """
    def _flat_float(column):
        # Float tensor flattened to one dimension.
        return torch.from_numpy(column).float().reshape(-1)

    observations = torch.from_numpy(trajectory["obs"]).float()
    taken_actions = torch.from_numpy(trajectory["actions"])
    # The advantages array is copied before it is wrapped.
    advantages = _flat_float(trajectory["advantages"].copy())
    rewards = _flat_float(trajectory["rewards"])
    return observations, taken_actions, advantages, rewards
def var_to_np(var):
    """Return the value of a torch tensor as a numpy array.

    The tensor is moved to host memory and detached from autograd before
    the conversion, since `.numpy()` only works on CPU tensors that do not
    require grad.

    Arguments:
        var (torch.Tensor): Tensor to convert.

    Returns:
        numpy.ndarray: The tensor's data.
    """
    return var.cpu().detach().numpy()
def normc_initializer(std=1.0):
+9 -2
View File
@@ -5,11 +5,17 @@ from __future__ import print_function
import collections
import numpy as np
# Default policy ID for single-agent environments
DEFAULT_POLICY_ID = "default"
def to_float_array(v):
    """Convert `v` to a numpy array, narrowing float64 data to float32.

    Arrays of any other dtype are returned as converted.
    """
    converted = np.array(v)
    is_double = converted.dtype == np.float64
    # float32 halves the memory used by double-precision columns.
    return converted.astype(np.float32) if is_double else converted
class SampleBatchBuilder(object):
"""Util to build a SampleBatch incrementally.
@@ -38,7 +44,8 @@ class SampleBatchBuilder(object):
def build_and_reset(self):
"""Returns a sample batch including all previously added values."""
batch = SampleBatch({k: np.array(v) for k, v in self.buffers.items()})
batch = SampleBatch(
{k: to_float_array(v) for k, v in self.buffers.items()})
self.buffers.clear()
self.count = 0
return batch
+25 -21
View File
@@ -6,42 +6,46 @@ import tensorflow as tf
import ray
from ray.rllib.models.catalog import ModelCatalog
from ray.rllib.utils.process_rollout import compute_advantages
from ray.rllib.utils.postprocessing import compute_advantages
from ray.rllib.utils.tf_policy_graph import TFPolicyGraph
class PGPolicyGraph(TFPolicyGraph):
class PGLoss(object):
    """Policy-gradient loss.

    Attributes:
        loss (Tensor): Negated mean of the action log-probabilities
            weighted by the advantages.
    """

    def __init__(self, action_dist, actions, advantages):
        """Build `loss` from `action_dist.logp(actions) * advantages`."""
        weighted_logp = action_dist.logp(actions) * advantages
        self.loss = -tf.reduce_mean(weighted_logp)
class PGPolicyGraph(TFPolicyGraph):
def __init__(self, obs_space, action_space, config):
config = dict(ray.rllib.pg.pg.DEFAULT_CONFIG, **config)
self.config = config
# setup policy
self.x = tf.placeholder(tf.float32, shape=[None]+list(obs_space.shape))
# Setup policy
obs = tf.placeholder(tf.float32, shape=[None]+list(obs_space.shape))
dist_class, self.logit_dim = ModelCatalog.get_action_dist(
action_space, self.config["model"])
self.model = ModelCatalog.get_model(
self.x, self.logit_dim, options=self.config["model"])
self.dist = dist_class(self.model.outputs) # logit for each action
model = ModelCatalog.get_model(
obs, self.logit_dim, options=self.config["model"])
action_dist = dist_class(model.outputs) # logit for each action
# setup policy loss
self.ac = ModelCatalog.get_action_placeholder(action_space)
self.adv = tf.placeholder(tf.float32, [None], name="adv")
self.loss = -tf.reduce_mean(self.dist.logp(self.ac) * self.adv)
# Setup policy loss
actions = ModelCatalog.get_action_placeholder(action_space)
advantages = tf.placeholder(tf.float32, [None], name="adv")
loss = PGLoss(action_dist, actions, advantages).loss
# initialize TFPolicyGraph
self.sess = tf.get_default_session()
self.loss_in = [
("obs", self.x),
("actions", self.ac),
("advantages", self.adv),
# Initialize TFPolicyGraph
sess = tf.get_default_session()
loss_in = [
("obs", obs),
("actions", actions),
("advantages", advantages),
]
self.is_training = tf.placeholder_with_default(True, ())
TFPolicyGraph.__init__(
self, obs_space, action_space, self.sess, obs_input=self.x,
action_sampler=self.dist.sample(), loss=self.loss,
loss_inputs=self.loss_in, is_training=self.is_training)
self.sess.run(tf.global_variables_initializer())
self, obs_space, action_space, sess, obs_input=obs,
action_sampler=action_dist.sample(), loss=loss,
loss_inputs=loss_in, is_training=self.is_training)
sess.run(tf.global_variables_initializer())
def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
return compute_advantages(
+1 -1
View File
@@ -11,7 +11,7 @@ from ray.rllib.optimizers import SampleBatch, TFMultiGPUSupport
from ray.rllib.models import ModelCatalog
from ray.rllib.utils.sampler import SyncSampler
from ray.rllib.utils.filter import get_filter, MeanStdFilter
from ray.rllib.utils.process_rollout import compute_advantages
from ray.rllib.utils.postprocessing import compute_advantages
from ray.rllib.ppo.loss import ProximalPolicyGraph
@@ -25,7 +25,7 @@ CONFIGS = {
"DQN": {},
"DDPG": {"noise_scale": 0.0, "timesteps_per_iteration": 100},
"PPO": {"num_sgd_iter": 5, "timesteps_per_batch": 1000, "num_workers": 2},
"A3C": {"use_lstm": False, "num_workers": 1},
"A3C": {"num_workers": 1},
}
@@ -11,7 +11,7 @@ from ray.rllib.pg import PGAgent
from ray.rllib.utils.common_policy_evaluator import CommonPolicyEvaluator, \
collect_metrics
from ray.rllib.utils.policy_graph import PolicyGraph
from ray.rllib.utils.process_rollout import compute_advantages
from ray.rllib.utils.postprocessing import compute_advantages
from ray.rllib.utils.vector_env import VectorEnv
from ray.tune.registry import register_env
@@ -42,21 +42,6 @@ halfcheetah-ddpg:
learning_starts: 500
sample_batch_size: 1
train_batch_size: 64
smoothing_num_episodes: 10
# === Tensorflow ===
tf_session_args: {
"device_count": {
"CPU": 2
},
"log_device_placement": False,
"allow_soft_placement": True,
"gpu_options": {
"allow_growth": True
},
"inter_op_parallelism_threads": 1,
"intra_op_parallelism_threads": 1,
}
# === Parallelism ===
num_workers: 0
@@ -42,21 +42,6 @@ mountaincarcontinuous-ddpg:
learning_starts: 1000
sample_batch_size: 1
train_batch_size: 64
smoothing_num_episodes: 10
# === Tensorflow ===
tf_session_args: {
"device_count": {
"CPU": 2
},
"log_device_placement": False,
"allow_soft_placement": True,
"gpu_options": {
"allow_growth": True
},
"inter_op_parallelism_threads": 1,
"intra_op_parallelism_threads": 1,
}
# === Parallelism ===
num_workers: 0
@@ -42,21 +42,6 @@ pendulum-ddpg:
learning_starts: 500
sample_batch_size: 1
train_batch_size: 64
smoothing_num_episodes: 10
# === Tensorflow ===
tf_session_args: {
"device_count": {
"CPU": 2
},
"log_device_placement": False,
"allow_soft_placement": True,
"gpu_options": {
"allow_growth": True
},
"inter_op_parallelism_threads": 1,
"intra_op_parallelism_threads": 1,
}
# === Parallelism ===
num_workers: 0
@@ -4,7 +4,6 @@ pong-a3c-pytorch-cnn:
config:
num_workers: 16
batch_size: 20
use_lstm: false
use_pytorch: true
vf_loss_coeff: 0.5
entropy_coeff: -0.01
@@ -15,6 +14,7 @@ pong-a3c-pytorch-cnn:
observation_filter: NoFilter
reward_filter: NoFilter
model:
use_lstm: false
channel_major: true
dim: 80
grayscale: true
@@ -2,9 +2,8 @@ pong-a3c:
env: PongDeterministic-v4
run: A3C
config:
num_workers: 16
num_workers: 1
batch_size: 20
use_lstm: true
use_pytorch: false
vf_loss_coeff: 0.5
entropy_coeff: -0.01
@@ -15,6 +14,7 @@ pong-a3c:
observation_filter: NoFilter
reward_filter: NoFilter
model:
use_lstm: true
channel_major: false
dim: 42
grayscale: true
+6 -2
View File
@@ -24,9 +24,13 @@ class PolicyGraph(object):
def __init__(self, observation_space, action_space, config):
"""Initialize the graph.
This is the standard constructor for policy graphs. The policy graph
class you pass into CommonPolicyEvaluator will be constructed with
these arguments.
Args:
observation_space (gym.Space): Observation space of the env.
action_space (gym.Space): Action space of the env.
observation_space (gym.Space): Observation space of the policy.
action_space (gym.Space): Action space of the policy.
config (dict): Policy-specific configuration data.
"""
@@ -23,7 +23,8 @@ def compute_advantages(rollout, last_r, gamma, lambda_=1.0, use_gae=True):
Returns:
SampleBatch (SampleBatch): Object with experience from rollout and
processed rewards."""
processed rewards.
"""
traj = {}
trajsize = len(rollout["actions"])
@@ -37,13 +38,14 @@ def compute_advantages(rollout, last_r, gamma, lambda_=1.0, use_gae=True):
# This formula for the advantage comes
# "Generalized Advantage Estimation": https://arxiv.org/abs/1506.02438
traj["advantages"] = discount(delta_t, gamma * lambda_)
traj["value_targets"] = traj["advantages"] + traj["vf_preds"]
traj["value_targets"] = (
traj["advantages"] + traj["vf_preds"]).copy().astype(np.float32)
else:
rewards_plus_v = np.concatenate(
[rollout["rewards"], np.array([last_r])])
traj["advantages"] = discount(rewards_plus_v, gamma)[:-1]
traj["advantages"] = traj["advantages"].copy()
traj["advantages"] = traj["advantages"].copy().astype(np.float32)
assert all(val.shape[0] == trajsize for val in traj.values()), \
"Rollout stacked incorrectly!"
+1 -1
View File
@@ -219,7 +219,7 @@ def _env_runner(
else:
all_done = False
# At least send an empty dict if not done
actions_to_send[env_id]
actions_to_send[env_id] = {}
# For each agent in the environment
for agent_id, raw_obs in agent_obs.items():
+7 -2
View File
@@ -18,6 +18,10 @@ class TFPolicyGraph(PolicyGraph):
All input and output tensors are of shape [BATCH_DIM, ...].
Attributes:
observation_space (gym.Space): observation space of the policy.
action_space (gym.Space): action space of the policy.
Examples:
>>> policy = TFPolicyGraphSubclass(
sess, obs_input, action_sampler, loss, loss_inputs, is_training)
@@ -33,7 +37,7 @@ class TFPolicyGraph(PolicyGraph):
self, observation_space, action_space, sess, obs_input,
action_sampler, loss, loss_inputs,
is_training, state_inputs=None, state_outputs=None):
"""Initialize the policy.
"""Initialize the policy graph.
Arguments:
observation_space (gym.Space): Observation space of the env.
@@ -71,7 +75,8 @@ class TFPolicyGraph(PolicyGraph):
self._loss, self._sess)
assert len(self._state_inputs) == len(self._state_outputs) == \
len(self.get_initial_state())
len(self.get_initial_state()), \
(self._state_inputs, self._state_outputs, self.get_initial_state())
def build_compute_actions(
self, builder, obs_batch, state_batches=None, is_training=False):
@@ -0,0 +1,104 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from threading import Lock
import torch
import torch.nn.functional as F
from ray.rllib.models.pytorch.misc import var_to_np
from ray.rllib.utils.policy_graph import PolicyGraph
class TorchPolicyGraph(PolicyGraph):
    """Template for a PyTorch policy and loss to use with RLlib.

    This is similar to TFPolicyGraph, but for PyTorch.

    Attributes:
        observation_space (gym.Space): observation space of the policy.
        action_space (gym.Space): action space of the policy.
        lock (Lock): Lock that must be held around PyTorch ops on this graph.
            This is necessary when using the async sampler.
    """

    def __init__(
            self, observation_space, action_space, model, loss, loss_inputs):
        """Build a policy graph from policy and loss torch modules.

        Note that module inputs will be CPU tensors. The model and loss
        modules are responsible for moving inputs to the right device.

        Arguments:
            observation_space (gym.Space): observation space of the policy.
            action_space (gym.Space): action space of the policy.
            model (nn.Module): PyTorch policy module. Given observations as
                input, this module must return a list of outputs where the
                first item is the action logits, and the remainder can be
                any value.
            loss (nn.Module): Loss defined as a PyTorch module. The inputs
                for this module are defined by the `loss_inputs` param. This
                module returns a single scalar loss.
            loss_inputs (list): List of SampleBatch columns that will be
                passed to the loss module's forward() function when
                computing the loss. For example,
                ["obs", "action", "advantages"].
        """
        self.observation_space = observation_space
        self.action_space = action_space
        self.lock = Lock()
        self._model = model
        self._loss = loss
        self._loss_inputs = loss_inputs
        # optimizer() reads self._model, so it must run after the
        # assignments above.
        self._optimizer = self.optimizer()

    def extra_action_out(self, model_out):
        """Returns dict of extra info to include in experience batch.

        Arguments:
            model_out (list): Outputs of the policy model module.
        """
        return {}

    def optimizer(self):
        """Custom PyTorch optimizer to use."""
        return torch.optim.Adam(self._model.parameters())

    def compute_actions(
            self, obs_batch, state_batches=None, is_training=False):
        """Sample one action per observation from the model's logits.

        Arguments:
            obs_batch: Batch of observations, converted with `np.array`.
            state_batches: Must be empty; RNN state is not supported.
            is_training (bool): Unused here.

        Returns:
            tuple: `(actions, [], extra)` where `actions` is a numpy array
                with one sampled action index per observation and `extra`
                is the result of `extra_action_out`.

        Raises:
            NotImplementedError: If `state_batches` is non-empty.
        """
        if state_batches:
            raise NotImplementedError("Torch RNN support")

        with self.lock:
            with torch.no_grad():
                ob = torch.from_numpy(np.array(obs_batch)).float()
                model_out = self._model(ob)
                logits = model_out[0]  # assume the first output is the logits
                # multinomial(1) yields shape [batch, 1]; drop the sample
                # dimension (dim 1). Squeezing dim 0 would only collapse
                # anything when the batch size is 1 and would leave a
                # trailing [batch, 1] shape otherwise.
                actions = F.softmax(logits, dim=1).multinomial(1).squeeze(1)
                return var_to_np(actions), [], self.extra_action_out(model_out)

    def compute_gradients(self, postprocessed_batch):
        """Compute gradients of the loss for one postprocessed batch.

        Arguments:
            postprocessed_batch: Mapping from column name to numpy array;
                the columns named in `loss_inputs` are passed to the loss
                module in order.

        Returns:
            tuple: `(grads, {})` where `grads` holds one numpy array per
                model parameter.
        """
        with self.lock:
            loss_in = [
                torch.from_numpy(postprocessed_batch[key])
                for key in self._loss_inputs
            ]
            loss_out = self._loss(*loss_in)
            self._optimizer.zero_grad()
            loss_out.backward()
            # Note that return values are just references;
            # calling zero_grad will modify the values
            grads = [var_to_np(p.grad.data) for p in self._model.parameters()]
            return grads, {}

    def apply_gradients(self, gradients):
        """Install the given gradients and take one optimizer step.

        Arguments:
            gradients (list): One numpy array per model parameter, in
                `parameters()` order.

        Returns:
            dict: Always empty.
        """
        with self.lock:
            for g, p in zip(gradients, self._model.parameters()):
                p.grad = torch.from_numpy(g)
            self._optimizer.step()
            return {}

    def get_weights(self):
        """Return the model's `state_dict()`."""
        with self.lock:
            return self._model.state_dict()

    def set_weights(self, weights):
        """Load `weights` into the model via `load_state_dict`."""
        with self.lock:
            self._model.load_state_dict(weights)