[rllib] Refactor rllib to have a common sample collection pathway (#2149)

This commit is contained in:
Eric Liang
2018-06-09 00:21:35 -07:00
committed by Richard Liaw
parent cb5e6e6d68
commit 71eb558eb0
54 changed files with 1981 additions and 2192 deletions
+9
View File
@@ -6,6 +6,11 @@ from __future__ import print_function
# This file is imported from the tune module in order to register RLlib agents.
from ray.tune.registry import register_trainable
from ray.rllib.utils.policy_graph import PolicyGraph
from ray.rllib.utils.tf_policy_graph import TFPolicyGraph
from ray.rllib.utils.common_policy_evaluator import CommonPolicyEvaluator
from ray.rllib.optimizers.sample_batch import SampleBatch
def _register_all():
for key in ["PPO", "ES", "DQN", "APEX", "A3C", "BC", "PG", "DDPG",
@@ -16,3 +21,7 @@ def _register_all():
_register_all()
__all__ = [
"PolicyGraph", "TFPolicyGraph", "CommonPolicyEvaluator", "SampleBatch"
]
+49 -53
View File
@@ -2,7 +2,6 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import pickle
import os
@@ -10,14 +9,14 @@ import ray
from ray.rllib.agent import Agent
from ray.rllib.optimizers import AsyncOptimizer
from ray.rllib.utils import FilterManager
from ray.rllib.a3c.a3c_evaluator import A3CEvaluator, RemoteA3CEvaluator, \
GPURemoteA3CEvaluator
from ray.tune.result import TrainingResult
from ray.rllib.utils.common_policy_evaluator import CommonPolicyEvaluator, \
collect_metrics
from ray.rllib.a3c.common import get_policy_cls
from ray.tune.trial import Resources
DEFAULT_CONFIG = {
# Number of workers (excluding master)
"num_workers": 4,
"num_workers": 2,
# Size of rollout batch
"batch_size": 10,
# Use LSTM model - only applicable for image states
@@ -42,6 +41,8 @@ DEFAULT_CONFIG = {
"entropy_coeff": -0.01,
# Whether to place workers on GPUs
"use_gpu_for_workers": False,
# Whether to emit extra summary stats
"summarize": False,
# Model and preprocessor options
"model": {
# (Image statespace) - Converts image to Channels = 1
@@ -78,56 +79,48 @@ class A3CAgent(Agent):
extra_gpu=cf["use_gpu_for_workers"] and cf["num_workers"] or 0)
def _init(self):
self.local_evaluator = A3CEvaluator(
self.registry,
self.env_creator,
self.config,
self.logdir,
start_sampler=False)
if self.config["use_gpu_for_workers"]:
remote_cls = GPURemoteA3CEvaluator
self.policy_cls = get_policy_cls(self.config)
if self.config["use_pytorch"]:
session_creator = None
else:
remote_cls = RemoteA3CEvaluator
import tensorflow as tf
def session_creator():
return tf.Session(
config=tf.ConfigProto(
intra_op_parallelism_threads=1,
inter_op_parallelism_threads=1,
gpu_options=tf.GPUOptions(allow_growth=True)))
remote_cls = CommonPolicyEvaluator.as_remote(
num_gpus=1 if self.config["use_gpu_for_workers"] else 0)
self.local_evaluator = CommonPolicyEvaluator(
self.env_creator, self.policy_cls,
batch_steps=self.config["batch_size"],
batch_mode="truncate_episodes",
tf_session_creator=session_creator,
registry=self.registry, env_config=self.config["env_config"],
model_config=self.config["model"], policy_config=self.config)
self.remote_evaluators = [
remote_cls.remote(self.registry, self.env_creator, self.config,
self.logdir)
for i in range(self.config["num_workers"])
]
self.optimizer = AsyncOptimizer(self.config["optimizer"],
self.local_evaluator,
self.remote_evaluators)
remote_cls.remote(
self.env_creator, self.policy_cls,
batch_steps=self.config["batch_size"],
batch_mode="truncate_episodes", sample_async=True,
tf_session_creator=session_creator,
registry=self.registry, env_config=self.config["env_config"],
model_config=self.config["model"], policy_config=self.config)
for i in range(self.config["num_workers"])]
self.optimizer = AsyncOptimizer(
self.config["optimizer"], self.local_evaluator,
self.remote_evaluators)
def _train(self):
self.optimizer.step()
FilterManager.synchronize(self.local_evaluator.filters,
self.remote_evaluators)
res = self._fetch_metrics_from_remote_evaluators()
return res
def _fetch_metrics_from_remote_evaluators(self):
episode_rewards = []
episode_lengths = []
metric_lists = [
a.get_completed_rollout_metrics.remote()
for a in self.remote_evaluators
]
for metrics in metric_lists:
for episode in ray.get(metrics):
episode_lengths.append(episode.episode_length)
episode_rewards.append(episode.episode_reward)
avg_reward = (np.mean(episode_rewards)
if episode_rewards else float('nan'))
avg_length = (np.mean(episode_lengths)
if episode_lengths else float('nan'))
timesteps = np.sum(episode_lengths) if episode_lengths else 0
result = TrainingResult(
episode_reward_mean=avg_reward,
episode_len_mean=avg_length,
timesteps_this_iter=timesteps,
info={})
return result
FilterManager.synchronize(
self.local_evaluator.filters, self.remote_evaluators)
return collect_metrics(self.local_evaluator, self.remote_evaluators)
def _stop(self):
# workaround for https://github.com/ray-project/ray/issues/1516
@@ -154,7 +147,10 @@ class A3CAgent(Agent):
])
self.local_evaluator.restore(extra_data["local_state"])
def compute_action(self, observation):
def compute_action(self, observation, state=None):
if state is None:
state = []
obs = self.local_evaluator.obs_filter(observation, update=False)
action, info = self.local_evaluator.policy.compute(obs)
return action
return self.local_evaluator.for_policy(
lambda p: p.compute_single_action(
obs, state, is_training=False)[0])
-119
View File
@@ -1,119 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import pickle
import ray
from ray.rllib.models import ModelCatalog
from ray.rllib.optimizers import PolicyEvaluator
from ray.rllib.a3c.common import get_policy_cls
from ray.rllib.utils.filter import get_filter
from ray.rllib.utils.sampler import AsyncSampler
from ray.rllib.utils.process_rollout import process_rollout
class A3CEvaluator(PolicyEvaluator):
    """Evaluator that samples from an environment and computes gradients.

    Attributes:
        env: The environment returned by
            ModelCatalog.get_preprocessor_as_wrapper.
        policy: Policy instance handed to the sampler and used for
            gradient computation and weight access.
        obs_filter: Observation filter passed to the sampler.
        rew_filter: Reward filter passed to process_rollout.
        filters: Dict holding both filters, keyed by "obs_filter" and
            "rew_filter".
        sampler: AsyncSampler that produces rollouts and episode metrics.
        logdir: Directory for logging.
    """

    def __init__(
            self, registry, env_creator, config, logdir, start_sampler=True):
        """Build the environment, policy, filters and sampler.

        Args:
            registry: Passed through to ModelCatalog and the policy class.
            env_creator: Callable invoked with config["env_config"].
            config (dict): Agent configuration.
            logdir: Directory for logging; only stored on the instance.
            start_sampler (bool): If True, and the sampler reports itself
                as async, the sampler is started right away.
        """
        raw_env = env_creator(config["env_config"])
        self.env = ModelCatalog.get_preprocessor_as_wrapper(
            registry, raw_env, config["model"])
        obs_shape = self.env.observation_space.shape

        # TODO(rliaw): should change this to be just env.observation_space
        self.policy = get_policy_cls(config)(
            registry, obs_shape, self.env.action_space, config)
        self.config = config

        # Technically not needed when not remote
        self.obs_filter = get_filter(config["observation_filter"], obs_shape)
        self.rew_filter = get_filter(config["reward_filter"], ())
        self.filters = {
            "obs_filter": self.obs_filter,
            "rew_filter": self.rew_filter,
        }

        self.sampler = AsyncSampler(
            self.env, self.policy, self.obs_filter, config["batch_size"])
        if start_sampler and self.sampler._async:
            self.sampler.start()
        self.logdir = logdir

    def sample(self):
        """Fetch one rollout from the sampler and post-process it (GAE)."""
        return process_rollout(
            self.sampler.get_data(),
            self.rew_filter,
            gamma=self.config["gamma"],
            lambda_=self.config["lambda"],
            use_gae=True)

    def get_completed_rollout_metrics(self):
        """Returns metrics on previously completed rollouts.

        Calling this clears the queue of completed rollout metrics.
        """
        return self.sampler.get_metrics()

    def compute_gradients(self, samples):
        """Return the policy's gradient for `samples` and an empty info dict.

        The info value returned by the policy is intentionally discarded.
        """
        gradient, _ = self.policy.compute_gradients(samples)
        return gradient, {}

    def apply_gradients(self, grads):
        """Forward `grads` to the policy."""
        self.policy.apply_gradients(grads)

    def get_weights(self):
        """Return the policy's weights."""
        return self.policy.get_weights()

    def set_weights(self, params):
        """Overwrite the policy's weights with `params`."""
        self.policy.set_weights(params)

    def save(self):
        """Pickle the (flushed) filters together with the policy weights."""
        state = {
            "filters": self.get_filters(flush_after=True),
            "weights": self.get_weights(),
        }
        return pickle.dumps(state)

    def restore(self, objs):
        """Restore filters and weights from a blob produced by `save`."""
        state = pickle.loads(objs)
        self.sync_filters(state["filters"])
        self.set_weights(state["weights"])

    def sync_filters(self, new_filters):
        """Changes self's filter to given and rebases any accumulated delta.

        Args:
            new_filters (dict): Filters with new state to update local copy.
        """
        assert all(name in new_filters for name in self.filters)
        for name, current in self.filters.items():
            current.sync(new_filters[name])

    def get_filters(self, flush_after=False):
        """Returns a snapshot of filters.

        Args:
            flush_after (bool): Clears the filter buffer state.

        Returns:
            return_filters (dict): Dict for serializable filters
        """
        snapshot = {}
        for name in self.filters:
            flt = self.filters[name]
            snapshot[name] = flt.as_serializable()
            if flush_after:
                flt.clear_buffer()
        return snapshot
RemoteA3CEvaluator = ray.remote(A3CEvaluator)
GPURemoteA3CEvaluator = ray.remote(num_gpus=1)(A3CEvaluator)
+103
View File
@@ -0,0 +1,103 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import gym
from ray.rllib.utils.error import UnsupportedSpaceException
from ray.rllib.utils.process_rollout import compute_advantages
from ray.rllib.utils.tf_policy_graph import TFPolicyGraph
class A3CTFPolicyGraph(TFPolicyGraph):
    """Base class for the A3C TensorFlow policy graphs.

    Subclasses implement `_setup_graph`, which must define at least the
    attributes `vf`, `logits`, `x` and `var_list` (asserted in `__init__`).
    This class also reads `action_dist`, `state_in`, `state_out` and
    `loss_in` from the subclass when wiring up `TFPolicyGraph`.
    """

    def __init__(self, ob_space, action_space, registry, config):
        """Build the graph and loss, then initialize the base policy graph.

        Args:
            ob_space: Observation space, forwarded to `_setup_graph`.
            action_space: Action space (gym Box or Discrete).
            registry: Stored on the instance for use by subclasses.
            config (dict): Policy configuration. Keys read here and in the
                other methods: "summarize" (optional), "vf_loss_coeff",
                "entropy_coeff", "lr", "grad_clip", "gamma", "lambda".
        """
        self.registry = registry
        self.local_steps = 0
        self.config = config
        self.summarize = config.get("summarize")

        self._setup_graph(ob_space, action_space)
        assert all(hasattr(self, attr)
                   for attr in ["vf", "logits", "x", "var_list"])
        print("Setting up loss")
        self.setup_loss(action_space)
        self.is_training = tf.placeholder_with_default(True, ())
        # The session is taken from the ambient default; the caller is
        # expected to have entered one before constructing the policy.
        self.sess = tf.get_default_session()

        TFPolicyGraph.__init__(
            self, self.sess, obs_input=self.x,
            action_sampler=self.action_dist.sample(), loss=self.loss,
            loss_inputs=self.loss_in, is_training=self.is_training,
            state_inputs=self.state_in, state_outputs=self.state_out)

        self.sess.run(tf.global_variables_initializer())

        if self.summarize:
            bs = tf.to_float(tf.shape(self.x)[0])
            # Tag is "policy_loss": this scalar is the policy loss, and it
            # matches the tag used before the refactor.
            tf.summary.scalar("model/policy_loss", self.pi_loss / bs)
            tf.summary.scalar("model/value_loss", self.vf_loss / bs)
            tf.summary.scalar("model/entropy", self.entropy / bs)
            # NOTE(review): self._grads is not assigned in this class;
            # presumably TFPolicyGraph.__init__ sets it -- confirm.
            tf.summary.scalar("model/grad_gnorm", tf.global_norm(self._grads))
            tf.summary.scalar("model/var_gnorm", tf.global_norm(self.var_list))
            self.summary_op = tf.summary.merge_all()

    def _setup_graph(self, ob_space, ac_space):
        """Define the model graph. Must be implemented by subclasses."""
        raise NotImplementedError

    def setup_loss(self, action_space):
        """Create the loss placeholders and the combined A3C loss.

        Args:
            action_space: gym.spaces.Box or gym.spaces.Discrete.

        Raises:
            UnsupportedSpaceException: For any other action space type.
        """
        if isinstance(action_space, gym.spaces.Box):
            ac_size = action_space.shape[0]
            self.ac = tf.placeholder(tf.float32, [None, ac_size], name="ac")
        elif isinstance(action_space, gym.spaces.Discrete):
            self.ac = tf.placeholder(tf.int64, [None], name="ac")
        else:
            raise UnsupportedSpaceException(
                "Action space {} is not supported for A3C.".format(
                    action_space))
        self.adv = tf.placeholder(tf.float32, [None], name="adv")
        self.r = tf.placeholder(tf.float32, [None], name="r")

        log_prob = self.action_dist.logp(self.ac)

        # The "policy gradients" loss: its derivative is precisely the policy
        # gradient. Notice that self.ac is a placeholder that is provided
        # externally. adv will contain the advantages, as calculated in
        # compute_advantages.
        self.pi_loss = - tf.reduce_sum(log_prob * self.adv)

        delta = self.vf - self.r
        self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta))
        self.entropy = tf.reduce_sum(self.action_dist.entropy())
        self.loss = (self.pi_loss +
                     self.vf_loss * self.config["vf_loss_coeff"] +
                     self.entropy * self.config["entropy_coeff"])

    def optimizer(self):
        """Return an Adam optimizer using the configured learning rate."""
        return tf.train.AdamOptimizer(self.config["lr"])

    def gradients(self, optimizer):
        """Return (gradient, variable) pairs clipped by global norm.

        The `optimizer` argument is not used; gradients are taken directly
        with tf.gradients. The clipped gradients are kept on `self.grads`.
        """
        grads = tf.gradients(self.loss, self.var_list)
        self.grads, _ = tf.clip_by_global_norm(grads, self.config["grad_clip"])
        clipped_grads = list(zip(self.grads, self.var_list))
        return clipped_grads

    def extra_compute_grad_fetches(self):
        """Return the summary op as an extra fetch when summarizing."""
        if self.summarize:
            return {"summary": self.summary_op}
        else:
            return {}

    def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
        """Add advantages to a sample batch.

        If the last step of the batch is not terminal, the value of the
        last new observation is used as the bootstrap return.

        Args:
            sample_batch: Batch with at least "dones" and "new_obs", plus
                "state_out_<i>" for every entry of `self.state_in`.
            other_agent_batches: Unused.

        Returns:
            The result of compute_advantages on the batch.
        """
        completed = sample_batch["dones"][-1]
        if completed:
            last_r = 0.0
        else:
            # Final state outputs of the batch, each wrapped in a list.
            next_state = [
                [sample_batch["state_out_{}".format(i)][-1]]
                for i in range(len(self.state_in))
            ]
            last_r = self.value(sample_batch["new_obs"][-1], *next_state)
        return compute_advantages(
            sample_batch, last_r, self.config["gamma"], self.config["lambda"])
+113
View File
@@ -0,0 +1,113 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from threading import Lock
import torch
import torch.nn.functional as F
from ray.rllib.models.pytorch.misc import var_to_np, convert_batch
from ray.rllib.models.catalog import ModelCatalog
from ray.rllib.utils.process_rollout import compute_advantages
from ray.rllib.utils.policy_graph import PolicyGraph
class SharedTorchPolicy(PolicyGraph):
    """A simple, non-recurrent PyTorch policy example.

    All methods that read or mutate the model take `self.lock`, so that
    action computation and model updates do not interleave.
    """

    def __init__(self, obs_space, action_space, registry, config):
        """Build the model and optimizer.

        Args:
            obs_space: Observation space; its `.shape` is given to the model.
            action_space: Action space, used to derive the logit dimension.
            registry: Passed to ModelCatalog.get_torch_model.
            config (dict): Policy configuration. Keys read in this class:
                "summarize" (optional), "model", "lr", "vf_loss_coeff",
                "entropy_coeff", "grad_clip", "gamma", "lambda".
        """
        self.registry = registry
        self.local_steps = 0
        self.config = config
        self.summarize = config.get("summarize")
        self.setup_graph(obs_space, action_space)
        torch.set_num_threads(2)
        self.lock = Lock()

    def setup_graph(self, obs_space, action_space):
        """Create the torch model and its Adam optimizer."""
        _, self.logit_dim = ModelCatalog.get_action_dist(action_space)
        self._model = ModelCatalog.get_torch_model(
            self.registry, obs_space.shape, self.logit_dim,
            self.config["model"])
        self.optimizer = torch.optim.Adam(
            self._model.parameters(), lr=self.config["lr"])

    def compute_single_action(self, obs, state, is_training=False):
        """Sample an action for a single observation.

        Args:
            obs: Observation as a numpy array (converted via
                torch.from_numpy).
            state: Must be empty; recurrent state is not supported.
            is_training: Unused.

        Returns:
            Tuple of (action, [], {"vf_preds": value estimate}).
        """
        assert not state, "RNN not supported"
        with self.lock:
            ob = torch.from_numpy(obs).float().unsqueeze(0)
            logits, values = self._model(ob)
            samples = F.softmax(logits, dim=1).multinomial(1).squeeze()
            values = values.squeeze()
            return var_to_np(samples), [], {"vf_preds": var_to_np(values)}

    def compute_gradients(self, samples):
        """Run a backward pass on `samples` and return the gradients.

        Returns:
            Tuple of (list of numpy gradient arrays, one per model
            parameter, and an empty info dict).
        """
        with self.lock:
            self.backward(samples)
            # Tensor.numpy() shares memory with the gradient tensor, so a
            # later zero_grad()/backward() would change arrays we already
            # handed out. Return copies so the result is a stable snapshot.
            return [
                p.grad.data.numpy().copy()
                for p in self._model.parameters()
            ], {}

    def apply_gradients(self, grads):
        """Install `grads` on the model parameters and take an optimizer step.

        Args:
            grads: Sequence of numpy arrays, in model.parameters() order.

        Returns:
            An empty dict.
        """
        # Hold the lock like every other method that touches the model, so
        # a concurrent compute_single_action never sees a half-applied step.
        with self.lock:
            self.optimizer.zero_grad()
            for g, p in zip(grads, self._model.parameters()):
                p.grad = torch.from_numpy(g)
            self.optimizer.step()
        return {}

    def get_weights(self):
        """Return the model's state dict."""
        # !! This only returns references to the data.
        return self._model.state_dict()

    def set_weights(self, weights):
        """Load `weights` (a state dict) into the model."""
        with self.lock:
            self._model.load_state_dict(weights)

    def value(self, obs):
        """Return the value estimate for a single observation."""
        with self.lock:
            obs = torch.from_numpy(obs).float().unsqueeze(0)
            res = self._model.hidden_layers(obs)
            res = self._model.value_branch(res)
            res = res.squeeze()
            return var_to_np(res)

    def forward(self, obs_batch, actions):
        """Evaluate the model on a batch.

        Returns:
            Tuple of (values, log-probabilities of `actions`, summed
            entropy of the action distribution).
        """
        logits, values = self._model(obs_batch)
        log_probs = F.log_softmax(logits, dim=1)
        probs = F.softmax(logits, dim=1)
        action_log_probs = log_probs.gather(1, actions.view(-1, 1))
        entropy = -(log_probs * probs).sum(-1).sum()
        return values, action_log_probs, entropy

    def backward(self, sample_batch):
        """Loss is encoded here.

        Defining a new loss function would start by rewriting this function.
        Callers are expected to hold `self.lock` (compute_gradients does).
        """
        states, actions, advs, rs = convert_batch(sample_batch)
        values, action_log_probs, entropy = self.forward(states, actions)
        pi_err = -advs.dot(action_log_probs.reshape(-1))
        value_err = F.mse_loss(values.reshape(-1), rs)

        self.optimizer.zero_grad()

        overall_err = sum([
            pi_err,
            self.config["vf_loss_coeff"] * value_err,
            self.config["entropy_coeff"] * entropy,
        ])

        overall_err.backward()
        torch.nn.utils.clip_grad_norm_(self._model.parameters(),
                                       self.config["grad_clip"])

    def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
        """Add advantages to a sample batch.

        If the last step of the batch is not terminal, the value of the
        last new observation is used as the bootstrap return.
        """
        completed = sample_batch["dones"][-1]
        if completed:
            last_r = 0.0
        else:
            last_r = self.value(sample_batch["new_obs"][-1])
        return compute_advantages(
            sample_batch, last_r, self.config["gamma"], self.config["lambda"])
+1 -1
View File
@@ -8,7 +8,7 @@ def get_policy_cls(config):
from ray.rllib.a3c.shared_model_lstm import SharedModelLSTM
policy_cls = SharedModelLSTM
elif config["use_pytorch"]:
from ray.rllib.a3c.shared_torch_policy import SharedTorchPolicy
from ray.rllib.a3c.a3c_torch_policy import SharedTorchPolicy
policy_cls = SharedTorchPolicy
else:
from ray.rllib.a3c.shared_model import SharedModel
-28
View File
@@ -1,28 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
class Policy(object):
    """Abstract policy interface; every operation must be overridden."""

    def __init__(self, ob_space, action_space, name="local", summarize=True):
        """Accept the standard constructor arguments and do nothing."""

    def apply_gradients(self, grads):
        """Apply the given gradients (abstract)."""
        raise NotImplementedError

    def get_weights(self):
        """Return the policy weights (abstract)."""
        raise NotImplementedError

    def set_weights(self, weights):
        """Overwrite the policy weights (abstract)."""
        raise NotImplementedError

    def compute_gradients(self, samples):
        """Compute gradients for the given samples (abstract)."""
        raise NotImplementedError

    def compute(self, observations):
        """Compute action for a _single_ observation"""
        raise NotImplementedError

    def value(self, ob):
        """Return the value estimate for an observation (abstract)."""
        raise NotImplementedError
+20 -31
View File
@@ -4,30 +4,27 @@ from __future__ import print_function
import tensorflow as tf
from ray.rllib.models.misc import linear, normc_initializer
from ray.rllib.a3c.tfpolicy import TFPolicy
from ray.rllib.a3c.a3c_tf_policy import A3CTFPolicyGraph
from ray.rllib.models.catalog import ModelCatalog
class SharedModel(TFPolicy):
class SharedModel(A3CTFPolicyGraph):
other_output = ["vf_preds"]
is_recurrent = False
def __init__(self, registry, ob_space, ac_space, config, **kwargs):
def __init__(self, ob_space, ac_space, registry, config, **kwargs):
super(SharedModel, self).__init__(
registry, ob_space, ac_space, config, **kwargs)
ob_space, ac_space, registry, config, **kwargs)
def _setup_graph(self, ob_space, ac_space):
self.x = tf.placeholder(tf.float32, [None] + list(ob_space))
self.x = tf.placeholder(tf.float32, [None] + list(ob_space.shape))
dist_class, self.logit_dim = ModelCatalog.get_action_dist(ac_space)
self._model = ModelCatalog.get_model(
self.registry, self.x, self.logit_dim, self.config["model"])
self.logits = self._model.outputs
self.curr_dist = dist_class(self.logits)
self.action_dist = dist_class(self.logits)
self.vf = tf.reshape(linear(self._model.last_layer, 1, "value",
normc_initializer(1.0)), [-1])
self.sample = self.curr_dist.sample()
self.sample = self.action_dist.sample()
self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
tf.get_variable_scope().name)
self.global_step = tf.get_variable(
@@ -35,28 +32,20 @@ class SharedModel(TFPolicy):
initializer=tf.constant_initializer(0, dtype=tf.int32),
trainable=False)
def compute_gradients(self, samples):
info = {}
feed_dict = {
self.x: samples["obs"],
self.ac: samples["actions"],
self.adv: samples["advantages"],
self.r: samples["value_targets"],
}
self.grads = [g for g in self.grads if g is not None]
self.local_steps += 1
if self.summarize:
grad, summ = self.sess.run([self.grads, self.summary_op],
feed_dict=feed_dict)
info['summary'] = summ
else:
grad = self.sess.run(self.grads, feed_dict=feed_dict)
return grad, info
self.state_in = []
self.state_out = []
def compute(self, ob, *args):
action, vf = self.sess.run([self.sample, self.vf],
{self.x: [ob]})
return action[0], {"vf_preds": vf[0]}
def setup_loss(self, action_space):
A3CTFPolicyGraph.setup_loss(self, action_space)
self.loss_in = [
("obs", self.x),
("actions", self.ac),
("advantages", self.adv),
("value_targets", self.r),
]
def extra_compute_action_fetches(self):
return {"vf_preds": self.vf}
def value(self, ob, *args):
vf = self.sess.run(self.vf, {self.x: [ob]})
+21 -49
View File
@@ -5,43 +5,32 @@ from __future__ import print_function
import tensorflow as tf
from ray.rllib.models.misc import linear, normc_initializer
from ray.rllib.models.catalog import ModelCatalog
from ray.rllib.a3c.tfpolicy import TFPolicy
from ray.rllib.a3c.a3c_tf_policy import A3CTFPolicyGraph
from ray.rllib.models.lstm import LSTM
class SharedModelLSTM(TFPolicy):
"""
Attributes:
other_output (list): Other than `action`, the other return values from
`compute_gradients`.
is_recurrent (bool): True if is a recurrent network (requires features
to be tracked).
"""
class SharedModelLSTM(A3CTFPolicyGraph):
other_output = ["vf_preds", "features"]
is_recurrent = True
def __init__(self, registry, ob_space, ac_space, config, **kwargs):
def __init__(self, ob_space, ac_space, registry, config, **kwargs):
super(SharedModelLSTM, self).__init__(
registry, ob_space, ac_space, config, **kwargs)
ob_space, ac_space, registry, config, **kwargs)
def _setup_graph(self, ob_space, ac_space):
self.x = tf.placeholder(tf.float32, [None] + list(ob_space))
self.x = tf.placeholder(tf.float32, [None] + list(ob_space.shape))
dist_class, self.logit_dim = ModelCatalog.get_action_dist(ac_space)
self._model = LSTM(self.x, self.logit_dim, {})
self.state_init = self._model.state_init
self.state_in = self._model.state_in
self.state_out = self._model.state_out
self.logits = self._model.outputs
self.curr_dist = dist_class(self.logits)
self.action_dist = dist_class(self.logits)
# with tf.variable_scope("vf"):
# vf_model = ModelCatalog.get_model(self.x, 1)
self.vf = tf.reshape(linear(self._model.last_layer, 1, "value",
normc_initializer(1.0)), [-1])
self.sample = self.curr_dist.sample()
self.sample = self.action_dist.sample()
self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
tf.get_variable_scope().name)
self.global_step = tf.get_variable(
@@ -49,42 +38,25 @@ class SharedModelLSTM(TFPolicy):
initializer=tf.constant_initializer(0, dtype=tf.int32),
trainable=False)
def compute_gradients(self, samples):
"""Computing the gradient is actually model-dependent.
def get_initial_state(self):
return self._model.state_init
The LSTM needs its hidden states in order to compute the gradient
accurately.
"""
features = samples["features"][0]
feed_dict = {
self.x: samples["obs"],
self.ac: samples["actions"],
self.adv: samples["advantages"],
self.r: samples["value_targets"],
self.state_in[0]: features[0],
self.state_in[1]: features[1]
}
info = {}
self.local_steps += 1
if self.summarize and self.local_steps % 10 == 0:
grad, summ = self.sess.run([self.grads, self.summary_op],
feed_dict=feed_dict)
info['summary'] = summ
else:
grad = self.sess.run(self.grads, feed_dict=feed_dict)
return grad, info
def setup_loss(self, action_space):
A3CTFPolicyGraph.setup_loss(self, action_space)
self.loss_in = [
("obs", self.x),
("actions", self.ac),
("advantages", self.adv),
("value_targets", self.r),
("state_in_0", self.state_in[0]),
("state_in_1", self.state_in[1]),
]
def compute(self, ob, c, h):
action, vf, c, h = self.sess.run(
[self.sample, self.vf] + self.state_out,
{self.x: [ob], self.state_in[0]: c, self.state_in[1]: h})
return action[0], {"vf_preds": vf[0], "features": (c, h)}
def extra_compute_action_fetches(self):
return {"vf_preds": self.vf}
def value(self, ob, c, h):
vf = self.sess.run(self.vf, {self.x: [ob],
self.state_in[0]: c,
self.state_in[1]: h})
return vf[0]
def get_initial_features(self):
return self.state_init
-106
View File
@@ -1,106 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import ray
import gym
from ray.rllib.a3c.policy import Policy
class TFPolicy(Policy):
    """Base class for TensorFlow policies.

    Each instance owns its own tf.Graph (`self.g`) and tf.Session
    (`self.sess`). Subclasses implement `_setup_graph`, which must define
    at least `vf`, `logits`, `x` and `var_list` (asserted in `__init__`);
    `setup_loss` additionally reads `curr_dist`.
    """

    def __init__(self, registry, ob_space, action_space, config,
                 name="local", summarize=True):
        """Build graph, loss, gradients and session.

        Args:
            registry: Stored on the instance for use by subclasses.
            ob_space: Forwarded to `_setup_graph`.
            action_space: Action space (gym Box or Discrete).
            config (dict): Reads "vf_loss_coeff", "entropy_coeff",
                "grad_clip" and "lr".
            name (str): Variable scope the model graph is built in.
            summarize (bool): Whether to create summary ops.
        """
        self.registry = registry
        self.local_steps = 0
        self.config = config
        self.summarize = summarize
        worker_device = "/job:localhost/replica:0/task:0/cpu:0"
        self.g = tf.Graph()
        with self.g.as_default(), tf.device(worker_device):
            with tf.variable_scope(name):
                self._setup_graph(ob_space, action_space)
                assert all(hasattr(self, attr)
                           for attr in ["vf", "logits", "x", "var_list"])
            print("Setting up loss")
            self.setup_loss(action_space)
            self.setup_gradients()
            self.initialize()

    def _setup_graph(self, ob_space, ac_space):
        """Define the model graph. Must be implemented by subclasses."""
        raise NotImplementedError

    def setup_loss(self, action_space):
        """Create the loss placeholders and the combined loss.

        Args:
            action_space: gym.spaces.Box or gym.spaces.Discrete.

        Raises:
            NotImplementedError: For any other action space type.
        """
        if isinstance(action_space, gym.spaces.Box):
            ac_size = action_space.shape[0]
            self.ac = tf.placeholder(tf.float32, [None, ac_size], name="ac")
        elif isinstance(action_space, gym.spaces.Discrete):
            self.ac = tf.placeholder(tf.int64, [None], name="ac")
        else:
            # Plain concatenation used to glue the type name to the
            # surrounding words; format with explicit spacing instead.
            raise NotImplementedError(
                "action space {} currently not supported".format(
                    type(action_space)))
        self.adv = tf.placeholder(tf.float32, [None], name="adv")
        self.r = tf.placeholder(tf.float32, [None], name="r")

        log_prob = self.curr_dist.logp(self.ac)

        # The "policy gradients" loss: its derivative is precisely the policy
        # gradient. Notice that self.ac is a placeholder that is provided
        # externally. adv will contain the advantages, as calculated in
        # process_rollout.
        self.pi_loss = - tf.reduce_sum(log_prob * self.adv)

        delta = self.vf - self.r
        self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta))
        self.entropy = tf.reduce_sum(self.curr_dist.entropy())
        self.loss = (self.pi_loss +
                     self.vf_loss * self.config["vf_loss_coeff"] +
                     self.entropy * self.config["entropy_coeff"])

    def setup_gradients(self):
        """Create clipped gradients and the op that applies them."""
        grads = tf.gradients(self.loss, self.var_list)
        self.grads, _ = tf.clip_by_global_norm(grads, self.config["grad_clip"])
        grads_and_vars = list(zip(self.grads, self.var_list))
        opt = tf.train.AdamOptimizer(self.config["lr"])
        self._apply_gradients = opt.apply_gradients(grads_and_vars)

    def initialize(self):
        """Create summaries (optional), the session, and init variables."""
        if self.summarize:
            bs = tf.to_float(tf.shape(self.x)[0])
            tf.summary.scalar("model/policy_loss", self.pi_loss / bs)
            tf.summary.scalar("model/value_loss", self.vf_loss / bs)
            tf.summary.scalar("model/entropy", self.entropy / bs)
            tf.summary.scalar("model/grad_gnorm", tf.global_norm(self.grads))
            tf.summary.scalar("model/var_gnorm", tf.global_norm(self.var_list))
            self.summary_op = tf.summary.merge_all()

        # TODO(rliaw): Can consider exposing these parameters
        self.sess = tf.Session(graph=self.g, config=tf.ConfigProto(
            intra_op_parallelism_threads=1, inter_op_parallelism_threads=2,
            gpu_options=tf.GPUOptions(allow_growth=True)))
        self.variables = ray.experimental.TensorFlowVariables(self.loss,
                                                              self.sess)
        self.sess.run(tf.global_variables_initializer())

    def apply_gradients(self, grads):
        """Feed `grads` into the clipped-gradient tensors and apply them.

        Args:
            grads: Sequence of gradient values, positionally matching
                `self.grads`.
        """
        feed_dict = {self.grads[i]: g for i, g in enumerate(grads)}
        self.sess.run(self._apply_gradients, feed_dict=feed_dict)

    def get_weights(self):
        """Return the weights tracked by `self.variables`."""
        weights = self.variables.get_weights()
        return weights

    def set_weights(self, weights):
        """Overwrite the weights tracked by `self.variables`."""
        self.variables.set_weights(weights)

    def compute_gradients(self, samples):
        """Compute gradients for `samples`. Implemented by subclasses."""
        raise NotImplementedError

    def compute(self, observation):
        """Compute an action for an observation. Implemented by subclasses."""
        raise NotImplementedError

    def value(self, ob):
        """Return the value estimate for `ob`. Implemented by subclasses."""
        raise NotImplementedError
+1 -1
View File
@@ -61,7 +61,7 @@ class Agent(Trainable):
"""
_allow_unknown_configs = False
_allow_unknown_subkeys = []
_allow_unknown_subkeys = ["env_config", "model", "optimizer"]
@classmethod
def resource_help(cls, config):
+1 -2
View File
@@ -17,8 +17,7 @@ class BCEvaluator(PolicyEvaluator):
env = ModelCatalog.get_preprocessor_as_wrapper(registry, env_creator(
config["env_config"]), config["model"])
self.dataset = ExperienceDataset(config["dataset_path"])
# TODO(rliaw): should change this to be just env.observation_space
self.policy = BCPolicy(registry, env.observation_space.shape,
self.policy = BCPolicy(registry, env.observation_space,
env.action_space, config)
self.config = config
self.logdir = logdir
+9 -17
View File
@@ -6,30 +6,22 @@ import tensorflow as tf
import gym
import ray
from ray.rllib.a3c.policy import Policy
from ray.rllib.models.catalog import ModelCatalog
class BCPolicy(Policy):
def __init__(self, registry, ob_space, action_space, config, name="local",
summarize=True):
super(BCPolicy, self).__init__(ob_space, action_space, name, summarize)
class BCPolicy(object):
def __init__(self, registry, obs_space, action_space, config):
self.registry = registry
self.local_steps = 0
self.config = config
self.summarize = summarize
worker_device = "/job:localhost/replica:0/task:0/cpu:0"
self.g = tf.Graph()
with self.g.as_default(), tf.device(worker_device):
with tf.variable_scope(name):
self._setup_graph(ob_space, action_space)
print("Setting up loss")
self.setup_loss(action_space)
self.setup_gradients()
self.initialize()
self.summarize = config.get("summarize")
self._setup_graph(obs_space, action_space)
self.setup_loss(action_space)
self.setup_gradients()
self.initialize()
def _setup_graph(self, ob_space, ac_space):
self.x = tf.placeholder(tf.float32, [None] + list(ob_space))
def _setup_graph(self, obs_space, ac_space):
self.x = tf.placeholder(tf.float32, [None] + list(obs_space.shape))
dist_class, self.logit_dim = ModelCatalog.get_action_dist(ac_space)
self._model = ModelCatalog.get_model(
self.registry, self.x, self.logit_dim, self.config["model"])
+17 -17
View File
@@ -8,25 +8,25 @@ from ray.utils import merge_dicts
APEX_DDPG_DEFAULT_CONFIG = merge_dicts(
DDPG_CONFIG,
{
'optimizer_class': 'ApexOptimizer',
'optimizer_config':
"optimizer_class": "ApexOptimizer",
"optimizer_config":
merge_dicts(
DDPG_CONFIG['optimizer_config'], {
'max_weight_sync_delay': 400,
'num_replay_buffer_shards': 4,
'debug': False
DDPG_CONFIG["optimizer_config"], {
"max_weight_sync_delay": 400,
"num_replay_buffer_shards": 4,
"debug": False
}),
'n_step': 3,
'num_workers': 32,
'buffer_size': 2000000,
'learning_starts': 50000,
'train_batch_size': 512,
'sample_batch_size': 50,
'max_weight_sync_delay': 400,
'target_network_update_freq': 500000,
'timesteps_per_iteration': 25000,
'per_worker_exploration': True,
'worker_side_prioritization': True,
"n_step": 3,
"num_workers": 32,
"buffer_size": 2000000,
"learning_starts": 50000,
"train_batch_size": 512,
"sample_batch_size": 50,
"max_weight_sync_delay": 400,
"target_network_update_freq": 500000,
"timesteps_per_iteration": 25000,
"per_worker_exploration": True,
"worker_side_prioritization": True,
},
)
+60 -195
View File
@@ -2,17 +2,9 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import pickle
import os
import numpy as np
import tensorflow as tf
import ray
from ray.rllib import optimizers
from ray.rllib.ddpg.ddpg_evaluator import DDPGEvaluator
from ray.rllib.agent import Agent
from ray.tune.result import TrainingResult
from ray.rllib.dqn.common.schedules import ConstantSchedule, LinearSchedule
from ray.rllib.dqn.dqn import DQNAgent
from ray.rllib.ddpg.ddpg_policy_graph import DDPGPolicyGraph
OPTIMIZER_SHARED_CONFIGS = [
"buffer_size", "prioritized_replay", "prioritized_replay_alpha",
@@ -23,247 +15,120 @@ OPTIMIZER_SHARED_CONFIGS = [
DEFAULT_CONFIG = {
# === Model ===
# Hidden layer sizes of the policy networks
'actor_hiddens': [64, 64],
"actor_hiddens": [64, 64],
# Hidden layer sizes of the critic networks
'critic_hiddens': [64, 64],
"critic_hiddens": [64, 64],
# N-step Q learning
'n_step': 1,
"n_step": 1,
# Config options to pass to the model constructor
'model': {},
"model": {},
# Discount factor for the MDP
'gamma': 0.99,
"gamma": 0.99,
# Arguments to pass to the env creator
'env_config': {},
"env_config": {},
# === Exploration ===
# Max num timesteps for annealing schedules. The exploration scale is
# annealed from 1.0 to exploration_final_eps (both multiplied by noise_scale)
# over this number of timesteps scaled by exploration_fraction
'schedule_max_timesteps': 100000,
"schedule_max_timesteps": 100000,
# Number of env steps to optimize for before returning
'timesteps_per_iteration': 1000,
"timesteps_per_iteration": 1000,
# Fraction of entire training period over which the exploration rate is
# annealed
'exploration_fraction': 0.1,
"exploration_fraction": 0.1,
# Final value of random action probability
'exploration_final_eps': 0.02,
"exploration_final_eps": 0.02,
# OU-noise scale
'noise_scale': 0.1,
"noise_scale": 0.1,
# theta
'exploration_theta': 0.15,
"exploration_theta": 0.15,
# sigma
'exploration_sigma': 0.2,
"exploration_sigma": 0.2,
# Update the target network every `target_network_update_freq` steps.
'target_network_update_freq': 0,
"target_network_update_freq": 0,
# Update the target by \tau * policy + (1-\tau) * target_policy
'tau': 0.002,
# Whether to start with random actions instead of noops.
'random_starts': True,
"tau": 0.002,
# === Replay buffer ===
# Size of the replay buffer. Note that if async_updates is set, then
# each worker will have a replay buffer of this size.
'buffer_size': 50000,
"buffer_size": 50000,
# If True prioritized replay buffer will be used.
'prioritized_replay': True,
"prioritized_replay": True,
# Alpha parameter for prioritized replay buffer.
'prioritized_replay_alpha': 0.6,
"prioritized_replay_alpha": 0.6,
# Beta parameter for sampling from prioritized replay buffer.
'prioritized_replay_beta': 0.4,
"prioritized_replay_beta": 0.4,
# Epsilon to add to the TD errors when updating priorities.
'prioritized_replay_eps': 1e-6,
"prioritized_replay_eps": 1e-6,
# Whether to clip rewards to [-1, 1] prior to adding to the replay buffer.
'clip_rewards': True,
"clip_rewards": True,
# === Optimization ===
# Learning rate for adam optimizer
'actor_lr': 1e-4,
'critic_lr': 1e-3,
"actor_lr": 1e-4,
"critic_lr": 1e-3,
# If True, use huber loss instead of squared loss for critic network
# Conventionally, no need to clip gradients if using a huber loss
'use_huber': False,
"use_huber": False,
# Threshold of a huber loss
'huber_threshold': 1.0,
"huber_threshold": 1.0,
# Weights for L2 regularization
'l2_reg': 1e-6,
"l2_reg": 1e-6,
# If not None, clip gradients during optimization at this value
'grad_norm_clipping': None,
"grad_norm_clipping": None,
# How many steps of the model to sample before learning starts.
'learning_starts': 1500,
"learning_starts": 1500,
# Update the replay buffer with this many samples at once. Note that this
# setting applies per-worker if num_workers > 1.
'sample_batch_size': 1,
"sample_batch_size": 1,
# Size of a batch sampled from the replay buffer for training. Note that
# if async_updates is set, then each worker returns gradients for a
# batch of this size.
'train_batch_size': 256,
# Smooth the current average reward over this many previous episodes.
'smoothing_num_episodes': 100,
# === Tensorflow ===
# Arguments to pass to tensorflow
'tf_session_args': {
"device_count": {
"CPU": 2
},
"log_device_placement": False,
"allow_soft_placement": True,
"gpu_options": {
"allow_growth": True
},
"inter_op_parallelism_threads": 1,
"intra_op_parallelism_threads": 1,
},
"train_batch_size": 256,
# === Parallelism ===
# Whether to use a GPU for local optimization.
"gpu": False,
# Number of workers for collecting samples with. This only makes sense
# to increase if your environment is particularly slow to sample, or if
# you're using the Async or Ape-X optimizers.
'num_workers': 0,
# you're using the Async or Ape-X optimizers.
"num_workers": 0,
# Whether to allocate GPUs for workers (if > 0).
'num_gpus_per_worker': 0,
"num_gpus_per_worker": 0,
# Whether to allocate CPUs for workers (if > 0).
"num_cpus_per_worker": 1,
# Optimizer class to use.
'optimizer_class': "LocalSyncReplayOptimizer",
"optimizer_class": "LocalSyncReplayOptimizer",
# Config to pass to the optimizer.
'optimizer_config': {},
"optimizer_config": {},
# Whether to use a distribution of epsilons across workers for exploration.
'per_worker_exploration': False,
"per_worker_exploration": False,
# Whether to compute priorities on workers.
'worker_side_prioritization': False
"worker_side_prioritization": False
}
class DDPGAgent(Agent):
class DDPGAgent(DQNAgent):
_agent_name = "DDPG"
_allow_unknown_subkeys = [
"model", "optimizer", "tf_session_args", "env_config"
]
"model", "optimizer", "tf_session_args", "env_config"]
_default_config = DEFAULT_CONFIG
_policy_graph = DDPGPolicyGraph
def _init(self):
self.local_evaluator = DDPGEvaluator(self.registry, self.env_creator,
self.config, self.logdir, 0)
remote_cls = ray.remote(
num_cpus=1,
num_gpus=self.config["num_gpus_per_worker"])(DDPGEvaluator)
self.remote_evaluators = [
remote_cls.remote(self.registry, self.env_creator, self.config,
self.logdir, i)
for i in range(self.config["num_workers"])
]
for k in OPTIMIZER_SHARED_CONFIGS:
if k not in self.config["optimizer_config"]:
self.config["optimizer_config"][k] = self.config[k]
self.optimizer = getattr(optimizers, self.config["optimizer_class"])(
self.config["optimizer_config"], self.local_evaluator,
self.remote_evaluators)
self.saver = tf.train.Saver(max_to_keep=None)
self.last_target_update_ts = 0
self.num_target_updates = 0
@property
def global_timestep(self):
return self.optimizer.num_steps_sampled
def update_target_if_needed(self):
if self.global_timestep - self.last_target_update_ts > \
self.config["target_network_update_freq"]:
self.local_evaluator.update_target()
self.last_target_update_ts = self.global_timestep
self.num_target_updates += 1
def _train(self):
start_timestep = self.global_timestep
while (self.global_timestep - start_timestep <
self.config["timesteps_per_iteration"]):
self.optimizer.step()
self.update_target_if_needed()
self.local_evaluator.set_global_timestep(self.global_timestep)
for e in self.remote_evaluators:
e.set_global_timestep.remote(self.global_timestep)
return self._train_stats(start_timestep)
def _train_stats(self, start_timestep):
if self.remote_evaluators:
stats = ray.get([e.stats.remote() for e in self.remote_evaluators])
else:
stats = self.local_evaluator.stats()
if not isinstance(stats, list):
stats = [stats]
mean_100ep_reward = 0.0
mean_100ep_length = 0.0
num_episodes = 0
explorations = []
def _make_exploration_schedule(self, worker_index):
# Override DQN's schedule to take into account `noise_scale`
if self.config["per_worker_exploration"]:
# Return stats from workers with the lowest 20% of exploration
test_stats = stats[-int(max(1, len(stats) * 0.2)):]
assert self.config["num_workers"] > 1, \
"This requires multiple workers"
return ConstantSchedule(
self.config["noise_scale"] * 0.4 **
(1 + worker_index / float(self.config["num_workers"] - 1) * 7))
else:
test_stats = stats
for s in test_stats:
mean_100ep_reward += s["mean_100ep_reward"] / len(test_stats)
mean_100ep_length += s["mean_100ep_length"] / len(test_stats)
for s in stats:
num_episodes += s["num_episodes"]
explorations.append(s["exploration"])
opt_stats = self.optimizer.stats()
result = TrainingResult(
episode_reward_mean=mean_100ep_reward,
episode_len_mean=mean_100ep_length,
episodes_total=num_episodes,
timesteps_this_iter=self.global_timestep - start_timestep,
info=dict({
"min_exploration": min(explorations),
"max_exploration": max(explorations),
"num_target_updates": self.num_target_updates,
}, **opt_stats))
return result
def _stop(self):
# workaround for https://github.com/ray-project/ray/issues/1516
for ev in self.remote_evaluators:
ev.__ray_terminate__.remote()
def _save(self, checkpoint_dir):
checkpoint_path = self.saver.save(
self.local_evaluator.sess,
os.path.join(checkpoint_dir, "checkpoint"),
global_step=self.iteration)
extra_data = [
self.local_evaluator.save(),
ray.get([e.save.remote() for e in self.remote_evaluators]),
self.optimizer.save(), self.num_target_updates,
self.last_target_update_ts
]
pickle.dump(extra_data, open(checkpoint_path + ".extra_data", "wb"))
return checkpoint_path
def _restore(self, checkpoint_path):
self.saver.restore(self.local_evaluator.sess, checkpoint_path)
extra_data = pickle.load(open(checkpoint_path + ".extra_data", "rb"))
self.local_evaluator.restore(extra_data[0])
ray.get([
e.restore.remote(d)
for (d, e) in zip(extra_data[1], self.remote_evaluators)
])
self.optimizer.restore(extra_data[2])
self.num_target_updates = extra_data[3]
self.last_target_update_ts = extra_data[4]
def compute_action(self, observation):
return self.local_evaluator.ddpg_graph.act(self.local_evaluator.sess,
np.array(observation)[None],
0.0)[0]
return LinearSchedule(
schedule_timesteps=int(self.config["exploration_fraction"] *
self.config["schedule_max_timesteps"]),
initial_p=self.config["noise_scale"] * 1.0,
final_p=self.config["noise_scale"] *
self.config["exploration_final_eps"])
-186
View File
@@ -1,186 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from gym.spaces import Box
import numpy as np
import tensorflow as tf
import ray
from ray.rllib.utils.error import UnsupportedSpaceException
from ray.rllib.ddpg import models
from ray.rllib.dqn.common.schedules import ConstantSchedule, LinearSchedule
from ray.rllib.optimizers import SampleBatch, PolicyEvaluator
from ray.rllib.utils.compression import pack
from ray.rllib.dqn.dqn_evaluator import adjust_nstep
from ray.rllib.dqn.common.wrappers import wrap_dqn
class DDPGEvaluator(PolicyEvaluator):
    """Policy evaluator that steps an env and trains a DDPG graph."""

    # Sample-batch columns, in the positional order expected by the
    # gradient methods of the DDPG graph.
    _REPLAY_KEYS = ("obs", "actions", "rewards", "new_obs", "dones",
                    "weights")

    # Attributes captured by save() and put back by restore(), in order.
    _STATE_FIELDS = ("exploration", "episode_rewards", "episode_lengths",
                     "saved_mean_reward", "obs", "global_timestep",
                     "local_timestep")

    def __init__(self, registry, env_creator, config, logdir, worker_index):
        """Create the env, the TF session, the graph and the noise schedule.

        Raises:
            UnsupportedSpaceException: if the wrapped env's action space is
                not a ``Box``.
        """
        self.env = wrap_dqn(
            registry, env_creator(config["env_config"]), config["model"],
            config["random_starts"])
        self.config = config

        # With a Box action space (e.g., Pendulum-v0, where action_space.low
        # is [-2.0] and high is [2.0]) an action is taken by calling, e.g.,
        # env.step([3.5]).
        action_space = self.env.action_space
        if not isinstance(action_space, Box):
            raise UnsupportedSpaceException(
                "Action space {} is not supported for DDPG.".format(
                    action_space))

        self.sess = tf.Session(
            config=tf.ConfigProto(**config["tf_session_args"]))
        self.ddpg_graph = models.DDPGGraph(registry, self.env, config, logdir)

        # Either a fixed, worker-specific noise scale or one linear schedule.
        if config["per_worker_exploration"]:
            assert config["num_workers"] > 1, "This requires multiple workers"
            exponent = (
                1 + worker_index / float(config["num_workers"] - 1) * 7)
            self.exploration = ConstantSchedule(
                config["noise_scale"] * 0.4 ** exponent)
        else:
            anneal_steps = int(config["exploration_fraction"] *
                               config["schedule_max_timesteps"])
            self.exploration = LinearSchedule(
                schedule_timesteps=anneal_steps,
                initial_p=config["noise_scale"] * 1.0,
                final_p=config["noise_scale"] *
                config["exploration_final_eps"])

        # Initialize all parameters, then copy them into the target networks
        # with a hard (tau = 1.0) rather than soft update.
        self.sess.run(tf.global_variables_initializer())
        self.ddpg_graph.update_target(self.sess, 1.0)
        self.global_timestep = 0
        self.local_timestep = 0

        # Covers the policy and Q-value networks together with their
        # corresponding target networks.
        self.variables = ray.experimental.TensorFlowVariables(
            tf.group(self.ddpg_graph.q_tp0, self.ddpg_graph.q_tp1), self.sess)

        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None
        self.obs = self.env.reset()

    def set_global_timestep(self, global_timestep):
        """Record the global timestep used to evaluate the schedule."""
        self.global_timestep = global_timestep

    def update_target(self):
        """Run the graph's target update with its default arguments."""
        self.ddpg_graph.update_target(self.sess)

    def sample(self):
        """Step the env and return a SampleBatch of the transitions.

        ``sample_batch_size + n_step - 1`` env steps are taken so that the
        batch holds exactly ``sample_batch_size`` rows after the n-step
        adjustment.
        """
        obs, actions, rewards, new_obs, dones = [], [], [], [], []
        columns = (obs, actions, rewards, new_obs, dones)
        num_steps = (
            self.config["sample_batch_size"] + self.config["n_step"] - 1)
        for _ in range(num_steps):
            transition = self._step(self.global_timestep)
            for column, value in zip(columns, transition):
                column.append(value)

        # N-step Q adjustments
        if self.config["n_step"] > 1:
            # Adjust for steps lost from truncation
            self.local_timestep -= (self.config["n_step"] - 1)
            adjust_nstep(self.config["n_step"], self.config["gamma"], obs,
                         actions, rewards, new_obs, dones)

        def _packed(observations):
            return [pack(np.array(o)) for o in observations]

        batch = SampleBatch({
            "obs": _packed(obs),
            "actions": actions,
            "rewards": rewards,
            "new_obs": _packed(new_obs),
            "dones": dones,
            "weights": np.ones_like(rewards)
        })
        assert (batch.count == self.config["sample_batch_size"])

        # Prioritize on the worker side
        if self.config["worker_side_prioritization"]:
            td_errors = self.ddpg_graph.compute_td_error(
                self.sess, obs, batch["actions"], batch["rewards"], new_obs,
                batch["dones"], batch["weights"])
            batch.data["weights"] = (
                np.abs(td_errors) + self.config["prioritized_replay_eps"])

        return batch

    def compute_gradients(self, samples):
        """Return ``(grads, {"td_error": ...})`` for the given samples."""
        td_err, grads = self.ddpg_graph.compute_gradients(
            self.sess, *[samples[key] for key in self._REPLAY_KEYS])
        return grads, {"td_error": td_err}

    def apply_gradients(self, grads):
        """Apply previously computed gradients through the graph."""
        self.ddpg_graph.apply_gradients(self.sess, grads)

    def compute_apply(self, samples):
        """Compute and apply gradients; return ``{"td_error": ...}``."""
        td_error = self.ddpg_graph.compute_apply(
            self.sess, *[samples[key] for key in self._REPLAY_KEYS])
        return {"td_error": td_error}

    def get_weights(self):
        """Return the weights tracked by ``self.variables``."""
        return self.variables.get_weights()

    def set_weights(self, weights):
        """Load ``weights`` into the variables tracked by ``self.variables``."""
        self.variables.set_weights(weights)

    def _step(self, global_timestep):
        """Takes a single step, and returns the result of the step."""
        obs_batch = np.array(self.obs)[None]
        noise_scale = self.exploration.value(global_timestep)
        action = self.ddpg_graph.act(self.sess, obs_batch, noise_scale)[0]
        next_obs, reward, done, _ = self.env.step(action)
        transition = (self.obs, action, reward, next_obs, float(done))

        self.obs = next_obs
        self.episode_rewards[-1] += reward
        self.episode_lengths[-1] += 1
        if done:
            self.obs = self.env.reset()
            self.episode_rewards.append(0.0)
            self.episode_lengths.append(0.0)
            # reset OU noise for each episode
            self.ddpg_graph.reset_noise(self.sess)
        self.local_timestep += 1
        return transition

    def stats(self):
        """Return smoothed episode statistics and the current exploration.

        The means cover up to ``smoothing_num_episodes`` episodes and leave
        out the last (still running) entry of each list.
        """
        window = slice(-(self.config["smoothing_num_episodes"] + 1), -1)
        return {
            "mean_100ep_reward": round(
                np.mean(self.episode_rewards[window]), 5),
            "mean_100ep_length": round(
                np.mean(self.episode_lengths[window]), 5),
            "num_episodes": len(self.episode_rewards),
            "exploration": self.exploration.value(self.global_timestep),
            "local_timestep": self.local_timestep,
        }

    def save(self):
        """Return the evaluator's non-TF state as a list."""
        return [getattr(self, field) for field in self._STATE_FIELDS]

    def restore(self, data):
        """Restore the state produced by ``save``."""
        for index, field in enumerate(self._STATE_FIELDS):
            setattr(self, field, data[index])
+327
View File
@@ -0,0 +1,327 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from gym.spaces import Box
import numpy as np
import tensorflow as tf
import tensorflow.contrib.layers as layers
import ray
from ray.rllib.dqn.dqn_policy_graph import _huber_loss, _minimize_and_clip, \
_scope_vars, _postprocess_dqn
from ray.rllib.models import ModelCatalog
from ray.rllib.utils.error import UnsupportedSpaceException
from ray.rllib.utils.tf_policy_graph import TFPolicyGraph
# Variable scope names used when building the DDPG graph.
# Scope of the action output op and its exploration-noise variable.
A_SCOPE = "a_func"
# Scope of the actor (policy) network.
P_SCOPE = "p_func"
# Scope of the target actor network.
P_TARGET_SCOPE = "target_p_func"
# Scope of the critic (Q) network.
Q_SCOPE = "q_func"
# Scope of the target critic network.
Q_TARGET_SCOPE = "target_q_func"
def _build_p_network(registry, inputs, dim_actions, config):
    """Build the actor (policy) network.

    The last layer of the model returned by ``ModelCatalog.get_model`` is
    fed through one ReLU layer per entry of ``config["actor_hiddens"]`` and
    a final sigmoid layer, so each action entry takes a value in (0, 1).

    Returns:
        Tensor with ``dim_actions`` outputs per input row.
    """
    net = ModelCatalog.get_model(
        registry, inputs, 1, config["model"]).last_layer
    for num_units in config["actor_hiddens"]:
        net = layers.fully_connected(
            net, num_outputs=num_units, activation_fn=tf.nn.relu)
    # The sigmoid bounds the values within (0, 1); the result has shape
    # [batch_size, dim_actions].
    return layers.fully_connected(
        net, num_outputs=dim_actions, activation_fn=tf.nn.sigmoid)
# The policy acts stochastically for inference but deterministically for
# training, so the batch-size issue of the noise sample (a single vector of
# length low_action.size) is ignored when building the stochastic action.
def _build_action_network(p_values, low_action, high_action, stochastic, eps,
                          theta, sigma):
    """Rescale policy outputs to the action bounds, optionally with noise.

    Args:
        p_values: policy outputs in (0, 1).
        low_action: lower action bounds (array with a ``size`` attribute).
        high_action: upper action bounds.
        stochastic: boolean tensor; selects the noisy actions when true.
        eps: scale applied to the exploration noise.
        theta: pull of the noise state back towards zero.
        sigma: scale of the normal sample added to the noise state.

    Returns:
        The action tensor chosen by ``tf.cond`` on ``stochastic``.
    """
    action_range = high_action - low_action
    # shape is [None, dim_action]
    deterministic_actions = action_range * p_values + low_action

    noise_state = tf.get_variable(
        name="ornstein_uhlenbeck",
        dtype=tf.float32,
        initializer=low_action.size * [.0],
        trainable=False)
    gaussian = tf.random_normal(
        shape=[low_action.size], mean=0.0, stddev=1.0)
    noise_delta = theta * (.0 - noise_state) + sigma * gaussian
    noise_value = tf.assign_add(noise_state, noise_delta)
    stochastic_actions = (
        deterministic_actions + eps * action_range * noise_value)

    def _with_noise():
        return stochastic_actions

    def _without_noise():
        return deterministic_actions

    return tf.cond(stochastic, _with_noise, _without_noise)
def _build_q_network(registry, inputs, action_inputs, config):
    """Build the critic network scoring an (observation, action) pair.

    The model's last layer is concatenated with ``action_inputs`` along
    axis 1, passed through one ReLU layer per entry of
    ``config["critic_hiddens"]`` and reduced to a single linear output.
    """
    obs_features = ModelCatalog.get_model(
        registry, inputs, 1, config["model"]).last_layer
    layer_sizes = config["critic_hiddens"]

    net = tf.concat([obs_features, action_inputs], axis=1)
    for num_units in layer_sizes:
        net = layers.fully_connected(
            net, num_outputs=num_units, activation_fn=tf.nn.relu)
    return layers.fully_connected(net, num_outputs=1, activation_fn=None)
class DDPGPolicyGraph(TFPolicyGraph):
    """DDPG actor-critic graph exposed through the TFPolicyGraph interface.

    The constructor builds the actor (``p_func``) and critic (``q_func``)
    networks, their target copies, the critic TD loss, the actor loss and
    the op that blends the online weights into the target networks.
    """

    def __init__(self, observation_space, action_space, registry, config):
        """Build the graph using ``tf.get_default_session()``.

        Args:
            observation_space: space whose ``shape`` sizes the observation
                placeholders.
            action_space: must be a ``gym.spaces.Box``; its ``low`` and
                ``high`` bounds rescale the sigmoid policy output.
            registry: passed through to ``ModelCatalog.get_model``.
            config: DDPG configuration dict.

        Raises:
            UnsupportedSpaceException: if ``action_space`` is not a ``Box``.
        """
        if not isinstance(action_space, Box):
            raise UnsupportedSpaceException(
                "Action space {} is not supported for DDPG.".format(
                    action_space))

        self.config = config
        self.cur_epsilon = 1.0
        dim_actions = action_space.shape[0]
        low_action = action_space.low
        high_action = action_space.high
        self.actor_optimizer = tf.train.AdamOptimizer(
            learning_rate=config["actor_lr"])
        self.critic_optimizer = tf.train.AdamOptimizer(
            learning_rate=config["critic_lr"])

        # Action inputs
        self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
        self.eps = tf.placeholder(tf.float32, (), name="eps")
        self.cur_observations = tf.placeholder(
            tf.float32, shape=(None, ) + observation_space.shape)

        # Actor: P (policy) network
        with tf.variable_scope(P_SCOPE) as scope:
            p_values = _build_p_network(registry, self.cur_observations,
                                        dim_actions, config)
            self.p_func_vars = _scope_vars(scope.name)

        # Action outputs
        with tf.variable_scope(A_SCOPE):
            self.output_actions = _build_action_network(
                p_values, low_action, high_action, self.stochastic, self.eps,
                config["exploration_theta"], config["exploration_sigma"])

        # Op that zeroes the exploration-noise variable created above.
        with tf.variable_scope(A_SCOPE, reuse=True):
            exploration_sample = tf.get_variable(name="ornstein_uhlenbeck")
            self.reset_noise_op = tf.assign(exploration_sample,
                                            dim_actions * [.0])

        # Replay inputs
        self.obs_t = tf.placeholder(
            tf.float32,
            shape=(None, ) + observation_space.shape,
            name="observation")
        self.act_t = tf.placeholder(
            tf.float32, shape=(None, ) + action_space.shape, name="action")
        self.rew_t = tf.placeholder(tf.float32, [None], name="reward")
        self.obs_tp1 = tf.placeholder(
            tf.float32, shape=(None, ) + observation_space.shape)
        self.done_mask = tf.placeholder(tf.float32, [None], name="done")
        self.importance_weights = tf.placeholder(
            tf.float32, [None], name="weight")

        # p network evaluation
        with tf.variable_scope(P_SCOPE, reuse=True) as scope:
            self.p_t = _build_p_network(
                registry, self.obs_t, dim_actions, config)

        # target p network evaluation
        with tf.variable_scope(P_TARGET_SCOPE) as scope:
            p_tp1 = _build_p_network(
                registry, self.obs_tp1, dim_actions, config)
            target_p_func_vars = _scope_vars(scope.name)

        # Action outputs. The flag is passed as the `stochastic` argument,
        # so False selects the noise-free (deterministic) actions.
        with tf.variable_scope(A_SCOPE, reuse=True):
            deterministic_flag = tf.constant(value=False, dtype=tf.bool)
            zero_eps = tf.constant(value=.0, dtype=tf.float32)
            output_actions = _build_action_network(
                self.p_t, low_action, high_action, deterministic_flag,
                zero_eps, config["exploration_theta"],
                config["exploration_sigma"])
            output_actions_estimated = _build_action_network(
                p_tp1, low_action, high_action, deterministic_flag,
                zero_eps, config["exploration_theta"],
                config["exploration_sigma"])

        # q network evaluation
        with tf.variable_scope(Q_SCOPE) as scope:
            q_t = _build_q_network(
                registry, self.obs_t, self.act_t, config)
            self.q_func_vars = _scope_vars(scope.name)
        with tf.variable_scope(Q_SCOPE, reuse=True):
            q_tp0 = _build_q_network(
                registry, self.obs_t, output_actions, config)

        # target q network evaluation
        with tf.variable_scope(Q_TARGET_SCOPE) as scope:
            q_tp1 = _build_q_network(
                registry, self.obs_tp1, output_actions_estimated, config)
            target_q_func_vars = _scope_vars(scope.name)

        # Drop the trailing size-1 axis of the Q outputs.
        q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1)

        q_tp1_best = tf.squeeze(
            input=q_tp1, axis=len(q_tp1.shape) - 1)
        q_tp1_best_masked = (1.0 - self.done_mask) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = (
            self.rew_t + config["gamma"]**config["n_step"] * q_tp1_best_masked)

        # compute the error (potentially clipped)
        self.td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        if config.get("use_huber"):
            errors = _huber_loss(self.td_error, config.get("huber_threshold"))
        else:
            errors = 0.5 * tf.square(self.td_error)

        self.loss = tf.reduce_mean(self.importance_weights * errors)

        # for policy gradient
        self.actor_loss = -1.0 * tf.reduce_mean(q_tp0)

        # L2 regularization on every non-bias variable of actor and critic.
        if config["l2_reg"] is not None:
            for var in self.p_func_vars:
                if "bias" not in var.name:
                    self.actor_loss += (
                        config["l2_reg"] * 0.5 * tf.nn.l2_loss(var))
            for var in self.q_func_vars:
                if "bias" not in var.name:
                    self.loss += config["l2_reg"] * 0.5 * tf.nn.l2_loss(
                        var)

        # update_target_expr blends the online Q and P networks into their
        # target networks: target <- tau * online + (1 - tau) * target.
        # Variables are paired by sorting both lists by name.
        self.tau_value = config.get("tau")
        self.tau = tf.placeholder(tf.float32, (), name="tau")
        update_target_expr = []
        for var, var_target in zip(
                sorted(self.q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(
                var_target.assign(self.tau * var +
                                  (1.0 - self.tau) * var_target))
        for var, var_target in zip(
                sorted(self.p_func_vars, key=lambda v: v.name),
                sorted(target_p_func_vars, key=lambda v: v.name)):
            update_target_expr.append(
                var_target.assign(self.tau * var +
                                  (1.0 - self.tau) * var_target))
        self.update_target_expr = tf.group(*update_target_expr)

        self.sess = tf.get_default_session()
        self.loss_inputs = [
            ("obs", self.obs_t),
            ("actions", self.act_t),
            ("rewards", self.rew_t),
            ("new_obs", self.obs_tp1),
            ("dones", self.done_mask),
            ("weights", self.importance_weights),
        ]
        self.is_training = tf.placeholder_with_default(True, ())
        TFPolicyGraph.__init__(
            self, self.sess, obs_input=self.cur_observations,
            action_sampler=self.output_actions, loss=self.loss,
            loss_inputs=self.loss_inputs, is_training=self.is_training)
        self.sess.run(tf.global_variables_initializer())

        # Note that this encompasses both the policy and Q-value networks and
        # their corresponding target networks
        self.variables = ray.experimental.TensorFlowVariables(
            tf.group(q_tp0, q_tp1), self.sess)

        # Hard initial update
        self.update_target(tau=1.0)

    def gradients(self, optimizer):
        """Return the actor and critic (gradient, variable) pairs.

        The ``optimizer`` argument is not used: the actor and critic each
        use their own Adam optimizer. Gradients are norm-clipped when
        ``grad_norm_clipping`` is set, and pairs whose gradient is ``None``
        are dropped.
        """
        if self.config["grad_norm_clipping"] is not None:
            actor_grads_and_vars = _minimize_and_clip(
                self.actor_optimizer,
                self.actor_loss,
                var_list=self.p_func_vars,
                clip_val=self.config["grad_norm_clipping"])
            critic_grads_and_vars = _minimize_and_clip(
                self.critic_optimizer,
                self.loss,
                var_list=self.q_func_vars,
                clip_val=self.config["grad_norm_clipping"])
        else:
            actor_grads_and_vars = self.actor_optimizer.compute_gradients(
                self.actor_loss, var_list=self.p_func_vars)
            critic_grads_and_vars = self.critic_optimizer.compute_gradients(
                self.loss, var_list=self.q_func_vars)
        actor_grads_and_vars = [
            (g, v) for (g, v) in actor_grads_and_vars if g is not None]
        critic_grads_and_vars = [
            (g, v) for (g, v) in critic_grads_and_vars if g is not None]
        grads_and_vars = actor_grads_and_vars + critic_grads_and_vars
        return grads_and_vars

    def extra_compute_action_feed_dict(self):
        """Return the feeds that enable noise scaled by ``cur_epsilon``."""
        return {
            self.stochastic: True,
            self.eps: self.cur_epsilon,
        }

    def extra_compute_grad_fetches(self):
        """Return the extra ``td_error`` fetch."""
        return {
            "td_error": self.td_error,
        }

    def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
        """Delegate to ``_postprocess_dqn``; other agents are ignored."""
        return _postprocess_dqn(self, sample_batch)

    def compute_td_error(self, obs_t, act_t, rew_t, obs_tp1, done_mask,
                         importance_weights):
        """Evaluate ``self.td_error`` for one batch of transitions."""
        td_err = self.sess.run(
            self.td_error,
            feed_dict={
                self.obs_t: [np.array(ob) for ob in obs_t],
                self.act_t: act_t,
                self.rew_t: rew_t,
                self.obs_tp1: [np.array(ob) for ob in obs_tp1],
                self.done_mask: done_mask,
                self.importance_weights: importance_weights
            })
        return td_err

    def reset_noise(self, sess):
        """Zero the exploration-noise variable using the given session."""
        sess.run(self.reset_noise_op)

    # support both hard and soft sync
    def update_target(self, tau=None):
        """Blend the online networks into the target networks.

        Args:
            tau: blend factor; 1.0 is a hard copy. ``None`` selects the
                configured ``tau``.
        """
        # Compare against None explicitly so that an explicit tau of 0.0 is
        # honored instead of being replaced by the configured value.
        if tau is None:
            tau = self.tau_value
        return self.sess.run(
            self.update_target_expr, feed_dict={self.tau: tau})

    def set_epsilon(self, epsilon):
        """Set the noise scale fed as ``eps`` when computing actions."""
        self.cur_epsilon = epsilon

    def get_weights(self):
        """Return the weights tracked by ``self.variables``."""
        return self.variables.get_weights()

    def set_weights(self, weights):
        """Load ``weights`` into the variables tracked by ``self.variables``."""
        self.variables.set_weights(weights)

    def get_state(self):
        """Return ``[base TFPolicyGraph state, current epsilon]``."""
        return [TFPolicyGraph.get_state(self), self.cur_epsilon]

    def set_state(self, state):
        """Restore a state produced by ``get_state``."""
        TFPolicyGraph.set_state(self, state[0])
        self.set_epsilon(state[1])
-391
View File
@@ -1,391 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import tensorflow as tf
import tensorflow.contrib.layers as layers
from ray.rllib.models import ModelCatalog
def _build_p_network(registry, inputs, dim_actions, config):
    """Map an observation (i.e., state) to an action.

    Each entry of the returned action takes a value from (0, 1) because the
    output layer uses a sigmoid activation.
    """
    features = ModelCatalog.get_model(
        registry, inputs, 1, config["model"]).last_layer
    hidden_sizes = config["actor_hiddens"]

    net = features
    for size in hidden_sizes:
        net = layers.fully_connected(
            net, num_outputs=size, activation_fn=tf.nn.relu)

    # Sigmoid output layer: values are bounded within (0, 1) and the shape
    # is [batch_size, dim_actions].
    return layers.fully_connected(
        net, num_outputs=dim_actions, activation_fn=tf.nn.sigmoid)
# The policy is stochastic for inference but deterministic for training, so
# the batch-size issue of the noise sample (one vector of length
# low_action.size) is ignored when constructing the stochastic action.
def _build_action_network(p_values, low_action, high_action, stochastic, eps,
                          theta, sigma):
    """Rescale (0, 1) policy outputs to actions, optionally adding noise.

    The noise state lives in the ``ornstein_uhlenbeck`` variable and is
    advanced by ``theta * (0 - state) + sigma * normal_sample``. The noisy
    actions are returned when ``stochastic`` is true, the plain rescaled
    actions otherwise.
    """
    span = high_action - low_action
    # shape is [None, dim_action]
    deterministic_actions = span * p_values + low_action

    ou_state = tf.get_variable(
        name="ornstein_uhlenbeck",
        dtype=tf.float32,
        initializer=low_action.size * [.0],
        trainable=False)
    unit_normal = tf.random_normal(
        shape=[low_action.size], mean=0.0, stddev=1.0)
    ou_increment = theta * (.0 - ou_state) + sigma * unit_normal
    ou_value = tf.assign_add(ou_state, ou_increment)

    stochastic_actions = deterministic_actions + eps * span * ou_value

    def _stochastic_branch():
        return stochastic_actions

    def _deterministic_branch():
        return deterministic_actions

    return tf.cond(stochastic, _stochastic_branch, _deterministic_branch)
def _build_q_network(registry, inputs, action_inputs, config):
    """Build the critic network; returns one linear output per input row.

    Observation features (the model's last layer) and ``action_inputs`` are
    concatenated on axis 1 before the ``config["critic_hiddens"]`` ReLU
    layers.
    """
    features = ModelCatalog.get_model(
        registry, inputs, 1, config["model"]).last_layer
    hidden_sizes = config["critic_hiddens"]

    net = tf.concat([features, action_inputs], axis=1)
    for size in hidden_sizes:
        net = layers.fully_connected(
            net, num_outputs=size, activation_fn=tf.nn.relu)
    return layers.fully_connected(net, num_outputs=1, activation_fn=None)
def _huber_loss(x, delta=1.0):
    """Element-wise Huber loss of ``x`` with threshold ``delta``.

    Quadratic (``0.5 * x**2``) where ``|x| < delta`` and linear
    (``delta * (|x| - 0.5 * delta)``) elsewhere.

    Reference: https://en.wikipedia.org/wiki/Huber_loss
    """
    within_threshold = tf.abs(x) < delta
    quadratic_part = tf.square(x) * 0.5
    linear_part = delta * (tf.abs(x) - 0.5 * delta)
    return tf.where(within_threshold, quadratic_part, linear_part)
def _minimize_and_clip(optimizer, objective, var_list, clip_val=10):
    """Compute gradients of `objective` and clip each one by norm.

    Gradients are taken with `optimizer` w.r.t. the variables in `var_list`;
    every gradient that is not None is clipped so that its norm is at most
    `clip_val`. Returns the list of (gradient, variable) pairs.
    """
    grads_and_vars = optimizer.compute_gradients(objective, var_list=var_list)
    for index, pair in enumerate(grads_and_vars):
        grad, var = pair
        if grad is None:
            # No gradient for this variable: leave the entry untouched.
            continue
        grads_and_vars[index] = (tf.clip_by_norm(grad, clip_val), var)
    return grads_and_vars
def _scope_vars(scope, trainable_only=False):
    """
    Get variables inside a scope
    The scope can be specified as a string

    Parameters
    ----------
    scope: str or VariableScope
        scope in which the variables reside.
    trainable_only: bool
        whether or not to return only the variables that were marked as
        trainable.

    Returns
    -------
    vars: [tf.Variable]
        list of variables in `scope`.
    """
    # GLOBAL_VARIABLES replaces the deprecated tf.GraphKeys.VARIABLES alias
    # and names the same collection.
    if trainable_only:
        collection = tf.GraphKeys.TRAINABLE_VARIABLES
    else:
        collection = tf.GraphKeys.GLOBAL_VARIABLES
    scope_name = scope if isinstance(scope, str) else scope.name
    return tf.get_collection(collection, scope=scope_name)
class ModelAndLoss(object):
    """Holds the model and loss function.

    Both graphs are necessary in order for the multi-gpu SGD implementation
    to create towers on each device.
    """

    def __init__(self, registry, dim_actions, low_action, high_action, config,
                 obs_t, act_t, rew_t, obs_tp1, done_mask, importance_weights):
        """Build the networks on the replay inputs and define both losses.

        Sets ``td_error``, ``loss`` (importance-weighted critic loss) and
        ``actor_loss`` (negated mean Q of the policy's own actions).
        """
        # p network evaluation (reuses the existing "p_func" variables)
        with tf.variable_scope("p_func", reuse=True):
            self.p_t = _build_p_network(registry, obs_t, dim_actions, config)

        # target p network evaluation
        with tf.variable_scope("target_p_func") as target_p_scope:
            self.p_tp1 = _build_p_network(registry, obs_tp1, dim_actions,
                                          config)
            self.target_p_func_vars = _scope_vars(target_p_scope.name)

        # Action outputs
        with tf.variable_scope("a_func", reuse=True):
            # Passed as the `stochastic` argument: False selects the
            # deterministic actions.
            deterministic_flag = tf.constant(value=False, dtype=tf.bool)
            zero_eps = tf.constant(value=.0, dtype=tf.float32)

            def _deterministic_actions(p_values):
                return _build_action_network(
                    p_values, low_action, high_action, deterministic_flag,
                    zero_eps, config["exploration_theta"],
                    config["exploration_sigma"])

            output_actions = _deterministic_actions(self.p_t)
            output_actions_estimated = _deterministic_actions(self.p_tp1)

        # q network evaluation
        with tf.variable_scope("q_func") as q_scope:
            self.q_t = _build_q_network(registry, obs_t, act_t, config)
            self.q_func_vars = _scope_vars(q_scope.name)
        with tf.variable_scope("q_func", reuse=True):
            self.q_tp0 = _build_q_network(registry, obs_t, output_actions,
                                          config)

        # target q network evaluation
        with tf.variable_scope("target_q_func") as target_q_scope:
            self.q_tp1 = _build_q_network(registry, obs_tp1,
                                          output_actions_estimated, config)
            self.target_q_func_vars = _scope_vars(target_q_scope.name)

        # Drop the trailing axis of the Q outputs.
        q_t_selected = tf.squeeze(self.q_t, axis=len(self.q_t.shape) - 1)
        q_tp1_best = tf.squeeze(
            input=self.q_tp1, axis=len(self.q_tp1.shape) - 1)
        q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best

        # compute RHS of bellman equation
        discount = config["gamma"]**config["n_step"]
        q_t_selected_target = rew_t + discount * q_tp1_best_masked

        # compute the error (potentially clipped)
        self.td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        if config.get("use_huber"):
            errors = _huber_loss(self.td_error, config.get("huber_threshold"))
        else:
            errors = 0.5 * tf.square(self.td_error)

        self.loss = tf.reduce_mean(importance_weights * errors)

        # for policy gradient
        self.actor_loss = -1.0 * tf.reduce_mean(self.q_tp0)
class DDPGGraph(object):
    """TensorFlow graph for DDPG.

    Builds the actor (policy) network and the action-sampling op, the replay
    placeholders, the actor/critic losses (via ``ModelAndLoss``), the
    gradient and train ops for both optimizers, and the op that blends the
    online weights into the target networks.
    """

    def __init__(self, registry, env, config, logdir):
        """Construct all placeholders and ops.

        Args:
            registry: forwarded to the network builders and ModelAndLoss.
            env: environment; only its ``action_space`` and
                ``observation_space`` are read here.
            config (dict): reads "actor_lr", "critic_lr",
                "exploration_theta", "exploration_sigma", "l2_reg",
                "grad_norm_clipping" and "tau"; the whole dict is also
                forwarded to the builders.
            logdir: not used by this constructor.
        """
        self.env = env
        dim_actions = env.action_space.shape[0]
        low_action = env.action_space.low
        high_action = env.action_space.high
        # Actor and critic are trained by separate optimizers so that they
        # can use different learning rates.
        actor_optimizer = tf.train.AdamOptimizer(
            learning_rate=config["actor_lr"])
        critic_optimizer = tf.train.AdamOptimizer(
            learning_rate=config["critic_lr"])

        # Action inputs
        self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
        self.eps = tf.placeholder(tf.float32, (), name="eps")
        self.cur_observations = tf.placeholder(
            tf.float32, shape=(None, ) + env.observation_space.shape)

        # Actor: P (policy) network
        p_scope_name = "p_func"
        with tf.variable_scope(p_scope_name) as scope:
            p_values = _build_p_network(registry, self.cur_observations,
                                        dim_actions, config)
            p_func_vars = _scope_vars(scope.name)

        # Action outputs
        a_scope_name = "a_func"
        with tf.variable_scope(a_scope_name):
            self.output_actions = _build_action_network(
                p_values, low_action, high_action, self.stochastic, self.eps,
                config["exploration_theta"], config["exploration_sigma"])

        # Re-enter the action scope to fetch the existing noise variable and
        # build an op that zeroes it.
        with tf.variable_scope(a_scope_name, reuse=True):
            exploration_sample = tf.get_variable(name="ornstein_uhlenbeck")
            self.reset_noise_op = tf.assign(exploration_sample,
                                            dim_actions * [.0])

        # Replay inputs
        self.obs_t = tf.placeholder(
            tf.float32,
            shape=(None, ) + env.observation_space.shape,
            name="observation")
        self.act_t = tf.placeholder(
            tf.float32, shape=(None, ) + env.action_space.shape, name="action")
        self.rew_t = tf.placeholder(tf.float32, [None], name="reward")
        self.obs_tp1 = tf.placeholder(
            tf.float32, shape=(None, ) + env.observation_space.shape)
        self.done_mask = tf.placeholder(tf.float32, [None], name="done")
        self.importance_weights = tf.placeholder(
            tf.float32, [None], name="weight")

        def build_loss(obs_t, act_t, rew_t, obs_tp1, done_mask,
                       importance_weights):
            # Closure over registry/config so the loss can be rebuilt on
            # other input tensors (it is exposed below as self.build_loss).
            return ModelAndLoss(registry, dim_actions, low_action, high_action,
                                config, obs_t, act_t, rew_t, obs_tp1,
                                done_mask, importance_weights)

        # (name, placeholder) pairs for every replay input of the loss.
        self.loss_inputs = [
            ("obs", self.obs_t),
            ("actions", self.act_t),
            ("rewards", self.rew_t),
            ("new_obs", self.obs_tp1),
            ("dones", self.done_mask),
            ("weights", self.importance_weights),
        ]

        loss_obj = build_loss(self.obs_t, self.act_t, self.rew_t, self.obs_tp1,
                              self.done_mask, self.importance_weights)

        self.build_loss = build_loss

        actor_loss = loss_obj.actor_loss
        weighted_error = loss_obj.loss
        q_func_vars = loss_obj.q_func_vars
        target_p_func_vars = loss_obj.target_p_func_vars
        target_q_func_vars = loss_obj.target_q_func_vars
        self.p_t = loss_obj.p_t
        self.q_t = loss_obj.q_t
        self.q_tp0 = loss_obj.q_tp0
        self.q_tp1 = loss_obj.q_tp1
        self.td_error = loss_obj.td_error

        # Optional L2 penalty on every variable whose name does not contain
        # "bias".
        if config["l2_reg"] is not None:
            for var in p_func_vars:
                if "bias" not in var.name:
                    actor_loss += config["l2_reg"] * 0.5 * tf.nn.l2_loss(var)
            for var in q_func_vars:
                if "bias" not in var.name:
                    weighted_error += config["l2_reg"] * 0.5 * tf.nn.l2_loss(
                        var)

        # compute optimization op (potentially with gradient clipping)
        if config["grad_norm_clipping"] is not None:
            self.actor_grads_and_vars = _minimize_and_clip(
                actor_optimizer,
                actor_loss,
                var_list=p_func_vars,
                clip_val=config["grad_norm_clipping"])
            self.critic_grads_and_vars = _minimize_and_clip(
                critic_optimizer,
                weighted_error,
                var_list=q_func_vars,
                clip_val=config["grad_norm_clipping"])
        else:
            self.actor_grads_and_vars = actor_optimizer.compute_gradients(
                actor_loss, var_list=p_func_vars)
            self.critic_grads_and_vars = critic_optimizer.compute_gradients(
                weighted_error, var_list=q_func_vars)
        # Drop variables that received no gradient.
        self.actor_grads_and_vars = [(g, v)
                                     for (g, v) in self.actor_grads_and_vars
                                     if g is not None]
        self.critic_grads_and_vars = [(g, v)
                                      for (g, v) in self.critic_grads_and_vars
                                      if g is not None]
        # Actor gradients come first; apply_gradients() relies on this order.
        self.grads_and_vars = (
            self.actor_grads_and_vars + self.critic_grads_and_vars)
        self.grads = [g for (g, v) in self.grads_and_vars]
        self.actor_train_expr = actor_optimizer.apply_gradients(
            self.actor_grads_and_vars)
        self.critic_train_expr = critic_optimizer.apply_gradients(
            self.critic_grads_and_vars)

        # update_target_fn will be called periodically to copy Q network to
        # target Q network
        self.tau_value = config.get("tau")
        self.tau = tf.placeholder(tf.float32, (), name="tau")
        update_target_expr = []
        # Online and target variables are paired by sorted name.
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(
                var_target.assign(self.tau * var +
                                  (1.0 - self.tau) * var_target))
        for var, var_target in zip(
                sorted(p_func_vars, key=lambda v: v.name),
                sorted(target_p_func_vars, key=lambda v: v.name)):
            update_target_expr.append(
                var_target.assign(self.tau * var +
                                  (1.0 - self.tau) * var_target))
        self.update_target_expr = tf.group(*update_target_expr)

    # support both hard and soft sync
    def update_target(self, sess, tau=None):
        """Run the target-update op, blending with weight ``tau``.

        Args:
            sess: session used to run the op.
            tau: value fed to the ``tau`` placeholder. When None, the
                configured ``tau_value`` is used. An explicit 0.0 is fed
                as-is instead of being replaced by the default.

        Returns:
            Whatever ``sess.run`` returns for the update op.
        """
        if tau is None:
            tau = self.tau_value
        return sess.run(
            self.update_target_expr,
            feed_dict={self.tau: tau})

    def act(self, sess, obs, eps, stochastic=True):
        """Evaluate the action op for ``obs`` with the given ``eps``."""
        return sess.run(
            self.output_actions,
            feed_dict={
                self.cur_observations: obs,
                self.stochastic: stochastic,
                self.eps: eps
            })

    def compute_gradients(self, sess, obs_t, act_t, rew_t, obs_tp1, done_mask,
                          importance_weights):
        """Return ``(td_error, grads)`` for one replay batch.

        ``grads`` follows the order of ``self.grads`` (actor, then critic).
        """
        td_err, grads = sess.run(
            [self.td_error, self.grads],
            feed_dict={
                self.obs_t: obs_t,
                self.act_t: act_t,
                self.rew_t: rew_t,
                self.obs_tp1: obs_tp1,
                self.done_mask: done_mask,
                self.importance_weights: importance_weights
            })
        return td_err, grads

    def compute_td_error(self, sess, obs_t, act_t, rew_t, obs_tp1, done_mask,
                         importance_weights):
        """Return the TD error for one replay batch without training.

        Each observation is converted with ``np.array`` before feeding.
        """
        td_err = sess.run(
            self.td_error,
            feed_dict={
                self.obs_t: [np.array(ob) for ob in obs_t],
                self.act_t: act_t,
                self.rew_t: rew_t,
                self.obs_tp1: [np.array(ob) for ob in obs_tp1],
                self.done_mask: done_mask,
                self.importance_weights: importance_weights
            })
        return td_err

    def apply_gradients(self, sess, grads):
        """Apply externally computed gradients.

        ``grads`` must be ordered like ``self.grads``; each value is fed in
        place of the corresponding gradient tensor before running both train
        ops.
        """
        assert len(grads) == len(self.grads_and_vars)
        feed_dict = {ph: g for (g, ph) in zip(grads, self.grads)}
        sess.run(
            [self.critic_train_expr, self.actor_train_expr],
            feed_dict=feed_dict)

    def compute_apply(self, sess, obs_t, act_t, rew_t, obs_tp1, done_mask,
                      importance_weights):
        """Run one critic and actor train step; return the TD error."""
        td_err, _, _ = sess.run(
            [self.td_error, self.critic_train_expr, self.actor_train_expr],
            feed_dict={
                self.obs_t: obs_t,
                self.act_t: act_t,
                self.rew_t: rew_t,
                self.obs_tp1: obs_tp1,
                self.done_mask: done_mask,
                self.importance_weights: importance_weights
            })
        return td_err

    def reset_noise(self, sess):
        """Zero the exploration-noise variable."""
        sess.run(self.reset_noise_op)
+2 -4
View File
@@ -9,7 +9,7 @@ from ray.rllib.ddpg2.models import DDPGModel
from ray.rllib.models.catalog import ModelCatalog
from ray.rllib.optimizers import PolicyEvaluator
from ray.rllib.utils.filter import NoFilter
from ray.rllib.utils.process_rollout import process_rollout
from ray.rllib.utils.process_rollout import compute_advantages
from ray.rllib.utils.sampler import SyncSampler
@@ -34,9 +34,7 @@ class DDPGEvaluator(PolicyEvaluator):
# since each sample is one step, no discounting needs to be applied;
# this does not involve config["gamma"]
samples = process_rollout(
rollout, NoFilter(),
gamma=1.0, use_gae=False)
samples = compute_advantages(rollout, 0.0, gamma=1.0, use_gae=False)
return samples
+5 -2
View File
@@ -227,7 +227,7 @@ class DDPGActorCritic():
self.critic_vars.set_weights(critic_weights)
self.actor_vars.set_weights(actor_weights)
def compute(self, ob):
def compute_single_action(self, ob, h, is_training):
"""Returns action, given state."""
flattened_ob = np.reshape(ob, [-1, np.prod(ob.shape)])
action = self.sess.run(self.output_action, {self.obs: flattened_ob})
@@ -235,7 +235,10 @@ class DDPGActorCritic():
action += self.epsilon * self.rand_process.sample()
if (self.epsilon > 0):
self.epsilon -= self.config["noise_epsilon"]
return action[0], {}
return action[0], [], {}
def value(self, *args):
    """Return a constant 0; every argument is ignored."""
    return 0
def get_initial_state(self):
    """Return an empty list."""
    # NOTE(review): presumably "no recurrent state" -- confirm against caller.
    return []
+18 -18
View File
@@ -9,26 +9,26 @@ from ray.utils import merge_dicts
APEX_DEFAULT_CONFIG = merge_dicts(
DQN_CONFIG,
{
'optimizer_class': 'ApexOptimizer',
'optimizer_config':
"optimizer_class": "ApexOptimizer",
"optimizer_config":
merge_dicts(
DQN_CONFIG['optimizer_config'], {
'max_weight_sync_delay': 400,
'num_replay_buffer_shards': 4,
'debug': False
DQN_CONFIG["optimizer_config"], {
"max_weight_sync_delay": 400,
"num_replay_buffer_shards": 4,
"debug": False
}),
'n_step': 3,
'gpu': True,
'num_workers': 32,
'buffer_size': 2000000,
'learning_starts': 50000,
'train_batch_size': 512,
'sample_batch_size': 50,
'max_weight_sync_delay': 400,
'target_network_update_freq': 500000,
'timesteps_per_iteration': 25000,
'per_worker_exploration': True,
'worker_side_prioritization': True,
"n_step": 3,
"gpu": True,
"num_workers": 32,
"buffer_size": 2000000,
"learning_starts": 50000,
"train_batch_size": 512,
"sample_batch_size": 50,
"max_weight_sync_delay": 400,
"target_network_update_freq": 500000,
"timesteps_per_iteration": 25000,
"per_worker_exploration": True,
"worker_side_prioritization": True,
},
)
+97 -117
View File
@@ -5,14 +5,13 @@ from __future__ import print_function
import pickle
import os
import numpy as np
import tensorflow as tf
import ray
from ray.rllib import optimizers
from ray.rllib.dqn.dqn_evaluator import DQNEvaluator
from ray.rllib.dqn.common.schedules import ConstantSchedule, LinearSchedule
from ray.rllib.dqn.dqn_policy_graph import DQNPolicyGraph
from ray.rllib.utils.common_policy_evaluator import CommonPolicyEvaluator, \
collect_metrics
from ray.rllib.agent import Agent
from ray.tune.result import TrainingResult
from ray.tune.trial import Resources
@@ -24,101 +23,84 @@ OPTIMIZER_SHARED_CONFIGS = [
DEFAULT_CONFIG = {
# === Model ===
# Whether to use dueling dqn
'dueling': True,
"dueling": True,
# Whether to use double dqn
'double_q': True,
"double_q": True,
# Hidden layer sizes of the state and action value networks
'hiddens': [256],
"hiddens": [256],
# N-step Q learning
'n_step': 1,
"n_step": 1,
# Config options to pass to the model constructor
'model': {},
"model": {},
# Discount factor for the MDP
'gamma': 0.99,
"gamma": 0.99,
# Arguments to pass to the env creator
'env_config': {},
"env_config": {},
# === Exploration ===
# Max num timesteps for annealing schedules. Exploration is annealed from
# 1.0 to exploration_fraction over this number of timesteps scaled by
# exploration_fraction
'schedule_max_timesteps': 100000,
"schedule_max_timesteps": 100000,
# Number of env steps to optimize for before returning
'timesteps_per_iteration': 1000,
"timesteps_per_iteration": 1000,
# Fraction of entire training period over which the exploration rate is
# annealed
'exploration_fraction': 0.1,
"exploration_fraction": 0.1,
# Final value of random action probability
'exploration_final_eps': 0.02,
"exploration_final_eps": 0.02,
# Update the target network every `target_network_update_freq` steps.
'target_network_update_freq': 500,
# Whether to start with random actions instead of noops.
'random_starts': True,
"target_network_update_freq": 500,
# === Replay buffer ===
# Size of the replay buffer. Note that if async_updates is set, then
# each worker will have a replay buffer of this size.
'buffer_size': 50000,
"buffer_size": 50000,
# If True prioritized replay buffer will be used.
'prioritized_replay': True,
"prioritized_replay": True,
# Alpha parameter for prioritized replay buffer.
'prioritized_replay_alpha': 0.6,
"prioritized_replay_alpha": 0.6,
# Beta parameter for sampling from prioritized replay buffer.
'prioritized_replay_beta': 0.4,
"prioritized_replay_beta": 0.4,
# Epsilon to add to the TD errors when updating priorities.
'prioritized_replay_eps': 1e-6,
"prioritized_replay_eps": 1e-6,
# Whether to clip rewards to [-1, 1] prior to adding to the replay buffer.
'clip_rewards': True,
"clip_rewards": True,
# === Optimization ===
# Learning rate for adam optimizer
'lr': 5e-4,
"lr": 5e-4,
# If not None, clip gradients during optimization at this value
'grad_norm_clipping': 40,
"grad_norm_clipping": 40,
# How many steps of the model to sample before learning starts.
'learning_starts': 1000,
"learning_starts": 1000,
# Update the replay buffer with this many samples at once. Note that
# this setting applies per-worker if num_workers > 1.
'sample_batch_size': 4,
"sample_batch_size": 4,
# Size of a batched sampled from replay buffer for training. Note that
# if async_updates is set, then each worker returns gradients for a
# batch of this size.
'train_batch_size': 32,
# Smooth the current average reward over this many previous episodes.
'smoothing_num_episodes': 100,
# === Tensorflow ===
# Arguments to pass to tensorflow
'tf_session_args': {
"device_count": {"CPU": 2},
"log_device_placement": False,
"allow_soft_placement": True,
"gpu_options": {
"allow_growth": True
},
"inter_op_parallelism_threads": 1,
"intra_op_parallelism_threads": 1,
},
"train_batch_size": 32,
# === Parallelism ===
# Whether to use a GPU for local optimization.
'gpu': False,
"gpu": False,
# Number of workers for collecting samples with. This only makes sense
# to increase if your environment is particularly slow to sample, or if
# you're using the Async or Ape-X optimizers.
'num_workers': 0,
# you're using the Async or Ape-X optimizers.
"num_workers": 0,
# Whether to allocate GPUs for workers (if > 0).
'num_gpus_per_worker': 0,
"num_gpus_per_worker": 0,
# Whether to allocate CPUs for workers (if > 0).
'num_cpus_per_worker': 1,
"num_cpus_per_worker": 1,
# Optimizer class to use.
'optimizer_class': "LocalSyncReplayOptimizer",
"optimizer_class": "LocalSyncReplayOptimizer",
# Config to pass to the optimizer.
'optimizer_config': {},
"optimizer_config": {},
# Whether to use a distribution of epsilons across workers for exploration.
'per_worker_exploration': False,
"per_worker_exploration": False,
# Whether to compute priorities on workers.
'worker_side_prioritization': False
"worker_side_prioritization": False
}
@@ -127,6 +109,7 @@ class DQNAgent(Agent):
_allow_unknown_subkeys = [
"model", "optimizer", "tf_session_args", "env_config"]
_default_config = DEFAULT_CONFIG
_policy_graph = DQNPolicyGraph
@classmethod
def default_resource_request(cls, config):
@@ -137,16 +120,31 @@ class DQNAgent(Agent):
extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])
def _init(self):
self.local_evaluator = DQNEvaluator(
self.registry, self.env_creator, self.config, self.logdir, 0)
remote_cls = ray.remote(
adjusted_batch_size = (
self.config["sample_batch_size"] + self.config["n_step"] - 1)
self.local_evaluator = CommonPolicyEvaluator(
self.env_creator, self._policy_graph,
batch_steps=adjusted_batch_size,
batch_mode="pack_episodes", preprocessor_pref="deepmind",
compress_observations=True,
registry=self.registry, env_config=self.config["env_config"],
model_config=self.config["model"], policy_config=self.config)
remote_cls = CommonPolicyEvaluator.as_remote(
num_cpus=self.config["num_cpus_per_worker"],
num_gpus=self.config["num_gpus_per_worker"])(
DQNEvaluator)
num_gpus=self.config["num_gpus_per_worker"])
self.remote_evaluators = [
remote_cls.remote(
self.registry, self.env_creator, self.config, self.logdir,
i)
self.env_creator, self._policy_graph,
batch_steps=adjusted_batch_size,
batch_mode="pack_episodes", preprocessor_pref="deepmind",
compress_observations=True,
registry=self.registry, env_config=self.config["env_config"],
model_config=self.config["model"], policy_config=self.config)
for _ in range(self.config["num_workers"])]
self.exploration0 = self._make_exploration_schedule(0)
self.explorations = [
self._make_exploration_schedule(i)
for i in range(self.config["num_workers"])]
for k in OPTIMIZER_SHARED_CONFIGS:
@@ -157,10 +155,25 @@ class DQNAgent(Agent):
self.config["optimizer_config"], self.local_evaluator,
self.remote_evaluators)
self.saver = tf.train.Saver(max_to_keep=None)
self.last_target_update_ts = 0
self.num_target_updates = 0
def _make_exploration_schedule(self, worker_index):
    """Build the epsilon schedule for the worker at ``worker_index``.

    With "per_worker_exploration" enabled each worker gets a constant
    epsilon of ``0.4 ** (1 + 7 * worker_index / (num_workers - 1))``;
    otherwise a linear schedule from 1.0 down to "exploration_final_eps" is
    returned.
    """
    cfg = self.config

    # Default case: one shared, linearly annealed schedule.
    if not cfg["per_worker_exploration"]:
        anneal_steps = int(
            cfg["exploration_fraction"] * cfg["schedule_max_timesteps"])
        return LinearSchedule(
            schedule_timesteps=anneal_steps,
            initial_p=1.0,
            final_p=cfg["exploration_final_eps"])

    # Use a different, fixed `eps` per worker.
    assert cfg["num_workers"] > 1, \
        "This requires multiple workers"
    exponent = 1 + worker_index / float(cfg["num_workers"] - 1) * 7
    return ConstantSchedule(0.4 ** exponent)
@property
def global_timestep(self):
return self.optimizer.num_steps_sampled
@@ -168,7 +181,7 @@ class DQNAgent(Agent):
def update_target_if_needed(self):
if self.global_timestep - self.last_target_update_ts > \
self.config["target_network_update_freq"]:
self.local_evaluator.update_target()
self.local_evaluator.for_policy(lambda p: p.update_target())
self.last_target_update_ts = self.global_timestep
self.num_target_updates += 1
@@ -177,58 +190,25 @@ class DQNAgent(Agent):
while (self.global_timestep - start_timestep <
self.config["timesteps_per_iteration"]):
self.optimizer.step()
self.update_target_if_needed()
self.local_evaluator.set_global_timestep(self.global_timestep)
for e in self.remote_evaluators:
e.set_global_timestep.remote(self.global_timestep)
exp_vals = [self.exploration0.value(self.global_timestep)]
self.local_evaluator.for_policy(
lambda p: p.set_epsilon(exp_vals[0]))
for i, e in enumerate(self.remote_evaluators):
exp_val = self.explorations[i].value(self.global_timestep)
e.for_policy.remote(lambda p: p.set_epsilon(exp_val))
exp_vals.append(exp_val)
return self._train_stats(start_timestep)
def _train_stats(self, start_timestep):
if self.remote_evaluators:
stats = ray.get([
e.stats.remote() for e in self.remote_evaluators])
else:
stats = self.local_evaluator.stats()
if not isinstance(stats, list):
stats = [stats]
mean_100ep_reward = 0.0
mean_100ep_length = 0.0
num_episodes = 0
explorations = []
if self.config["per_worker_exploration"]:
# Return stats from workers with the lowest 20% of exploration
test_stats = stats[-int(max(1, len(stats)*0.2)):]
else:
test_stats = stats
for s in test_stats:
mean_100ep_reward += s["mean_100ep_reward"] / len(test_stats)
mean_100ep_length += s["mean_100ep_length"] / len(test_stats)
for s in stats:
num_episodes += s["num_episodes"]
explorations.append(s["exploration"])
opt_stats = self.optimizer.stats()
result = TrainingResult(
episode_reward_mean=mean_100ep_reward,
episode_len_mean=mean_100ep_length,
episodes_total=num_episodes,
timesteps_this_iter=self.global_timestep - start_timestep,
result = collect_metrics(
self.local_evaluator, self.remote_evaluators)
return result._replace(
info=dict({
"min_exploration": min(explorations),
"max_exploration": max(explorations),
"min_exploration": min(exp_vals),
"max_exploration": max(exp_vals),
"num_target_updates": self.num_target_updates,
}, **opt_stats))
return result
}, **self.optimizer.stats()))
def _stop(self):
# workaround for https://github.com/ray-project/ray/issues/1516
@@ -236,10 +216,8 @@ class DQNAgent(Agent):
ev.__ray_terminate__.remote()
def _save(self, checkpoint_dir):
checkpoint_path = self.saver.save(
self.local_evaluator.sess,
os.path.join(checkpoint_dir, "checkpoint"),
global_step=self.iteration)
checkpoint_path = os.path.join(
checkpoint_dir, "checkpoint-{}".format(self.iteration))
extra_data = [
self.local_evaluator.save(),
ray.get([e.save.remote() for e in self.remote_evaluators]),
@@ -250,7 +228,6 @@ class DQNAgent(Agent):
return checkpoint_path
def _restore(self, checkpoint_path):
self.saver.restore(self.local_evaluator.sess, checkpoint_path)
extra_data = pickle.load(open(checkpoint_path + ".extra_data", "rb"))
self.local_evaluator.restore(extra_data[0])
ray.get([
@@ -260,6 +237,9 @@ class DQNAgent(Agent):
self.num_target_updates = extra_data[3]
self.last_target_update_ts = extra_data[4]
def compute_action(self, observation):
return self.local_evaluator.dqn_graph.act(
self.local_evaluator.sess, np.array(observation)[None], 0.0)[0]
def compute_action(self, observation, state=None):
    """Compute one action for ``observation`` using the local policy.

    ``state`` defaults to an empty list. Only the first element of the
    policy's ``compute_single_action`` result is returned.
    """
    policy_state = [] if state is None else state

    def _act(policy):
        outputs = policy.compute_single_action(
            observation, policy_state, is_training=False)
        return outputs[0]

    return self.local_evaluator.for_policy(_act)
-207
View File
@@ -1,207 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from gym.spaces import Discrete
import numpy as np
import tensorflow as tf
import ray
from ray.rllib.utils.error import UnsupportedSpaceException
from ray.rllib.dqn import models
from ray.rllib.dqn.common.wrappers import wrap_dqn
from ray.rllib.dqn.common.schedules import ConstantSchedule, LinearSchedule
from ray.rllib.optimizers import SampleBatch, PolicyEvaluator
from ray.rllib.utils.compression import pack
def adjust_nstep(n_step, gamma, obs, actions, rewards, new_obs, dones):
    """Rewrites the given trajectory fragments to encode n-step rewards.

    reward[i] = (
        reward[i] * gamma**0 +
        reward[i+1] * gamma**1 +
        ... +
        reward[i+n_step-1] * gamma**(n_step-1))

    The ith new_obs is also adjusted to point to the (i+n_step-1)'th new obs.

    If the episode finishes inside the window, the reward sum is truncated at
    the terminal step and dones[i] takes the terminal flag, so the rewritten
    transition is itself marked as ending the episode. All five lists are
    modified in place; after this rewrite they are shortened by (n_step - 1),
    or emptied when they are shorter than that.
    """
    for i in range(len(rewards) - n_step + 1):
        if dones[i]:
            continue  # episode end
        for j in range(1, n_step):
            new_obs[i] = new_obs[i + j]
            rewards[i] += gamma ** j * rewards[i + j]
            if dones[i + j]:
                # Propagate the terminal flag: without it the consumer of
                # this transition would still bootstrap from new_obs[i].
                dones[i] = dones[i + j]
                break  # episode end
    # truncate ends of the trajectory; clamp so that a fragment shorter than
    # the window does not turn into a negative slice start
    new_len = max(0, len(obs) - n_step + 1)
    for arr in [obs, actions, rewards, new_obs, dones]:
        del arr[new_len:]
class DQNEvaluator(PolicyEvaluator):
    """The DQN Evaluator.

    Owns one wrapped environment, one TensorFlow session and one DQN graph.
    It steps the environment to produce n-step-adjusted sample batches and
    exposes gradient, weight and checkpoint helpers on top of the graph.

    TODO(rliaw): Support observation/reward filters?"""

    def __init__(self, registry, env_creator, config, logdir, worker_index):
        """Create the env, session, graph and exploration schedule.

        Raises:
            UnsupportedSpaceException: if the wrapped env's action space is
                not ``Discrete``.
        """
        env = env_creator(config["env_config"])
        env = wrap_dqn(registry, env, config["model"], config["random_starts"])
        self.env = env
        self.config = config

        if not isinstance(env.action_space, Discrete):
            raise UnsupportedSpaceException(
                "Action space {} is not supported for DQN.".format(
                    env.action_space))

        tf_config = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=tf_config)
        self.dqn_graph = models.DQNGraph(registry, env, config, logdir)

        # Use either a different `eps` per worker, or a linear schedule.
        if config["per_worker_exploration"]:
            assert config["num_workers"] > 1, "This requires multiple workers"
            self.exploration = ConstantSchedule(
                0.4 ** (
                    1 + worker_index / float(config["num_workers"] - 1) * 7))
        else:
            self.exploration = LinearSchedule(
                schedule_timesteps=int(
                    config["exploration_fraction"] *
                    config["schedule_max_timesteps"]),
                initial_p=1.0,
                final_p=config["exploration_final_eps"])

        # Initialize the parameters and copy them to the target network.
        self.sess.run(tf.global_variables_initializer())
        self.dqn_graph.update_target(self.sess)
        self.global_timestep = 0
        self.local_timestep = 0

        # Note that this encompasses both the Q and target network
        self.variables = ray.experimental.TensorFlowVariables(
            tf.group(self.dqn_graph.q_t, self.dqn_graph.q_tp1), self.sess)

        # The last entry of each list belongs to the episode in progress;
        # _step() appends a fresh 0.0 whenever an episode ends.
        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None

        self.obs = self.env.reset()

    def set_global_timestep(self, global_timestep):
        """Store the timestep used to evaluate the exploration schedule."""
        self.global_timestep = global_timestep

    def update_target(self):
        """Run the graph's target-network update in this session."""
        self.dqn_graph.update_target(self.sess)

    def sample(self):
        """Step the env and return a SampleBatch of "sample_batch_size" rows.

        ``n_step - 1`` extra steps are taken so that the batch still has
        "sample_batch_size" rows after ``adjust_nstep`` truncates it.
        """
        obs, actions, rewards, new_obs, dones = [], [], [], [], []
        for _ in range(
                self.config["sample_batch_size"] + self.config["n_step"] - 1):
            ob, act, rew, ob1, done = self._step(self.global_timestep)
            obs.append(ob)
            actions.append(act)
            rewards.append(rew)
            new_obs.append(ob1)
            dones.append(done)

        # N-step Q adjustments
        if self.config["n_step"] > 1:
            # Adjust for steps lost from truncation
            self.local_timestep -= (self.config["n_step"] - 1)
            adjust_nstep(
                self.config["n_step"], self.config["gamma"],
                obs, actions, rewards, new_obs, dones)

        # Observations are stored through pack(); initial weights are all 1.
        batch = SampleBatch({
            "obs": [pack(np.array(o)) for o in obs], "actions": actions,
            "rewards": rewards,
            "new_obs": [pack(np.array(o)) for o in new_obs], "dones": dones,
            "weights": np.ones_like(rewards)})
        assert (batch.count == self.config["sample_batch_size"])

        # Prioritize on the worker side
        if self.config["worker_side_prioritization"]:
            # The raw (un-packed) observation lists are used here.
            td_errors = self.dqn_graph.compute_td_error(
                self.sess, obs, batch["actions"], batch["rewards"],
                new_obs, batch["dones"], batch["weights"])
            new_priorities = (
                np.abs(td_errors) + self.config["prioritized_replay_eps"])
            batch.data["weights"] = new_priorities

        return batch

    def compute_gradients(self, samples):
        """Return ``(grads, {"td_error": ...})`` for a sample batch."""
        td_err, grads = self.dqn_graph.compute_gradients(
            self.sess, samples["obs"], samples["actions"], samples["rewards"],
            samples["new_obs"], samples["dones"], samples["weights"])
        return grads, {"td_error": td_err}

    def apply_gradients(self, grads):
        """Apply previously computed gradients through the graph."""
        self.dqn_graph.apply_gradients(self.sess, grads)

    def compute_apply(self, samples):
        """Compute and apply gradients; return ``{"td_error": ...}``."""
        td_error = self.dqn_graph.compute_apply(
            self.sess, samples["obs"], samples["actions"], samples["rewards"],
            samples["new_obs"], samples["dones"], samples["weights"])
        return {"td_error": td_error}

    def get_weights(self):
        """Return the weights tracked by ``self.variables``."""
        return self.variables.get_weights()

    def set_weights(self, weights):
        """Load ``weights`` into the variables tracked by ``self.variables``."""
        self.variables.set_weights(weights)

    def _step(self, global_timestep):
        """Takes a single step, and returns the result of the step.

        Returns:
            ``(obs, action, reward, new_obs, done)`` with ``done`` as a
            float. The env is reset when the episode ends.
        """
        action = self.dqn_graph.act(
            self.sess, np.array(self.obs)[None],
            self.exploration.value(global_timestep))[0]
        new_obs, rew, done, _ = self.env.step(action)
        ret = (self.obs, action, rew, new_obs, float(done))
        self.obs = new_obs
        self.episode_rewards[-1] += rew
        self.episode_lengths[-1] += 1
        if done:
            self.obs = self.env.reset()
            self.episode_rewards.append(0.0)
            self.episode_lengths.append(0.0)
        self.local_timestep += 1
        return ret

    def stats(self):
        """Return smoothed episode statistics as a dict.

        The means cover up to "smoothing_num_episodes" completed episodes;
        the ``[-n:-1]`` slice leaves out the in-progress last entry.
        "num_episodes" is the length of the reward list, so it counts that
        in-progress entry as well.
        """
        n = self.config["smoothing_num_episodes"] + 1
        mean_100ep_reward = round(np.mean(self.episode_rewards[-n:-1]), 5)
        mean_100ep_length = round(np.mean(self.episode_lengths[-n:-1]), 5)
        exploration = self.exploration.value(self.global_timestep)
        return {
            "mean_100ep_reward": mean_100ep_reward,
            "mean_100ep_length": mean_100ep_length,
            "num_episodes": len(self.episode_rewards),
            "exploration": exploration,
            "local_timestep": self.local_timestep,
        }

    def save(self):
        """Return the evaluator state as a list, in the order restore() reads."""
        return [
            self.exploration,
            self.episode_rewards,
            self.episode_lengths,
            self.saved_mean_reward,
            self.obs,
            self.global_timestep,
            self.local_timestep]

    def restore(self, data):
        """Restore state from a list produced by save()."""
        self.exploration = data[0]
        self.episode_rewards = data[1]
        self.episode_lengths = data[2]
        self.saved_mean_reward = data[3]
        self.obs = data[4]
        self.global_timestep = data[5]
        self.local_timestep = data[6]
@@ -2,13 +2,240 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from gym.spaces import Discrete
import numpy as np
import tensorflow as tf
import tensorflow.contrib.layers as layers
from ray.rllib.models import ModelCatalog
from ray.rllib.optimizers.multi_gpu_impl import TOWER_SCOPE_NAME
from ray.rllib.optimizers.sample_batch import SampleBatch
from ray.rllib.utils.error import UnsupportedSpaceException
from ray.rllib.utils.tf_policy_graph import TFPolicyGraph
Q_SCOPE = "q_func"
Q_TARGET_SCOPE = "target_q_func"
def adjust_nstep(n_step, gamma, obs, actions, rewards, new_obs, dones):
    """Rewrites the given trajectory fragments to encode n-step rewards.

    reward[i] = (
        reward[i] * gamma**0 +
        reward[i+1] * gamma**1 +
        ... +
        reward[i+n_step-1] * gamma**(n_step-1))

    The ith new_obs is also adjusted to point to the (i+n_step-1)'th new obs.

    If the episode finishes inside the window, the reward sum is truncated at
    the terminal step and dones[i] takes the terminal flag, so the rewritten
    transition is itself marked as ending the episode. All five lists are
    modified in place; after this rewrite they are shortened by (n_step - 1),
    or emptied when they are shorter than that.
    """
    for i in range(len(rewards) - n_step + 1):
        if dones[i]:
            continue  # episode end
        for j in range(1, n_step):
            new_obs[i] = new_obs[i + j]
            rewards[i] += gamma ** j * rewards[i + j]
            if dones[i + j]:
                # Propagate the terminal flag: without it the TD target
                # would still bootstrap from new_obs[i] via the done mask.
                dones[i] = dones[i + j]
                break  # episode end
    # truncate ends of the trajectory; clamp so that a fragment shorter than
    # the window does not turn into a negative slice start
    new_len = max(0, len(obs) - n_step + 1)
    for arr in [obs, actions, rewards, new_obs, dones]:
        del arr[new_len:]
class DQNPolicyGraph(TFPolicyGraph):
def __init__(self, observation_space, action_space, registry, config):
if not isinstance(action_space, Discrete):
raise UnsupportedSpaceException(
"Action space {} is not supported for DQN.".format(
action_space))
self.config = config
self.cur_epsilon = 1.0
num_actions = action_space.n
# Action inputs
self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
self.eps = tf.placeholder(tf.float32, (), name="eps")
self.cur_observations = tf.placeholder(
tf.float32, shape=(None,) + observation_space.shape)
# Action Q network
with tf.variable_scope(Q_SCOPE) as scope:
q_values = _build_q_network(
registry, self.cur_observations, num_actions, config)
self.q_func_vars = _scope_vars(scope.name)
# Action outputs
self.output_actions = _build_action_network(
q_values,
self.cur_observations,
num_actions,
self.stochastic,
self.eps)
# Replay inputs
self.obs_t = tf.placeholder(
tf.float32, shape=(None,) + observation_space.shape)
self.act_t = tf.placeholder(tf.int32, [None], name="action")
self.rew_t = tf.placeholder(tf.float32, [None], name="reward")
self.obs_tp1 = tf.placeholder(
tf.float32, shape=(None,) + observation_space.shape)
self.done_mask = tf.placeholder(tf.float32, [None], name="done")
self.importance_weights = tf.placeholder(
tf.float32, [None], name="weight")
# q network evaluation
with tf.variable_scope(Q_SCOPE, reuse=True):
q_t = _build_q_network(
registry, self.obs_t, num_actions, config)
# target q network evalution
with tf.variable_scope(Q_TARGET_SCOPE) as scope:
q_tp1 = _build_q_network(
registry, self.obs_tp1, num_actions, config)
self.target_q_func_vars = _scope_vars(scope.name)
# q scores for actions which we know were selected in the given state.
q_t_selected = tf.reduce_sum(
q_t * tf.one_hot(self.act_t, num_actions), 1)
# compute estimate of best possible value starting from state at t + 1
if config["double_q"]:
with tf.variable_scope(Q_SCOPE, reuse=True):
q_tp1_using_online_net = _build_q_network(
registry, self.obs_tp1, num_actions, config)
q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
q_tp1_best = tf.reduce_sum(
q_tp1 * tf.one_hot(
q_tp1_best_using_online_net, num_actions), 1)
else:
q_tp1_best = tf.reduce_max(q_tp1, 1)
q_tp1_best_masked = (1.0 - self.done_mask) * q_tp1_best
# compute RHS of bellman equation
q_t_selected_target = (
self.rew_t +
config["gamma"] ** config["n_step"] * q_tp1_best_masked)
# compute the error (potentially clipped)
self.td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
self.loss = tf.reduce_mean(
self.importance_weights * _huber_loss(self.td_error))
# update_target_fn will be called periodically to copy Q network to
# target Q network
update_target_expr = []
for var, var_target in zip(
sorted(self.q_func_vars, key=lambda v: v.name),
sorted(self.target_q_func_vars, key=lambda v: v.name)):
update_target_expr.append(var_target.assign(var))
self.update_target_expr = tf.group(*update_target_expr)
# initialize TFPolicyGraph
self.sess = tf.get_default_session()
self.loss_inputs = [
("obs", self.obs_t),
("actions", self.act_t),
("rewards", self.rew_t),
("new_obs", self.obs_tp1),
("dones", self.done_mask),
("weights", self.importance_weights),
]
self.is_training = tf.placeholder_with_default(True, ())
TFPolicyGraph.__init__(
self, self.sess, obs_input=self.cur_observations,
action_sampler=self.output_actions, loss=self.loss,
loss_inputs=self.loss_inputs, is_training=self.is_training)
self.sess.run(tf.global_variables_initializer())
def optimizer(self):
return tf.train.AdamOptimizer(learning_rate=self.config["lr"])
def gradients(self, optimizer):
if self.config["grad_norm_clipping"] is not None:
grads_and_vars = _minimize_and_clip(
optimizer, self.loss, var_list=self.q_func_vars,
clip_val=self.config["grad_norm_clipping"])
else:
grads_and_vars = optimizer.compute_gradients(
self.loss, var_list=self.q_func_vars)
grads_and_vars = [
(g, v) for (g, v) in grads_and_vars if g is not None]
return grads_and_vars
def extra_compute_action_feed_dict(self):
return {
self.stochastic: True,
self.eps: self.cur_epsilon,
}
def extra_compute_grad_fetches(self):
return {
"td_error": self.td_error,
}
def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
return _postprocess_dqn(self, sample_batch)
def compute_td_error(
        self, obs_t, act_t, rew_t, obs_tp1, done_mask, importance_weights):
    """Evaluate the TD error tensor for a batch of transitions.

    Each observation in `obs_t` and `obs_tp1` is converted with `np.array`
    before being fed; the other inputs are fed as given.

    Returns:
        The result of running `self.td_error` in `self.sess`.
    """
    current_obs = [np.array(ob) for ob in obs_t]
    next_obs = [np.array(ob) for ob in obs_tp1]
    feed = {
        self.obs_t: current_obs,
        self.act_t: act_t,
        self.rew_t: rew_t,
        self.obs_tp1: next_obs,
        self.done_mask: done_mask,
        self.importance_weights: importance_weights,
    }
    return self.sess.run(self.td_error, feed_dict=feed)
def update_target(self):
    """Run the op that copies the Q network into the target Q network."""
    sync_op = self.update_target_expr
    return self.sess.run(sync_op)
def set_epsilon(self, epsilon):
    """Record `epsilon` as the exploration value fed on later action calls.

    The value is stored in `self.cur_epsilon`.
    """
    self.cur_epsilon = epsilon
def get_state(self):
    """Return `[base TFPolicyGraph state, current epsilon]`."""
    base_state = TFPolicyGraph.get_state(self)
    return [base_state, self.cur_epsilon]
def set_state(self, state):
    """Restore a state produced by `get_state`.

    Args:
        state: Sequence whose first item is the base TFPolicyGraph state
            and whose second item is the exploration epsilon.
    """
    base_state = state[0]
    TFPolicyGraph.set_state(self, base_state)
    epsilon = state[1]
    self.set_epsilon(epsilon)
def _postprocess_dqn(policy_graph, sample_batch):
    """Turn a raw sample batch into a DQN training batch.

    Applies the n-step adjustment when `config["n_step"] > 1`, attaches a
    "weights" column of ones, checks the batch size against
    `config["sample_batch_size"]`, and, when
    `config["worker_side_prioritization"]` is set, replaces the weights
    with `|td_error| + config["prioritized_replay_eps"]`.

    Args:
        policy_graph: Policy graph providing `config` and
            `compute_td_error`.
        sample_batch: Batch exposing `columns([...])` for the obs, actions,
            rewards, new_obs and dones columns.

    Returns:
        The resulting SampleBatch.
    """
    config = policy_graph.config
    columns = sample_batch.columns(
        ["obs", "actions", "rewards", "new_obs", "dones"])
    obs, actions, rewards, new_obs, dones = (list(col) for col in columns)

    # N-step Q adjustments (the lists are passed to adjust_nstep and its
    # return value is not used)
    n_step = config["n_step"]
    if n_step > 1:
        adjust_nstep(
            n_step, config["gamma"], obs, actions, rewards, new_obs, dones)

    batch = SampleBatch({
        "obs": obs,
        "actions": actions,
        "rewards": rewards,
        "new_obs": new_obs,
        "dones": dones,
        "weights": np.ones_like(rewards),
    })
    expected_count = config["sample_batch_size"]
    assert batch.count == expected_count, (batch.count, expected_count)

    if not config["worker_side_prioritization"]:
        return batch

    # Prioritize on the worker side
    td_errors = policy_graph.compute_td_error(
        batch["obs"], batch["actions"], batch["rewards"],
        batch["new_obs"], batch["dones"], batch["weights"])
    batch.data["weights"] = (
        np.abs(td_errors) + config["prioritized_replay_eps"])
    return batch
def _build_q_network(registry, inputs, num_actions, config):
@@ -98,205 +325,3 @@ def _scope_vars(scope, trainable_only=False):
tf.GraphKeys.TRAINABLE_VARIABLES
if trainable_only else tf.GraphKeys.VARIABLES,
scope=scope if isinstance(scope, str) else scope.name)
class ModelAndLoss(object):
    """Holds the model and loss function.

    Both graphs are necessary in order for the multi-gpu SGD implementation
    to create towers on each device.

    Attributes set by the constructor: `q_t`, `q_tp1`,
    `target_q_func_vars`, `td_error` and `loss`.
    """

    def __init__(
            self, registry, num_actions, config,
            obs_t, act_t, rew_t, obs_tp1, done_mask, importance_weights):
        # q network evaluation (reuses the existing "q_func" variables)
        with tf.variable_scope("q_func", reuse=True):
            self.q_t = _build_q_network(registry, obs_t, num_actions, config)

        # target q network evaluation
        with tf.variable_scope("target_q_func") as target_scope:
            self.q_tp1 = _build_q_network(
                registry, obs_tp1, num_actions, config)
            self.target_q_func_vars = _scope_vars(target_scope.name)

        # q scores for actions which we know were selected in the given state.
        taken_mask = tf.one_hot(act_t, num_actions)
        q_t_selected = tf.reduce_sum(self.q_t * taken_mask, 1)

        # compute estimate of best possible value starting from state at t + 1
        if not config["double_q"]:
            q_tp1_best = tf.reduce_max(self.q_tp1, 1)
        else:
            # Double Q: the online network picks the action, the target
            # network supplies its value.
            with tf.variable_scope("q_func", reuse=True):
                online_q_tp1 = _build_q_network(
                    registry, obs_tp1, num_actions, config)
            greedy_actions = tf.argmax(online_q_tp1, 1)
            q_tp1_best = tf.reduce_sum(
                self.q_tp1 * tf.one_hot(greedy_actions, num_actions), 1)
        bootstrap_value = (1.0 - done_mask) * q_tp1_best

        # compute RHS of bellman equation
        discount = config["gamma"] ** config["n_step"]
        bellman_target = rew_t + discount * bootstrap_value

        # compute the error (potentially clipped)
        self.td_error = q_t_selected - tf.stop_gradient(bellman_target)
        self.loss = tf.reduce_mean(
            importance_weights * _huber_loss(self.td_error))
class DQNGraph(object):
    """TensorFlow graph for DQN: action sampling, TD loss and target sync.

    The constructor creates the action network and its inputs, the replay
    placeholders, the loss tower (a ModelAndLoss), the gradient and train
    ops, and the op that copies the Q network variables into the target Q
    network. The other methods run those ops in a caller-supplied session.
    """

    def __init__(self, registry, env, config, logdir):
        """Build the graph.

        Args:
            registry: Passed through to the network builders.
            env: Environment; its action space size and observation shape
                are read.
            config: Dict read for "lr" and "grad_norm_clipping" and passed
                on to the network and loss builders.
            logdir: Not used in the constructor body.
        """
        self.env = env
        n_actions = env.action_space.n
        adam = tf.train.AdamOptimizer(learning_rate=config["lr"])

        # Action inputs
        self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
        self.eps = tf.placeholder(tf.float32, (), name="eps")
        self.cur_observations = tf.placeholder(
            tf.float32, shape=(None,) + env.observation_space.shape)

        # Action Q network
        with tf.variable_scope(TOWER_SCOPE_NAME + "/q_func") as q_scope:
            q_values = _build_q_network(
                registry, self.cur_observations, n_actions, config)
            online_vars = _scope_vars(q_scope.name)

        # Action outputs
        self.output_actions = _build_action_network(
            q_values, self.cur_observations, n_actions,
            self.stochastic, self.eps)

        # Replay inputs
        self.obs_t = tf.placeholder(
            tf.float32, shape=(None,) + env.observation_space.shape)
        self.act_t = tf.placeholder(tf.int32, [None], name="action")
        self.rew_t = tf.placeholder(tf.float32, [None], name="reward")
        self.obs_tp1 = tf.placeholder(
            tf.float32, shape=(None,) + env.observation_space.shape)
        self.done_mask = tf.placeholder(tf.float32, [None], name="done")
        self.importance_weights = tf.placeholder(
            tf.float32, [None], name="weight")

        def build_loss(
                obs_t, act_t, rew_t, obs_tp1, done_mask, importance_weights):
            return ModelAndLoss(
                registry, n_actions, config,
                obs_t, act_t, rew_t, obs_tp1, done_mask, importance_weights)

        # (name, placeholder) pairs, in the argument order of build_loss.
        replay_phs = [
            self.obs_t, self.act_t, self.rew_t,
            self.obs_tp1, self.done_mask, self.importance_weights]
        replay_names = [
            "obs", "actions", "rewards", "new_obs", "dones", "weights"]
        self.loss_inputs = list(zip(replay_names, replay_phs))

        with tf.variable_scope(TOWER_SCOPE_NAME):
            tower = build_loss(*replay_phs)
        self.build_loss = build_loss

        target_vars = tower.target_q_func_vars
        self.q_t = tower.q_t
        self.q_tp1 = tower.q_tp1
        self.td_error = tower.td_error

        # compute optimization op (potentially with gradient clipping)
        clip_val = config["grad_norm_clipping"]
        if clip_val is None:
            pairs = adam.compute_gradients(tower.loss, var_list=online_vars)
        else:
            pairs = _minimize_and_clip(
                adam, tower.loss, var_list=online_vars, clip_val=clip_val)
        self.grads_and_vars = [(g, v) for (g, v) in pairs if g is not None]
        self.grads = [pair[0] for pair in self.grads_and_vars]
        self.train_expr = adam.apply_gradients(self.grads_and_vars)

        # update_target_fn will be called periodically to copy Q network to
        # target Q network; variables are paired after sorting by name.
        def by_name(var):
            return var.name

        copy_ops = [
            target.assign(online)
            for online, target in zip(
                sorted(online_vars, key=by_name),
                sorted(target_vars, key=by_name))]
        self.update_target_expr = tf.group(*copy_ops)

    def update_target(self, sess):
        """Run the op copying the Q network into the target Q network."""
        sync_op = self.update_target_expr
        return sess.run(sync_op)

    def act(self, sess, obs, eps, stochastic=True):
        """Run the action output op for `obs` with the given exploration."""
        feed = {
            self.cur_observations: obs,
            self.stochastic: stochastic,
            self.eps: eps,
        }
        return sess.run(self.output_actions, feed_dict=feed)

    def compute_gradients(
            self, sess, obs_t, act_t, rew_t, obs_tp1, done_mask,
            importance_weights):
        """Return `(td_error, grads)` for a batch without applying them."""
        feed = {
            self.obs_t: obs_t,
            self.act_t: act_t,
            self.rew_t: rew_t,
            self.obs_tp1: obs_tp1,
            self.done_mask: done_mask,
            self.importance_weights: importance_weights,
        }
        td_err, grads = sess.run(
            [self.td_error, self.grads], feed_dict=feed)
        return td_err, grads

    def compute_td_error(
            self, sess, obs_t, act_t, rew_t, obs_tp1, done_mask,
            importance_weights):
        """Return the TD error for a batch.

        Unlike the other batch methods, each observation is converted with
        `np.array` before being fed.
        """
        feed = {
            self.obs_t: [np.array(ob) for ob in obs_t],
            self.act_t: act_t,
            self.rew_t: rew_t,
            self.obs_tp1: [np.array(ob) for ob in obs_tp1],
            self.done_mask: done_mask,
            self.importance_weights: importance_weights,
        }
        return sess.run(self.td_error, feed_dict=feed)

    def apply_gradients(self, sess, grads):
        """Run the train op, feeding `grads` in place of the gradient tensors.

        `grads` must have one entry per pair in `self.grads_and_vars`.
        """
        assert len(grads) == len(self.grads_and_vars)
        sess.run(self.train_expr, feed_dict=dict(zip(self.grads, grads)))

    def compute_apply(
            self, sess, obs_t, act_t, rew_t, obs_tp1, done_mask,
            importance_weights):
        """Run one training step on a batch and return its TD error."""
        feed = {
            self.obs_t: obs_t,
            self.act_t: act_t,
            self.rew_t: rew_t,
            self.obs_tp1: obs_tp1,
            self.done_mask: done_mask,
            self.importance_weights: importance_weights,
        }
        td_err, _ = sess.run(
            [self.td_error, self.train_expr], feed_dict=feed)
        return td_err
+2 -2
View File
@@ -35,8 +35,8 @@ class LSTM(Model):
lstm = rnn.rnn_cell.BasicLSTMCell(size, state_is_tuple=True)
step_size = tf.shape(self.x)[:1]
c_init = np.zeros((1, lstm.state_size.c), np.float32)
h_init = np.zeros((1, lstm.state_size.h), np.float32)
c_init = np.zeros(lstm.state_size.c, np.float32)
h_init = np.zeros(lstm.state_size.h, np.float32)
self.state_init = [c_init, h_init]
c_in = tf.placeholder(tf.float32, [1, lstm.state_size.c])
h_in = tf.placeholder(tf.float32, [1, lstm.state_size.h])
+2 -6
View File
@@ -7,18 +7,14 @@ import numpy as np
import torch
def convert_batch(trajectory, has_features=False):
def convert_batch(trajectory):
"""Convert trajectory from numpy to PT variable"""
states = torch.from_numpy(trajectory["obs"]).float()
acs = torch.from_numpy(trajectory["actions"])
advs = torch.from_numpy(
trajectory["advantages"].copy()).float().reshape(-1)
rs = torch.from_numpy(trajectory["rewards"]).float().reshape(-1)
if has_features:
features = [torch.from_numpy(f) for f in trajectory["features"]]
else:
features = trajectory["features"]
return states, acs, advs, rs, features
return states, acs, advs, rs
def var_to_np(var):
@@ -43,7 +43,7 @@ class LocalSyncParallelOptimizer(object):
processed.
build_loss: Function that takes the specified inputs and returns an
object with a 'loss' property that is a scalar Tensor. For example,
ray.rllib.ppo.ProximalPolicyLoss.
ray.rllib.ppo.ProximalPolicyGraph.
logdir: Directory to place debugging output in.
grad_norm_clipping: None or int stdev to clip grad norms by
"""
@@ -38,18 +38,24 @@ class PolicyOptimizer(object):
Args:
evaluator_cls (class): Python class of the evaluators to create.
evaluator_args (list): List of constructor args for the evaluators.
evaluator_args (list|dict): Constructor args for the evaluators.
num_workers (int): Number of remote evaluators to create in
addition to a local evaluator. This can be zero or greater.
optimizer_config (dict): Keyword arguments to pass to the
optimizer class constructor.
"""
local_evaluator = evaluator_cls(*evaluator_args)
remote_cls = ray.remote(**evaluator_resources)(evaluator_cls)
remote_evaluators = [
remote_cls.remote(*evaluator_args)
for _ in range(num_workers)]
if isinstance(evaluator_args, list):
local_evaluator = evaluator_cls(*evaluator_args)
remote_evaluators = [
remote_cls.remote(*evaluator_args)
for _ in range(num_workers)]
else:
local_evaluator = evaluator_cls(**evaluator_args)
remote_evaluators = [
remote_cls.remote(**evaluator_args)
for _ in range(num_workers)]
return cls(optimizer_config, local_evaluator, remote_evaluators)
def __init__(self, config, local_evaluator, remote_evaluators):
+13 -8
View File
@@ -2,17 +2,22 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
import numpy as np
def arrayify(s):
    """Convert `s` to a numpy array unless it is already a plain value.

    Values whose exact type is int, float, str or np.ndarray are returned
    unchanged (subclasses such as bool do not match and are converted).
    Lists are converted element by element, recursively, so that nested
    objects are turned into arrays first. Anything else goes through
    `np.array` directly.
    """
    kind = type(s)
    if kind is list:
        # recursive call to convert LazyFrames to arrays
        return np.array([arrayify(item) for item in s])
    if kind in (int, float, str, np.ndarray):
        return s
    return np.array(s)
class SampleBatchBuilder(object):
    """Util to build a SampleBatch incrementally.

    Values are appended column by column with `add_values()`; `build()`
    converts every column to a numpy array and wraps them in a SampleBatch.
    """

    def __init__(self):
        # Maps column name -> list of values appended so far.
        self.buffers = collections.defaultdict(list)

    def add_values(self, **values):
        """Append one value to each column named by a keyword argument."""
        for column in values:
            self.buffers[column].append(values[column])

    def build(self):
        """Return a SampleBatch of the accumulated columns as numpy arrays.

        The internal buffers are not cleared by this call.
        """
        arrays = {}
        for column, items in self.buffers.items():
            arrays[column] = np.array(items)
        return SampleBatch(arrays)
class SampleBatch(object):
+23 -32
View File
@@ -2,13 +2,11 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import ray
from ray.rllib.optimizers import LocalSyncOptimizer
from ray.rllib.pg.pg_evaluator import PGEvaluator
from ray.rllib.agent import Agent
from ray.tune.result import TrainingResult
from ray.rllib.optimizers import LocalSyncOptimizer
from ray.rllib.pg.pg_policy_graph import PGPolicyGraph
from ray.rllib.utils.common_policy_evaluator import CommonPolicyEvaluator, \
collect_metrics
from ray.tune.trial import Resources
@@ -33,7 +31,6 @@ DEFAULT_CONFIG = {
class PGAgent(Agent):
"""Simple policy gradient agent.
This is an example agent to show how to implement algorithms in RLlib.
@@ -50,34 +47,28 @@ class PGAgent(Agent):
def _init(self):
self.optimizer = LocalSyncOptimizer.make(
evaluator_cls=PGEvaluator,
evaluator_args=[self.registry, self.env_creator, self.config],
evaluator_cls=CommonPolicyEvaluator,
evaluator_args={
"env_creator": self.env_creator,
"policy_graph": PGPolicyGraph,
"batch_steps": self.config["batch_size"],
"batch_mode": "truncate_episodes",
"registry": self.registry,
"model_config": self.config["model"],
"env_config": self.config["env_config"],
"policy_config": self.config,
},
num_workers=self.config["num_workers"],
optimizer_config=self.config["optimizer"])
def _train(self):
self.optimizer.step()
return collect_metrics(
self.optimizer.local_evaluator, self.optimizer.remote_evaluators)
episode_rewards = []
episode_lengths = []
metric_lists = [a.get_completed_rollout_metrics.remote()
for a in self.optimizer.remote_evaluators]
for metrics in metric_lists:
for episode in ray.get(metrics):
episode_lengths.append(episode.episode_length)
episode_rewards.append(episode.episode_reward)
avg_reward = np.mean(episode_rewards)
avg_length = np.mean(episode_lengths)
timesteps = np.sum(episode_lengths)
result = TrainingResult(
episode_reward_mean=avg_reward,
episode_len_mean=avg_length,
timesteps_this_iter=timesteps,
info={})
return result
def compute_action(self, obs):
action, info = self.optimizer.local_evaluator.policy.compute(obs)
return action
def compute_action(self, observation, state=None):
if state is None:
state = []
return self.local_evaluator.for_policy(
lambda p: p.compute_single_action(
observation, state, is_training=False)[0])
-56
View File
@@ -1,56 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ray.rllib.models.catalog import ModelCatalog
from ray.rllib.optimizers import PolicyEvaluator
from ray.rllib.pg.policy import PGPolicy
from ray.rllib.utils.filter import NoFilter
from ray.rllib.utils.process_rollout import process_rollout
from ray.rllib.utils.sampler import SyncSampler
class PGEvaluator(PolicyEvaluator):
    """Evaluator for simple policy gradient."""

    def __init__(self, registry, env_creator, config):
        """Create the wrapped env, the PG policy and a synchronous sampler.

        Args:
            registry: Passed to the model catalog and the policy.
            env_creator: Callable taking `config["env_config"]` and
                returning an env.
            config: Dict read for "env_config", "model", "batch_size",
                "horizon" and (later) "gamma".
        """
        raw_env = env_creator(config["env_config"])
        self.env = ModelCatalog.get_preprocessor_as_wrapper(
            registry, raw_env, config["model"])
        self.config = config
        self.policy = PGPolicy(
            registry, self.env.observation_space,
            self.env.action_space, config)
        obs_filter = NoFilter()
        self.sampler = SyncSampler(
            self.env, self.policy, obs_filter,
            config["batch_size"], horizon=config["horizon"])

    def sample(self):
        """Fetch a rollout from the sampler and post-process it."""
        rollout = self.sampler.get_data()
        return process_rollout(
            rollout, NoFilter(),
            gamma=self.config["gamma"], use_gae=False)

    def get_completed_rollout_metrics(self):
        """Returns metrics on previously completed rollouts.

        Calling this clears the queue of completed rollout metrics.
        """
        metrics = self.sampler.get_metrics()
        return metrics

    def compute_gradients(self, samples):
        """Returns gradient w.r.t. samples, paired with an empty info dict."""
        # The policy's own info value is discarded.
        gradient, _ = self.policy.compute_gradients(samples)
        return gradient, {}

    def apply_gradients(self, grads):
        """Applies gradients to evaluator weights."""
        self.policy.apply_gradients(grads)

    def get_weights(self):
        """Returns model weights."""
        weights = self.policy.get_weights()
        return weights

    def set_weights(self, weights):
        """Sets model weights."""
        return self.policy.set_weights(weights)
+45
View File
@@ -0,0 +1,45 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from ray.rllib.models.catalog import ModelCatalog
from ray.rllib.utils.process_rollout import compute_advantages
from ray.rllib.utils.tf_policy_graph import TFPolicyGraph
class PGPolicyGraph(TFPolicyGraph):
    """Policy-gradient policy graph built on TFPolicyGraph.

    The loss is the negated mean of `logp(action) * advantage`.
    """

    def __init__(self, obs_space, action_space, registry, config):
        """Build the model, the loss and initialize the TFPolicyGraph base.

        Uses the default TensorFlow session (`tf.get_default_session()`),
        in which all global variables are initialized at the end.
        """
        self.config = config

        # setup policy
        obs_shape = [None] + list(obs_space.shape)
        self.x = tf.placeholder(tf.float32, shape=obs_shape)
        dist_class, self.logit_dim = ModelCatalog.get_action_dist(
            action_space)
        self.model = ModelCatalog.get_model(
            registry, self.x, self.logit_dim, options=self.config["model"])
        logits = self.model.outputs  # logit for each action
        self.dist = dist_class(logits)

        # setup policy loss
        self.ac = ModelCatalog.get_action_placeholder(action_space)
        self.adv = tf.placeholder(tf.float32, [None], name="adv")
        log_prob = self.dist.logp(self.ac)
        self.loss = -tf.reduce_mean(log_prob * self.adv)

        # initialize TFPolicyGraph
        self.sess = tf.get_default_session()
        self.loss_in = [
            ("obs", self.x),
            ("actions", self.ac),
            ("advantages", self.adv),
        ]
        self.is_training = tf.placeholder_with_default(True, ())
        action_sampler = self.dist.sample()
        TFPolicyGraph.__init__(
            self, self.sess,
            obs_input=self.x,
            action_sampler=action_sampler,
            loss=self.loss,
            loss_inputs=self.loss_in,
            is_training=self.is_training)
        self.sess.run(tf.global_variables_initializer())

    def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
        """Add advantages to `sample_batch` (no GAE, last reward of 0.0).

        `other_agent_batches` is accepted but not used here.
        """
        gamma = self.config["gamma"]
        return compute_advantages(sample_batch, 0.0, gamma, use_gae=False)
-82
View File
@@ -1,82 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import ray
from ray.rllib.models.catalog import ModelCatalog
# Policy-gradient policy: builds the TF model, the loss and an Adam update,
# creates its own tf.Session, and exposes gradient and weight accessors.
# NOTE(review): leading indentation is not present in this view, so the exact
# extent of the `with tf.variable_scope("local")` body in __init__ cannot be
# confirmed from here; the code below is left untouched.
class PGPolicy():
# Class-level attributes; presumably read by the sampler -- TODO confirm.
other_output = []
is_recurrent = False
# Stores config and registry, then builds graph, loss, gradients and session.
def __init__(self, registry, ob_space, ac_space, config):
self.config = config
self.registry = registry
with tf.variable_scope("local"):
self._setup_graph(ob_space, ac_space)
print("Setting up loss")
self._setup_loss(ac_space)
self._setup_gradients()
self.initialize()
# Creates the observation placeholder, the model, the action distribution and
# its sample op, and collects the trainable variables of the current scope.
def _setup_graph(self, ob_space, ac_space):
self.x = tf.placeholder(tf.float32, shape=[None]+list(ob_space.shape))
dist_class, self.logit_dim = ModelCatalog.get_action_dist(ac_space)
self.model = ModelCatalog.get_model(
self.registry, self.x, self.logit_dim,
options=self.config["model"])
self.action_logits = self.model.outputs # logit for each action
self.dist = dist_class(self.action_logits)
self.sample = self.dist.sample()
# Trainable variables are filtered by the name of the current variable scope.
self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
tf.get_variable_scope().name)
# Creates the action and advantage placeholders and the scalar loss.
def _setup_loss(self, action_space):
self.ac = ModelCatalog.get_action_placeholder(action_space)
self.adv = tf.placeholder(tf.float32, [None], name="adv")
log_prob = self.dist.logp(self.ac)
# policy loss
self.loss = -tf.reduce_mean(log_prob * self.adv)
# Builds gradients of the loss w.r.t. var_list and the Adam apply op
# (learning rate config["lr"]).
def _setup_gradients(self):
self.grads = tf.gradients(self.loss, self.var_list)
grads_and_vars = list(zip(self.grads, self.var_list))
opt = tf.train.AdamOptimizer(self.config["lr"])
self._apply_gradients = opt.apply_gradients(grads_and_vars)
# Creates a new session, the weight get/set helper, and initializes variables.
def initialize(self):
self.sess = tf.Session()
self.variables = ray.experimental.TensorFlowVariables(
self.loss, self.sess)
self.sess.run(tf.global_variables_initializer())
# Returns (gradient values, empty info dict) for the given samples, read from
# the "obs", "actions" and "advantages" entries.
def compute_gradients(self, samples):
info = {}
feed_dict = {
self.x: samples["obs"],
self.ac: samples["actions"],
self.adv: samples["advantages"],
}
# NOTE(review): this rebinds self.grads to its non-None entries on every call,
# which also changes what apply_gradients zips against.
self.grads = [g for g in self.grads if g is not None]
grad = self.sess.run(self.grads, feed_dict=feed_dict)
return grad, info
# Runs the Adam apply op, feeding `grads` in place of the gradient tensors.
def apply_gradients(self, grads):
feed_dict = dict(zip(self.grads, grads))
self.sess.run(self._apply_gradients, feed_dict=feed_dict)
# Returns the weights via the TensorFlowVariables helper.
def get_weights(self):
return self.variables.get_weights()
# Sets the weights via the TensorFlowVariables helper.
def set_weights(self, weights):
self.variables.set_weights(weights)
# Samples an action for one observation; extra positional args are ignored.
# Returns (action, empty info dict).
def compute(self, ob, *args):
action = self.sess.run(self.sample, {self.x: [ob]})
return action[0], {}
+6 -3
View File
@@ -7,7 +7,7 @@ import tensorflow as tf
from ray.rllib.models import ModelCatalog
class ProximalPolicyLoss(object):
class ProximalPolicyGraph(object):
other_output = ["vf_preds", "logprobs"]
is_recurrent = False
@@ -82,11 +82,14 @@ class ProximalPolicyLoss(object):
self.policy_results = [
self.sampler, self.curr_logits, tf.constant("NA")]
def compute(self, observation):
def compute_single_action(self, observation, features, is_training=False):
action, logprobs, vf = self.sess.run(
self.policy_results,
feed_dict={self.observations: [observation]})
return action[0], {"vf_preds": vf[0], "logprobs": logprobs[0]}
return action[0], [], {"vf_preds": vf[0], "logprobs": logprobs[0]}
def get_initial_state(self):
return []
def loss(self):
return self.loss
+7 -6
View File
@@ -172,7 +172,7 @@ class PPOAgent(Agent):
batch_index = 0
num_batches = (
int(tuples_per_device) // int(model.per_device_batch_size))
loss, policy_loss, vf_loss, kl, entropy = [], [], [], [], []
loss, policy_graph, vf_loss, kl, entropy = [], [], [], [], []
permutation = np.random.permutation(num_batches)
# Prepare to drop into the debugger
if self.iteration == config["tf_debug_iteration"]:
@@ -181,26 +181,26 @@ class PPOAgent(Agent):
full_trace = (
i == 0 and self.iteration == 0 and
batch_index == config["full_trace_nth_sgd_batch"])
batch_loss, batch_policy_loss, batch_vf_loss, batch_kl, \
batch_loss, batch_policy_graph, batch_vf_loss, batch_kl, \
batch_entropy = model.run_sgd_minibatch(
permutation[batch_index] * model.per_device_batch_size,
self.kl_coeff, full_trace,
self.file_writer)
loss.append(batch_loss)
policy_loss.append(batch_policy_loss)
policy_graph.append(batch_policy_graph)
vf_loss.append(batch_vf_loss)
kl.append(batch_kl)
entropy.append(batch_entropy)
batch_index += 1
loss = np.mean(loss)
policy_loss = np.mean(policy_loss)
policy_graph = np.mean(policy_graph)
vf_loss = np.mean(vf_loss)
kl = np.mean(kl)
entropy = np.mean(entropy)
sgd_end = time.time()
print(
"{:>15}{:15.5e}{:15.5e}{:15.5e}{:15.5e}{:15.5e}".format(
i, loss, policy_loss, vf_loss, kl, entropy))
i, loss, policy_graph, vf_loss, kl, entropy))
values = []
if i == config["num_sgd_iter"] - 1:
@@ -299,4 +299,5 @@ class PPOAgent(Agent):
def compute_action(self, observation):
observation = self.local_evaluator.obs_filter(
observation, update=False)
return self.local_evaluator.common_policy.compute(observation)[0]
return self.local_evaluator.common_policy.compute_single_action(
observation, [], False)[0]
+6 -5
View File
@@ -16,8 +16,8 @@ from ray.rllib.optimizers.multi_gpu_impl import LocalSyncParallelOptimizer
from ray.rllib.models import ModelCatalog
from ray.rllib.utils.sampler import SyncSampler
from ray.rllib.utils.filter import get_filter, MeanStdFilter
from ray.rllib.utils.process_rollout import process_rollout
from ray.rllib.ppo.loss import ProximalPolicyLoss
from ray.rllib.utils.process_rollout import compute_advantages
from ray.rllib.ppo.loss import ProximalPolicyGraph
# TODO(rliaw): Move this onto LocalMultiGPUOptimizer
@@ -86,7 +86,7 @@ class PPOEvaluator(PolicyEvaluator):
self.per_device_batch_size = int(self.batch_size / len(devices))
def build_loss(obs, vtargets, advs, acts, plog, pvf_preds):
return ProximalPolicyLoss(
return ProximalPolicyGraph(
self.env.observation_space, self.env.action_space,
obs, vtargets, advs, acts, plog, pvf_preds, self.logit_dim,
self.kl_coeff, self.distribution_class, self.config,
@@ -190,8 +190,9 @@ class PPOEvaluator(PolicyEvaluator):
while num_steps_so_far < self.config["min_steps_per_task"]:
rollout = self.sampler.get_data()
samples = process_rollout(
rollout, self.rew_filter, self.config["gamma"],
last_r = 0.0 # note: not needed since we don't truncate rollouts
samples = compute_advantages(
rollout, last_r, self.config["gamma"],
self.config["lambda"], use_gae=self.config["use_gae"])
num_steps_so_far += samples.count
all_samples.append(samples)
@@ -17,18 +17,19 @@ def get_mean_action(alg, obs):
return np.mean(out)
ray.init()
ray.init(num_cpus=10)
CONFIGS = {
"ES": {"episodes_per_batch": 10, "timesteps_per_batch": 100},
"ES": {"episodes_per_batch": 10, "timesteps_per_batch": 100,
"num_workers": 2},
"DQN": {},
"DDPG": {"noise_scale": 0.0},
"PPO": {"num_sgd_iter": 5, "timesteps_per_batch": 1000},
"A3C": {"use_lstm": False},
"DDPG": {"noise_scale": 0.0, "timesteps_per_iteration": 100},
"PPO": {"num_sgd_iter": 5, "timesteps_per_batch": 1000, "num_workers": 2},
"A3C": {"use_lstm": False, "num_workers": 1},
}
def test(use_object_store, alg_name):
def test(use_object_store, alg_name, failures):
cls = get_agent_class(alg_name)
if alg_name == "DDPG":
alg1 = cls(config=CONFIGS[name], env="Pendulum-v0")
@@ -55,12 +56,15 @@ def test(use_object_store, alg_name):
a1 = get_mean_action(alg1, obs)
a2 = get_mean_action(alg2, obs)
print("Checking computed actions", alg1, obs, a1, a2)
assert abs(a1 - a2) < .1, (a1, a2)
if abs(a1 - a2) > .1:
failures.append((alg_name, [a1, a2]))
if __name__ == "__main__":
failures = []
for use_object_store in [False, True]:
for name in ["ES", "DQN", "DDPG", "PPO", "A3C"]:
test(use_object_store, name)
test(use_object_store, name, failures)
assert not failures, failures
print("All checkpoint restore tests passed!")
@@ -0,0 +1,133 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import gym
import time
import unittest
import ray
from ray.rllib.utils.common_policy_evaluator import CommonPolicyEvaluator
from ray.rllib.utils.policy_graph import PolicyGraph
from ray.rllib.utils.process_rollout import compute_advantages
class MockPolicyGraph(PolicyGraph):
    """Minimal PolicyGraph used by the evaluator tests.

    Always picks action 0 and post-processes trajectories with
    `compute_advantages` (last reward 100.0, gamma 0.9, no GAE).
    """

    def compute_actions(self, obs_batch, state_batches, is_training=False):
        """Return action 0 for every observation, no state and no extras."""
        return [0] * len(obs_batch), [], {}

    def postprocess_trajectory(self, batch, other_agent_batches=None):
        """Attach advantages to `batch`.

        `other_agent_batches` is accepted (and ignored) so the signature
        matches the other policy graphs' `postprocess_trajectory`.
        """
        return compute_advantages(batch, 100.0, 0.9, use_gae=False)
class TestCommonPolicyEvaluator(unittest.TestCase):
    """Tests for CommonPolicyEvaluator sampling modes and filter syncing."""

    def _make_evaluator(self, **kwargs):
        """Create a CartPole evaluator using MockPolicyGraph."""
        return CommonPolicyEvaluator(
            env_creator=lambda _: gym.make("CartPole-v0"),
            policy_graph=MockPolicyGraph,
            **kwargs)

    def testBasic(self):
        ev = self._make_evaluator()
        batch = ev.sample()
        for key in ["obs", "actions", "rewards", "dones", "advantages"]:
            self.assertIn(key, batch)
        self.assertGreater(batch["advantages"][0], 1)

    def testPackEpisodes(self):
        for batch_size in [1, 10, 100, 1000]:
            ev = self._make_evaluator(
                batch_steps=batch_size,
                batch_mode="pack_episodes")
            batch = ev.sample()
            self.assertEqual(batch.count, batch_size)

    def testTruncateEpisodes(self):
        ev = self._make_evaluator(
            batch_steps=2,
            batch_mode="truncate_episodes")
        batch = ev.sample()
        self.assertEqual(batch.count, 2)
        ev = self._make_evaluator(
            batch_steps=1000,
            batch_mode="truncate_episodes")
        # Sample from the second evaluator; without this the assertion
        # below would only re-check the 2-step batch from above.
        batch = ev.sample()
        self.assertLess(batch.count, 200)

    def testCompleteEpisodes(self):
        ev = self._make_evaluator(
            batch_steps=2,
            batch_mode="complete_episodes")
        # Check two consecutive batches.
        for _ in range(2):
            batch = ev.sample()
            self.assertGreater(batch.count, 2)
            self.assertTrue(batch["dones"][-1])

    def testFilterSync(self):
        ev = self._make_evaluator(
            sample_async=True,
            observation_filter="ConcurrentMeanStdFilter")
        self.sample_and_flush(ev)

    def testGetFilters(self):
        ev = self._make_evaluator(
            sample_async=True,
            observation_filter="ConcurrentMeanStdFilter")
        self.sample_and_flush(ev)
        filters = ev.get_filters(flush_after=False)
        time.sleep(2)
        filters2 = ev.get_filters(flush_after=False)
        obs_f = filters["obs_filter"]
        obs_f2 = filters2["obs_filter"]
        self.assertGreaterEqual(obs_f2.rs.n, obs_f.rs.n)
        self.assertGreaterEqual(obs_f2.buffer.n, obs_f.buffer.n)

    def testSyncFilter(self):
        ev = self._make_evaluator(
            sample_async=True,
            observation_filter="ConcurrentMeanStdFilter")
        self.sample_and_flush(ev)

        # Current State
        filters = ev.get_filters(flush_after=False)
        obs_f = filters["obs_filter"]

        self.assertLessEqual(obs_f.buffer.n, 20)

        new_obsf = obs_f.copy()
        new_obsf.rs._n = 100
        ev.sync_filters({"obs_filter": new_obsf})
        filters = ev.get_filters(flush_after=False)
        obs_f = filters["obs_filter"]
        self.assertGreaterEqual(obs_f.rs.n, 100)
        self.assertLessEqual(obs_f.buffer.n, 20)

    def sample_and_flush(self, ev):
        """Sample once, flush the filters and return the observation filter.

        Sleeps for two seconds before sampling, then asserts that both the
        running stats and the buffer of the flushed filter are non-empty.
        """
        time.sleep(2)
        ev.sample()
        filters = ev.get_filters(flush_after=True)
        obs_f = filters["obs_filter"]
        self.assertNotEqual(obs_f.rs.n, 0)
        self.assertNotEqual(obs_f.buffer.n, 0)
        return obs_f
if __name__ == '__main__':
ray.init()
unittest.main(verbosity=2)
+2 -75
View File
@@ -3,19 +3,11 @@ from __future__ import division
from __future__ import print_function
import unittest
import gym
import shutil
import tempfile
import time
import ray
from ray.rllib.a3c import DEFAULT_CONFIG
from ray.rllib.a3c.a3c_evaluator import A3CEvaluator
from ray.rllib.dqn.dqn_evaluator import adjust_nstep
from ray.tune.registry import get_registry
from ray.rllib.dqn.dqn_policy_graph import adjust_nstep
class DQNEvaluatorTest(unittest.TestCase):
class DQNTest(unittest.TestCase):
def testNStep(self):
obs = [1, 2, 3, 4, 5, 6, 7]
actions = ["a", "b", "a", "a", "a", "b", "a"]
@@ -30,70 +22,5 @@ class DQNEvaluatorTest(unittest.TestCase):
self.assertEqual(dones, [1, 0, 0, 0, 0])
class A3CEvaluatorTest(unittest.TestCase):
def setUp(self):
ray.init(num_cpus=1)
config = DEFAULT_CONFIG.copy()
config["num_workers"] = 1
config["observation_filter"] = "ConcurrentMeanStdFilter"
config["reward_filter"] = "MeanStdFilter"
config["batch_size"] = 2
self._temp_dir = tempfile.mkdtemp("a3c_evaluator_test")
self.e = A3CEvaluator(
get_registry(),
lambda config: gym.make("CartPole-v0"),
config,
logdir=self._temp_dir)
def tearDown(self):
ray.worker.cleanup()
shutil.rmtree(self._temp_dir)
def sample_and_flush(self):
e = self.e
time.sleep(2)
self.e.sample()
filters = e.get_filters(flush_after=True)
obs_f = filters["obs_filter"]
rew_f = filters["rew_filter"]
self.assertNotEqual(obs_f.rs.n, 0)
self.assertNotEqual(obs_f.buffer.n, 0)
self.assertNotEqual(rew_f.rs.n, 0)
self.assertNotEqual(rew_f.buffer.n, 0)
return obs_f, rew_f
def testGetFilters(self):
"""Show `flush_after=False` provides does not affect the buffer."""
e = self.e
self.sample_and_flush()
filters = e.get_filters(flush_after=False)
obs_f = filters["obs_filter"]
filters2 = e.get_filters(flush_after=False)
obs_f2 = filters2["obs_filter"]
self.assertGreaterEqual(obs_f2.rs.n, obs_f.rs.n)
self.assertGreaterEqual(obs_f2.buffer.n, obs_f.buffer.n)
def testSyncFilter(self):
"""Show that sync_filters rebases own buffer over input"""
e = self.e
obs_f, _ = self.sample_and_flush()
# Current State
filters = e.get_filters(flush_after=False)
obs_f = filters["obs_filter"]
rew_f = filters["rew_filter"]
self.assertLessEqual(obs_f.buffer.n, 20)
new_obsf = obs_f.copy()
new_obsf.rs._n = 100
e.sync_filters({"obs_filter": new_obsf, "rew_filter": rew_f})
filters = e.get_filters(flush_after=False)
obs_f = filters["obs_filter"]
self.assertGreaterEqual(obs_f.rs.n, 100)
self.assertLessEqual(obs_f.buffer.n, 20)
if __name__ == '__main__':
unittest.main(verbosity=2)
+2 -34
View File
@@ -36,32 +36,6 @@ OBSERVATION_SPACES_TO_TEST = {
Box(0.0, 1.0, (5,), dtype=np.float32)]),
}
# (alg, action_space, obs_space)
KNOWN_FAILURES = [
# TODO(ekl) multiagent support for a3c
("A3C", "implicit_tuple", "atari"),
("A3C", "implicit_tuple", "atari_ram"),
("A3C", "implicit_tuple", "discrete"),
("A3C", "implicit_tuple", "image"),
("A3C", "implicit_tuple", "mixed_tuple"),
("A3C", "implicit_tuple", "simple_tuple"),
("A3C", "implicit_tuple", "vector"),
("A3C", "mixed_tuple", "atari"),
("A3C", "mixed_tuple", "atari_ram"),
("A3C", "mixed_tuple", "discrete"),
("A3C", "mixed_tuple", "image"),
("A3C", "mixed_tuple", "mixed_tuple"),
("A3C", "mixed_tuple", "simple_tuple"),
("A3C", "mixed_tuple", "vector"),
("A3C", "simple_tuple", "atari"),
("A3C", "simple_tuple", "atari_ram"),
("A3C", "simple_tuple", "discrete"),
("A3C", "simple_tuple", "image"),
("A3C", "simple_tuple", "mixed_tuple"),
("A3C", "simple_tuple", "simple_tuple"),
("A3C", "simple_tuple", "vector"),
]
def make_stub_env(action_space, obs_space):
class StubEnv(gym.Env):
@@ -135,19 +109,13 @@ class ModelSupportedSpaces(unittest.TestCase):
{"num_workers": 1, "optimizer": {}},
stats)
num_unexpected_errors = 0
num_unexpected_success = 0
for (alg, a_name, o_name), stat in sorted(stats.items()):
if stat in ["ok", "unsupported"]:
if (alg, a_name, o_name) in KNOWN_FAILURES:
num_unexpected_success += 1
else:
if (alg, a_name, o_name) not in KNOWN_FAILURES:
num_unexpected_errors += 1
if stat not in ["ok", "unsupported"]:
num_unexpected_errors += 1
print(
alg, "action_space", a_name, "obs_space", o_name,
"result", stat)
self.assertEqual(num_unexpected_errors, 0)
self.assertEqual(num_unexpected_success, 0)
if __name__ == "__main__":
@@ -13,7 +13,6 @@ mountaincarcontinuous-ddpg:
tau: 0.01
l2_reg: 0.00001
buffer_size: 50000
random_starts: False
clip_rewards: False
learning_starts: 1000
#model:
@@ -6,6 +6,5 @@ pendulum-ddpg:
episode_reward_mean: -160
config:
use_huber: True
random_starts: False
clip_rewards: False
exploration_fraction: 0.1
@@ -0,0 +1,10 @@
cartpole-a3c:
env: CartPole-v0
run: A3C
stop:
episode_reward_mean: 200
time_total_s: 600
config:
num_workers: 1
gamma: 0.95
use_pytorch: true
@@ -5,5 +5,5 @@ cartpole-a3c:
episode_reward_mean: 200
time_total_s: 600
config:
num_workers: 4
num_workers: 1
gamma: 0.95
@@ -7,4 +7,3 @@ cartpole-dqn:
config:
n_step: 3
gamma: 0.95
smoothing_num_episodes: 10
@@ -0,0 +1,8 @@
cartpole-pg:
env: CartPole-v0
run: PG
stop:
episode_reward_mean: 200
time_total_s: 300
config:
num_workers: 1
@@ -6,7 +6,5 @@ pendulum-ddpg:
time_total_s: 900
config:
use_huber: True
random_starts: False
clip_rewards: False
exploration_fraction: 0.1
smoothing_num_episodes: 10
@@ -0,0 +1,278 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import pickle
import numpy as np
import tensorflow as tf
import ray
from ray.rllib.models import ModelCatalog
from ray.rllib.optimizers.policy_evaluator import PolicyEvaluator
from ray.rllib.utils.atari_wrappers import wrap_deepmind
from ray.rllib.utils.compression import pack
from ray.rllib.utils.filter import get_filter
from ray.rllib.utils.sampler import AsyncSampler, SyncSampler
from ray.rllib.utils.tf_policy_graph import TFPolicyGraph
from ray.tune.registry import get_registry
from ray.tune.result import TrainingResult
def collect_metrics(local_evaluator, remote_evaluators):
    """Gathers episode metrics from CommonPolicyEvaluator instances.

    Arguments:
        local_evaluator: Evaluator in this process; its
            `sampler.get_metrics()` is called directly.
        remote_evaluators (list): Remote evaluator handles; the metrics of
            each are fetched through `apply.remote(...)`.

    Returns:
        TrainingResult with min/max/mean episode reward, mean episode
        length, the number of episodes and the summed episode lengths.
        The reward and length statistics are NaN when no episode metrics
        were returned.
    """
    # Issue all remote calls first and resolve them with one ray.get().
    metric_lists = ray.get(
        [a.apply.remote(lambda ev: ev.sampler.get_metrics())
         for a in remote_evaluators])
    metric_lists.append(local_evaluator.sampler.get_metrics())

    episodes = [ep for metrics in metric_lists for ep in metrics]
    episode_lengths = [ep.episode_length for ep in episodes]
    episode_rewards = [ep.episode_reward for ep in episodes]

    if episodes:
        min_reward = min(episode_rewards)
        max_reward = max(episode_rewards)
        avg_reward = np.mean(episode_rewards)
        avg_length = np.mean(episode_lengths)
    else:
        # np.mean([]) also yields NaN but emits a "Mean of empty slice"
        # RuntimeWarning; set every statistic explicitly instead.
        min_reward = float('nan')
        max_reward = float('nan')
        avg_reward = float('nan')
        avg_length = float('nan')
    timesteps = np.sum(episode_lengths)

    return TrainingResult(
        episode_reward_max=max_reward,
        episode_reward_min=min_reward,
        episode_reward_mean=avg_reward,
        episode_len_mean=avg_length,
        episodes_total=len(episode_lengths),
        timesteps_this_iter=timesteps)
class CommonPolicyEvaluator(PolicyEvaluator):
    """Policy evaluator implementation that operates on a rllib.PolicyGraph.

    The evaluator owns one environment, one policy (stored under the
    "default" key of `policy_map`), an observation filter and a sampler
    that produces experience batches from them.

    TODO: vector env
    TODO: multi-agent
    TODO: consumer buffering for multi-agent
    TODO: complete episode batch mode

    Examples:
        # Create a policy evaluator and use it to collect experiences.
        >>> evaluator = CommonPolicyEvaluator(
              env_creator=lambda _: gym.make("CartPole-v0"),
              policy_graph=PGPolicyGraph)
        >>> print(evaluator.sample().keys())
        {"obs": [[...]], "actions": [[...]], "rewards": [[...]],
         "dones": [[...]], "new_obs": [[...]]}

        # Creating policy evaluators using optimizer_cls.make().
        >>> optimizer = LocalSyncOptimizer.make(
              evaluator_cls=CommonPolicyEvaluator,
              evaluator_args={
                "env_creator": lambda _: gym.make("CartPole-v0"),
                "policy_graph": PGPolicyGraph,
              },
              num_workers=10)
        >>> for _ in range(10): optimizer.step()
    """

    @classmethod
    def as_remote(cls, num_cpus=None, num_gpus=None):
        """Return this class wrapped by ray.remote with the given resources."""
        make_remote = ray.remote(num_cpus=num_cpus, num_gpus=num_gpus)
        return make_remote(cls)

    def __init__(
            self,
            env_creator,
            policy_graph,
            tf_session_creator=None,
            batch_steps=100,
            batch_mode="truncate_episodes",
            preprocessor_pref="rllib",
            sample_async=False,
            compress_observations=False,
            observation_filter="NoFilter",
            registry=None,
            env_config=None,
            model_config=None,
            policy_config=None):
        """Initialize a policy evaluator.

        Arguments:
            env_creator (func): Function that returns a gym.Env given an
                env config dict.
            policy_graph (class): A class implementing rllib.PolicyGraph or
                rllib.TFPolicyGraph.
            tf_session_creator (func): A function that returns a TF session.
                This is optional and only useful with TFPolicyGraph.
            batch_steps (int): The target number of env transitions to
                include in each sample batch returned from this evaluator.
            batch_mode (str): One of the following choices:
                complete_episodes: each batch will be at least batch_steps
                    in size, and will include one or more complete episodes.
                truncate_episodes: each batch will be around batch_steps
                    in size, and include transitions from one episode only.
                pack_episodes: each batch will be exactly batch_steps in
                    size, and may include transitions from multiple
                    episodes.
            preprocessor_pref (str): Whether to prefer RLlib preprocessors
                ("rllib") or deepmind ("deepmind") when applicable.
            sample_async (bool): Whether to compute samples asynchronously
                in the background, which improves throughput but can cause
                samples to be slightly off-policy.
            compress_observations (bool): If true, compress the observations
                returned.
            observation_filter (str): Name of observation filter to use.
            registry (tune.Registry): User-registered objects. Pass in the
                value from tune.registry.get_registry() if you're having
                trouble resolving things like custom envs.
            env_config (dict): Config to pass to the env creator.
            model_config (dict): Config to use when creating the policy
                model.
            policy_config (dict): Config to pass to the policy.
        """
        # Fill in defaults for everything the caller left unset.
        registry = registry or get_registry()
        env_config = env_config or {}
        policy_config = policy_config or {}
        model_config = model_config or {}

        assert batch_mode in [
            "complete_episodes", "truncate_episodes", "pack_episodes"]

        self.env_creator = env_creator
        self.policy_graph = policy_graph
        self.batch_steps = batch_steps
        self.batch_mode = batch_mode
        self.compress_observations = compress_observations

        # Build the env, then wrap it with either the deepmind Atari
        # wrappers or the catalog-provided preprocessor.
        self.env = env_creator(env_config)
        use_deepmind = (
            hasattr(self.env.unwrapped, "ale") and
            "custom_preprocessor" not in model_config and
            preprocessor_pref == "deepmind")
        if use_deepmind:
            self.env = wrap_deepmind(
                self.env, dim=model_config.get("dim", 80))
        else:
            self.env = ModelCatalog.get_preprocessor_as_wrapper(
                registry, self.env, model_config)

        self.vectorized = hasattr(self.env, "vector_reset")
        self.policy_map = {}

        def build_policy():
            return policy_graph(
                self.env.observation_space, self.env.action_space,
                registry, policy_config)

        if not issubclass(policy_graph, TFPolicyGraph):
            policy = build_policy()
        else:
            # TF policies are built inside a fresh graph, with this
            # evaluator's session installed as the default session.
            with tf.Graph().as_default():
                if tf_session_creator:
                    self.sess = tf_session_creator()
                else:
                    gpu_options = tf.GPUOptions(allow_growth=True)
                    self.sess = tf.Session(
                        config=tf.ConfigProto(gpu_options=gpu_options))
                with self.sess.as_default():
                    policy = build_policy()
        self.policy_map = {"default": policy}

        self.obs_filter = get_filter(
            observation_filter, self.env.observation_space.shape)
        self.filters = {"obs_filter": self.obs_filter}

        if self.vectorized:
            raise NotImplementedError("Vector envs not yet supported")
        if batch_mode not in [
                "pack_episodes", "truncate_episodes", "complete_episodes"]:
            raise NotImplementedError("Batch mode not yet supported")

        # Named distinctly from the imported pack() compression helper.
        pack_episodes = batch_mode == "pack_episodes"
        if batch_mode == "complete_episodes":
            # Large step budget so the sampler is bounded by episode end
            # rather than by the step count.
            batch_steps = 999999

        sampler_cls = AsyncSampler if sample_async else SyncSampler
        self.sampler = sampler_cls(
            self.env, self.policy_map["default"], self.obs_filter,
            batch_steps, pack=pack_episodes)
        if sample_async:
            self.sampler.start()

    def sample(self):
        """Evaluate the current policies and return a batch of experiences.

        Return:
            SampleBatch from evaluating the current policies.
        """
        policy = self.policy_map["default"]
        batch = policy.postprocess_trajectory(self.sampler.get_data())

        if self.compress_observations:
            for column in ("obs", "new_obs"):
                batch[column] = [pack(o) for o in batch[column]]
        return batch

    def apply(self, func):
        """Apply the given function to this evaluator instance."""
        return func(self)

    def for_policy(self, func):
        """Apply the given function to this evaluator's default policy."""
        return func(self.policy_map["default"])

    def sync_filters(self, new_filters):
        """Changes self's filter to given and rebases any accumulated delta.

        Args:
            new_filters (dict): Filters with new state to update local copy.
                Must contain an entry for every local filter name.
        """
        assert all(k in new_filters for k in self.filters)
        for name, local_filter in self.filters.items():
            local_filter.sync(new_filters[name])

    def get_filters(self, flush_after=False):
        """Returns a snapshot of filters.

        Args:
            flush_after (bool): If true, clear each filter's buffer after
                its snapshot is taken.

        Returns:
            return_filters (dict): Dict for serializable filters
        """
        snapshot = {}
        for name, local_filter in self.filters.items():
            snapshot[name] = local_filter.as_serializable()
            if flush_after:
                local_filter.clear_buffer()
        return snapshot

    def get_weights(self):
        """Return the weights of the default policy."""
        return self.policy_map["default"].get_weights()

    def set_weights(self, weights):
        """Set the weights of the default policy."""
        return self.policy_map["default"].set_weights(weights)

    def compute_gradients(self, samples):
        """Compute gradients of the default policy on the given samples."""
        return self.policy_map["default"].compute_gradients(samples)

    def apply_gradients(self, grads):
        """Apply the given gradients to the default policy."""
        return self.policy_map["default"].apply_gradients(grads)

    def compute_apply(self, samples):
        """Compute and apply gradients in one call on the default policy.

        Only the gradient-side fetch is returned; the apply-side fetch is
        discarded.
        """
        policy = self.policy_map["default"]
        grad_fetch, _apply_fetch = policy.compute_apply(samples)
        return grad_fetch

    def save(self):
        """Pickle the filter snapshot (flushing buffers) and policy state."""
        filters = self.get_filters(flush_after=True)
        state = self.policy_map["default"].get_state()
        return pickle.dumps({"filters": filters, "state": state})

    def restore(self, objs):
        """Restore filters and policy state from the output of save().

        NOTE: this unpickles `objs`; only pass data from a trusted source.
        """
        restored = pickle.loads(objs)
        self.sync_filters(restored["filters"])
        self.policy_map["default"].set_state(restored["state"])
+132
View File
@@ -0,0 +1,132 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
class PolicyGraph(object):
    """An agent policy and loss, i.e., a TFPolicyGraph or other subclass.

    A policy graph describes how to act in an environment and which loss
    is used to improve the policy from its experiences. Policy and loss
    live in one object for convenience, although the policy itself is
    logically separate.

    Any policy may extend PolicyGraph directly. TensorFlow users may find
    TFPolicyGraph simpler to implement; it also enables RLlib to apply
    TensorFlow-specific optimizations such as fusing multiple policy
    graphs and multi-GPU support.
    """

    def __init__(self, registry, observation_space, action_space, config):
        """Initialize the graph.

        The base implementation does nothing with its arguments.

        NOTE(review): CommonPolicyEvaluator constructs policies
        positionally as (observation_space, action_space, registry,
        policy_config), which differs from the order declared here --
        confirm which order subclasses are expected to use.

        Args:
            registry (obj): Object registry for user-defined envs, models,
                etc.
            observation_space (gym.Space): Observation space of the env.
            action_space (gym.Space): Action space of the env.
            config (dict): Policy-specific configuration data.
        """
        pass

    def compute_actions(self, obs_batch, state_batches, is_training=False):
        """Compute actions for the current policy.

        Subclasses must override this method.

        Arguments:
            obs_batch (np.ndarray): batch of observations
            state_batches (list): list of RNN state input batches, if any
            is_training (bool): whether we are training the policy

        Returns:
            actions (np.ndarray): batch of output actions, with shape like
                [BATCH_SIZE, ACTION_SHAPE].
            state_outs (list): list of RNN state output batches, if any,
                with shape like [STATE_SIZE, BATCH_SIZE].
            info (dict): dictionary of extra feature batches, if any, with
                shape like {"f1": [BATCH_SIZE, ...], "f2": [BATCH_SIZE, ...]}.
        """
        raise NotImplementedError

    def compute_single_action(self, obs, state, is_training=False):
        """Unbatched version of compute_actions.

        Wraps the inputs into batches of size one, delegates to
        compute_actions(), and strips the batch dimension from the result.

        Arguments:
            obs (obj): single observation
            state (list): list of RNN state inputs, if any
            is_training (bool): whether we are training the policy

        Returns:
            actions (obj): single action
            state_outs (list): list of RNN state outputs, if any
            info (dict): dictionary of extra features, if any
        """
        batched_state = [[s] for s in state]
        actions, state_out, info = self.compute_actions(
            [obs], batched_state, is_training)
        # Exactly one action is expected back for the single observation.
        [action] = actions
        unbatched_state = [s[0] for s in state_out]
        unbatched_info = {k: v[0] for k, v in info.items()}
        return action, unbatched_state, unbatched_info

    def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
        """Implements algorithm-specific trajectory postprocessing.

        The base implementation returns the batch unchanged.

        Arguments:
            sample_batch (SampleBatch): batch of experiences for the policy
            other_agent_batches (dict): In a multi-agent env, this contains
                the experience batches seen by other agents.

        Returns:
            SampleBatch: postprocessed sample batch.
        """
        return sample_batch

    def compute_gradients(self, postprocessed_batch):
        """Computes gradients against a batch of experiences.

        Subclasses must override this method.

        Returns:
            grads (list): List of gradient output values
            info (dict): Extra policy-specific values
        """
        raise NotImplementedError

    def apply_gradients(self, gradients):
        """Applies previously computed gradients.

        Subclasses must override this method.

        Returns:
            info (dict): Extra policy-specific values
        """
        raise NotImplementedError

    def get_weights(self):
        """Returns model weights.

        Subclasses must override this method.

        Returns:
            weights (obj): Serializable copy or view of model weights
        """
        raise NotImplementedError

    def set_weights(self, weights):
        """Sets model weights.

        Subclasses must override this method.

        Arguments:
            weights (obj): Serializable copy or view of model weights
        """
        raise NotImplementedError

    def get_initial_state(self):
        """Returns initial RNN state for the current policy.

        The base implementation returns an empty list.
        """
        return []

    def get_state(self):
        """Saves all local state.

        By default the state is whatever get_weights() returns.

        Returns:
            state (obj): Serialized local state.
        """
        return self.get_weights()

    def set_state(self, state):
        """Restores all local state.

        By default the state is passed straight to set_weights().

        Arguments:
            state (obj): Serialized local state.
        """
        self.set_weights(state)
+5 -9
View File
@@ -11,12 +11,12 @@ def discount(x, gamma):
return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
def process_rollout(rollout, reward_filter, gamma, lambda_=1.0, use_gae=True):
def compute_advantages(rollout, last_r, gamma, lambda_=1.0, use_gae=True):
"""Given a rollout, compute its value targets and the advantage.
Args:
rollout (PartialRollout): Partial Rollout Object
reward_filter (Filter): Filter for processing advantanges
last_r (float): Value estimation for last observation
gamma (float): Parameter for GAE
lambda_ (float): Parameter for GAE
use_gae (bool): Using Generalized Advantage Estimation
@@ -32,21 +32,17 @@ def process_rollout(rollout, reward_filter, gamma, lambda_=1.0, use_gae=True):
if use_gae:
assert "vf_preds" in rollout, "Values not found!"
vpred_t = np.stack(rollout["vf_preds"] +
[np.array(rollout.last_r)]).squeeze()
vpred_t = np.concatenate([rollout["vf_preds"], np.array([last_r])])
delta_t = traj["rewards"] + gamma * vpred_t[1:] - vpred_t[:-1]
# This formula for the advantage comes
# "Generalized Advantage Estimation": https://arxiv.org/abs/1506.02438
traj["advantages"] = discount(delta_t, gamma * lambda_)
traj["value_targets"] = traj["advantages"] + traj["vf_preds"]
else:
rewards_plus_v = np.stack(rollout["rewards"] +
[np.array(rollout.last_r)]).squeeze()
rewards_plus_v = np.concatenate(
[rollout["rewards"], np.array([last_r])])
traj["advantages"] = discount(rewards_plus_v, gamma)[:-1]
for i in range(traj["advantages"].shape[0]):
traj["advantages"][i] = reward_filter(traj["advantages"][i])
traj["advantages"] = traj["advantages"].copy()
assert all(val.shape[0] == trajsize for val in traj.values()), \
+39 -106
View File
@@ -2,80 +2,12 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import six.moves.queue as queue
import threading
from collections import namedtuple
import numpy as np
import six.moves.queue as queue
import threading
class PartialRollout(object):
    """A piece of a complete rollout.

    We run our agent, and process its experience once it has processed
    enough steps.

    Attributes:
        fields (list): Names of the columns tracked by this rollout.
        data (dict): Maps each field name to a list of per-step values.
        last_r (float): Value of next state. Used for bootstrapping.
    """

    fields = ["obs", "actions", "rewards", "new_obs", "dones", "features"]

    def __init__(self, extra_fields=None):
        """Initializes internals. Maintains a `last_r` field
        in support of partial rollouts, used in bootstrapping advantage
        estimation.

        Args:
            extra_fields: Optional additional field names to keep track of.
        """
        # Work on a per-instance copy: extending the class-level list in
        # place would leak extra fields into every later instance.
        self.fields = list(self.fields)
        if extra_fields:
            self.fields.extend(extra_fields)
        self.data = {k: [] for k in self.fields}
        self.last_r = 0.0

    def add(self, **kwargs):
        """Append one value per given field; unknown fields raise KeyError."""
        for k, v in kwargs.items():
            self.data[k].append(v)

    def extend(self, other_rollout):
        """Extends internal data structure. Assumes other_rollout contains
        data that occurred afterwards."""
        assert not self.is_terminal()
        assert all(k in other_rollout.fields for k in self.fields)
        for k, v in other_rollout.data.items():
            self.data[k].extend(v)
        self.last_r = other_rollout.last_r

    def is_terminal(self):
        """Check if terminal.

        Returns:
            terminal (bool): if rollout has terminated."""
        return self.data["dones"][-1]

    def __getitem__(self, key):
        return self.data[key]

    def __setitem__(self, key, item):
        self.data[key] = item

    def keys(self):
        return self.data.keys()

    def items(self):
        return self.data.items()

    def __iter__(self):
        return self.data.__iter__()

    def __next__(self):
        # NOTE(review): dict objects do not define __next__, so this call
        # raises AttributeError; iteration works through __iter__ instead.
        return self.data.__next__()

    def __contains__(self, x):
        return x in self.data
from ray.rllib.optimizers.sample_batch import SampleBatchBuilder
CompletedRollout = namedtuple("CompletedRollout",
@@ -92,7 +24,9 @@ class SyncSampler(object):
thread."""
_async = False
def __init__(self, env, policy, obs_filter, num_local_steps, horizon=None):
def __init__(
self, env, policy, obs_filter, num_local_steps, horizon=None,
pack=False):
self.num_local_steps = num_local_steps
self.horizon = horizon
self.env = env
@@ -100,7 +34,7 @@ class SyncSampler(object):
self._obs_filter = obs_filter
self.rollout_provider = _env_runner(self.env, self.policy,
self.num_local_steps, self.horizon,
self._obs_filter)
self._obs_filter, pack)
self.metrics_queue = queue.Queue()
def get_data(self):
@@ -128,7 +62,9 @@ class AsyncSampler(threading.Thread):
accumulate and the gradient can be calculated on up to 5 batches."""
_async = True
def __init__(self, env, policy, obs_filter, num_local_steps, horizon=None):
def __init__(
self, env, policy, obs_filter, num_local_steps, horizon=None,
pack=False):
assert getattr(
obs_filter, "is_concurrent",
False), ("Observation Filter must support concurrent updates.")
@@ -142,6 +78,7 @@ class AsyncSampler(threading.Thread):
self._obs_filter = obs_filter
self.started = False
self.daemon = True
self.pack = pack
def run(self):
self.started = True
@@ -154,7 +91,7 @@ class AsyncSampler(threading.Thread):
def _run(self):
rollout_provider = _env_runner(self.env, self.policy,
self.num_local_steps, self.horizon,
self._obs_filter)
self._obs_filter, self.pack)
while True:
# The timeout variable exists because apparently, if one worker
# dies, the other workers won't die with it, unless the timeout is
@@ -169,18 +106,18 @@ class AsyncSampler(threading.Thread):
"""Gets currently accumulated data.
Returns:
rollout (PartialRollout): trajectory data (unprocessed)
rollout (SampleBatch): trajectory data (unprocessed)
"""
assert self.started, "Sampler never started running!"
rollout = self.queue.get(timeout=600.0)
if isinstance(rollout, BaseException):
raise rollout
while not rollout.is_terminal():
while not rollout["dones"][-1]:
try:
part = self.queue.get_nowait()
if isinstance(part, BaseException):
raise rollout
rollout.extend(part)
rollout = rollout.concat(part)
except queue.Empty:
break
return rollout
@@ -195,7 +132,7 @@ class AsyncSampler(threading.Thread):
return completed
def _env_runner(env, policy, num_local_steps, horizon, obs_filter):
def _env_runner(env, policy, num_local_steps, horizon, obs_filter, pack):
"""This implements the logic of the thread runner.
It continually runs the policy, and as long as the rollout exceeds a
@@ -206,12 +143,16 @@ def _env_runner(env, policy, num_local_steps, horizon, obs_filter):
Args:
env: Environment generated by env_creator
policy: Policy used to interact with environment. Also sets fields
to be included in `PartialRollout`
num_local_steps: Number of steps before `PartialRollout` is yielded.
to be included in `SampleBatch`
num_local_steps: Number of steps before `SampleBatch` is yielded. Set
to infinity to yield complete episodes.
horizon: Horizon of the episode.
obs_filter: Filter used to process observations.
pack: Whether to pack multiple episodes into each batch. This
guarantees batches will be exactly `num_local_steps` in size.
Yields:
rollout (PartialRollout): Object containing state, action, reward,
rollout (SampleBatch): Object containing state, action, reward,
terminal condition, and other fields as dictated by `policy`.
"""
last_observation = obs_filter(env.reset())
@@ -221,24 +162,23 @@ def _env_runner(env, policy, num_local_steps, horizon, obs_filter):
print("Warning, no horizon specified, assuming infinite")
if not horizon:
horizon = 999999
if hasattr(policy, "get_initial_features"):
last_features = policy.get_initial_features()
else:
last_features = []
last_features = policy.get_initial_state()
features = last_features
length = 0
rewards = 0
rollout_number = 0
while True:
terminal_end = False
rollout = PartialRollout(extra_fields=policy.other_output)
batch_builder = SampleBatchBuilder()
for _ in range(num_local_steps):
action, pi_info = policy.compute(last_observation, *last_features)
if policy.is_recurrent:
features = pi_info["features"]
del pi_info["features"]
# Assume batch size one for now
action, features, pi_info = policy.compute_single_action(
last_observation, last_features, is_training=True)
for i, state_value in enumerate(last_features):
pi_info["state_in_{}".format(i)] = state_value
for i, state_value in enumerate(features):
pi_info["state_out_{}".format(i)] = state_value
observation, reward, terminal, info = env.step(action)
observation = obs_filter(observation)
@@ -252,12 +192,11 @@ def _env_runner(env, policy, num_local_steps, horizon, obs_filter):
action = np.concatenate(action, axis=0).flatten()
# Collect the experience.
rollout.add(
batch_builder.add_values(
obs=last_observation,
actions=action,
rewards=reward,
dones=terminal,
features=last_features,
new_obs=observation,
**pi_info)
@@ -265,24 +204,18 @@ def _env_runner(env, policy, num_local_steps, horizon, obs_filter):
last_features = features
if terminal:
terminal_end = True
yield CompletedRollout(length, rewards)
if (length >= horizon
or not env.metadata.get("semantics.autoreset")):
if (length >= horizon or
not env.metadata.get("semantics.autoreset")):
last_observation = obs_filter(env.reset())
if hasattr(policy, "get_initial_features"):
last_features = policy.get_initial_features()
else:
last_features = []
last_features = policy.get_initial_state()
rollout_number += 1
length = 0
rewards = 0
break
if not terminal_end:
rollout.last_r = policy.value(last_observation, *last_features)
if not pack:
break
# Once we have enough experience, yield it, and have the ThreadRunner
# place it on a queue.
yield rollout
yield batch_builder.build()
+152
View File
@@ -0,0 +1,152 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
import ray
from ray.rllib.utils.policy_graph import PolicyGraph
class TFPolicyGraph(PolicyGraph):
    """An agent policy and loss implemented in TensorFlow.

    Extending this class enables RLlib to perform TensorFlow specific
    optimizations on the policy graph, e.g., parallelization across gpus or
    fusing multiple graphs together in the multi-agent setting.

    All input and output tensors are of shape [BATCH_DIM, ...].

    Subclasses can customize behavior by overriding the `extra_*` hooks,
    `optimizer()` and `gradients()`.

    Examples:
        >>> policy = TFPolicyGraphSubclass(
            sess, obs_input, action_sampler, loss, loss_inputs, is_training)

        >>> print(policy.compute_actions([1, 0, 2]))
        (array([0, 1, 1]), [], {})

        >>> print(policy.postprocess_trajectory(SampleBatch({...})))
        SampleBatch({"action": ..., "advantages": ..., ...})
    """

    def __init__(
            self, sess, obs_input, action_sampler, loss, loss_inputs,
            is_training, state_inputs=None, state_outputs=None):
        """Initialize the policy.

        Arguments:
            sess (Session): TensorFlow session used for every run call.
            obs_input (Tensor): input placeholder for observations.
            action_sampler (Tensor): Tensor for sampling an action.
            loss (Tensor): scalar policy loss output tensor.
            loss_inputs (list): a (name, placeholder) tuple for each loss
                input argument. Each placeholder name must correspond to a
                SampleBatch column key returned by postprocess_trajectory().
            is_training (Tensor): input placeholder for whether we are
                currently training the policy.
            state_inputs (list): list of RNN state input placeholders; they
                are fed from `state_batches` in compute_actions().
            state_outputs (list): list of RNN state output Tensors; they
                are fetched alongside the sampled action.
        """
        self._sess = sess
        self._obs_input = obs_input
        self._sampler = action_sampler
        self._loss = loss
        self._loss_inputs = loss_inputs
        self._is_training = is_training
        self._state_inputs = state_inputs or []
        self._state_outputs = state_outputs or []
        self._optimizer = self.optimizer()
        # compute_gradients() may pair a variable with a None gradient when
        # the loss does not depend on it. None can be neither fetched nor
        # used as a feed key, so such pairs are dropped up front.
        self._grads_and_vars = [
            (g, v) for (g, v) in self.gradients(self._optimizer)
            if g is not None]
        self._grads = [g for (g, v) in self._grads_and_vars]
        self._apply_op = self._optimizer.apply_gradients(self._grads_and_vars)
        self._variables = ray.experimental.TensorFlowVariables(
            self._loss, self._sess)

        assert len(self._state_inputs) == len(self._state_outputs) == \
            len(self.get_initial_state())

    def compute_actions(
            self, obs_batch, state_batches=None, is_training=False):
        """Sample actions for a batch of observations.

        Returns:
            Tuple of (actions, list of state outputs, extra fetches from
            extra_compute_action_fetches()).
        """
        state_batches = state_batches or []
        assert len(self._state_inputs) == len(state_batches), \
            (self._state_inputs, state_batches)
        feed_dict = self.extra_compute_action_feed_dict()
        feed_dict[self._obs_input] = obs_batch
        feed_dict[self._is_training] = is_training
        for ph, value in zip(self._state_inputs, state_batches):
            feed_dict[ph] = value
        # Fetch layout: [action, state_out_0..state_out_k, extra_fetches].
        fetches = self._sess.run(
            ([self._sampler] + self._state_outputs +
             [self.extra_compute_action_fetches()]), feed_dict=feed_dict)
        return fetches[0], fetches[1:-1], fetches[-1]

    def _get_loss_inputs_dict(self, postprocessed_batch):
        """Build the feed dict mapping loss placeholders to batch columns."""
        feed_dict = {}
        for key, ph in self._loss_inputs:
            # TODO(ekl) fix up handling of RNN inputs so that we can batch
            # across multiple rollouts
            if key.startswith("state_in_"):
                feed_dict[ph] = postprocessed_batch[key][:1]  # in state only
            else:
                feed_dict[ph] = postprocessed_batch[key]
        return feed_dict

    def compute_gradients(self, postprocessed_batch):
        """Compute gradients of the loss on the given batch.

        Returns:
            Tuple of (gradient values, extra fetches from
            extra_compute_grad_fetches()).
        """
        feed_dict = self.extra_compute_grad_feed_dict()
        feed_dict[self._is_training] = True
        feed_dict.update(self._get_loss_inputs_dict(postprocessed_batch))
        fetches = self._sess.run(
            [self._grads, self.extra_compute_grad_fetches()],
            feed_dict=feed_dict)
        return fetches[0], fetches[1]

    def apply_gradients(self, gradients):
        """Apply externally computed gradient values.

        Arguments:
            gradients (list): one value per gradient tensor, in the order
                returned by compute_gradients().

        Returns:
            Extra fetches from extra_apply_grad_fetches().
        """
        assert len(gradients) == len(self._grads), (gradients, self._grads)
        feed_dict = self.extra_apply_grad_feed_dict()
        feed_dict[self._is_training] = True
        # Override the gradient tensors with the supplied values.
        for ph, value in zip(self._grads, gradients):
            feed_dict[ph] = value
        # The session is stored as `_sess`; there is no `sess` attribute.
        fetches = self._sess.run(
            [self._apply_op, self.extra_apply_grad_fetches()],
            feed_dict=feed_dict)
        return fetches[1]

    def compute_apply(self, postprocessed_batch):
        """Compute and apply gradients in a single session run.

        Returns:
            Tuple of (extra compute-grad fetches, extra apply-grad fetches).
        """
        feed_dict = self.extra_compute_grad_feed_dict()
        feed_dict.update(self.extra_apply_grad_feed_dict())
        feed_dict.update(self._get_loss_inputs_dict(postprocessed_batch))
        feed_dict[self._is_training] = True
        fetches = self._sess.run(
            [self._apply_op, self.extra_compute_grad_fetches(),
             self.extra_apply_grad_fetches()],
            feed_dict=feed_dict)
        return fetches[1], fetches[2]

    def get_weights(self):
        """Return the model variables as a flat value."""
        return self._variables.get_flat()

    def set_weights(self, weights):
        """Set the model variables from a flat value."""
        return self._variables.set_flat(weights)

    def extra_compute_action_feed_dict(self):
        """Extra feed dict entries for compute_actions()."""
        return {}

    def extra_compute_action_fetches(self):
        """Extra fetches for compute_actions()."""
        return {}  # e.g, value function

    def extra_compute_grad_feed_dict(self):
        """Extra feed dict entries for gradient computation."""
        return {}  # e.g, kl_coeff

    def extra_compute_grad_fetches(self):
        """Extra fetches for gradient computation."""
        return {}  # e.g, td error

    def extra_apply_grad_feed_dict(self):
        """Extra feed dict entries for gradient application."""
        return {}

    def extra_apply_grad_fetches(self):
        """Extra fetches for gradient application."""
        return {}  # e.g., batch norm updates

    def optimizer(self):
        """Return the TF optimizer to use (Adam with default settings)."""
        return tf.train.AdamOptimizer()

    def gradients(self, optimizer):
        """Return (gradient, variable) pairs of the loss for `optimizer`."""
        return optimizer.compute_gradients(self._loss)
+6
View File
@@ -31,6 +31,12 @@ TrainingResult = namedtuple(
# (Optional) The mean episode reward if applicable.
"episode_reward_mean",
# (Optional) The min episode reward if applicable.
"episode_reward_min",
# (Optional) The max episode reward if applicable.
"episode_reward_max",
# (Optional) The mean episode length if applicable.
"episode_len_mean",