mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 15:22:56 +08:00
[rllib] Refactor rllib to have a common sample collection pathway (#2149)
This commit is contained in:
@@ -6,6 +6,11 @@ from __future__ import print_function
|
||||
# This file is imported from the tune module in order to register RLlib agents.
|
||||
from ray.tune.registry import register_trainable
|
||||
|
||||
from ray.rllib.utils.policy_graph import PolicyGraph
|
||||
from ray.rllib.utils.tf_policy_graph import TFPolicyGraph
|
||||
from ray.rllib.utils.common_policy_evaluator import CommonPolicyEvaluator
|
||||
from ray.rllib.optimizers.sample_batch import SampleBatch
|
||||
|
||||
|
||||
def _register_all():
|
||||
for key in ["PPO", "ES", "DQN", "APEX", "A3C", "BC", "PG", "DDPG",
|
||||
@@ -16,3 +21,7 @@ def _register_all():
|
||||
|
||||
|
||||
_register_all()
|
||||
|
||||
__all__ = [
|
||||
"PolicyGraph", "TFPolicyGraph", "CommonPolicyEvaluator", "SampleBatch"
|
||||
]
|
||||
|
||||
+49
-53
@@ -2,7 +2,6 @@ from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
import pickle
|
||||
import os
|
||||
|
||||
@@ -10,14 +9,14 @@ import ray
|
||||
from ray.rllib.agent import Agent
|
||||
from ray.rllib.optimizers import AsyncOptimizer
|
||||
from ray.rllib.utils import FilterManager
|
||||
from ray.rllib.a3c.a3c_evaluator import A3CEvaluator, RemoteA3CEvaluator, \
|
||||
GPURemoteA3CEvaluator
|
||||
from ray.tune.result import TrainingResult
|
||||
from ray.rllib.utils.common_policy_evaluator import CommonPolicyEvaluator, \
|
||||
collect_metrics
|
||||
from ray.rllib.a3c.common import get_policy_cls
|
||||
from ray.tune.trial import Resources
|
||||
|
||||
DEFAULT_CONFIG = {
|
||||
# Number of workers (excluding master)
|
||||
"num_workers": 4,
|
||||
"num_workers": 2,
|
||||
# Size of rollout batch
|
||||
"batch_size": 10,
|
||||
# Use LSTM model - only applicable for image states
|
||||
@@ -42,6 +41,8 @@ DEFAULT_CONFIG = {
|
||||
"entropy_coeff": -0.01,
|
||||
# Whether to place workers on GPUs
|
||||
"use_gpu_for_workers": False,
|
||||
# Whether to emit extra summary stats
|
||||
"summarize": False,
|
||||
# Model and preprocessor options
|
||||
"model": {
|
||||
# (Image statespace) - Converts image to Channels = 1
|
||||
@@ -78,56 +79,48 @@ class A3CAgent(Agent):
|
||||
extra_gpu=cf["use_gpu_for_workers"] and cf["num_workers"] or 0)
|
||||
|
||||
def _init(self):
|
||||
self.local_evaluator = A3CEvaluator(
|
||||
self.registry,
|
||||
self.env_creator,
|
||||
self.config,
|
||||
self.logdir,
|
||||
start_sampler=False)
|
||||
if self.config["use_gpu_for_workers"]:
|
||||
remote_cls = GPURemoteA3CEvaluator
|
||||
self.policy_cls = get_policy_cls(self.config)
|
||||
|
||||
if self.config["use_pytorch"]:
|
||||
session_creator = None
|
||||
else:
|
||||
remote_cls = RemoteA3CEvaluator
|
||||
import tensorflow as tf
|
||||
|
||||
def session_creator():
|
||||
return tf.Session(
|
||||
config=tf.ConfigProto(
|
||||
intra_op_parallelism_threads=1,
|
||||
inter_op_parallelism_threads=1,
|
||||
gpu_options=tf.GPUOptions(allow_growth=True)))
|
||||
|
||||
remote_cls = CommonPolicyEvaluator.as_remote(
|
||||
num_gpus=1 if self.config["use_gpu_for_workers"] else 0)
|
||||
self.local_evaluator = CommonPolicyEvaluator(
|
||||
self.env_creator, self.policy_cls,
|
||||
batch_steps=self.config["batch_size"],
|
||||
batch_mode="truncate_episodes",
|
||||
tf_session_creator=session_creator,
|
||||
registry=self.registry, env_config=self.config["env_config"],
|
||||
model_config=self.config["model"], policy_config=self.config)
|
||||
self.remote_evaluators = [
|
||||
remote_cls.remote(self.registry, self.env_creator, self.config,
|
||||
self.logdir)
|
||||
for i in range(self.config["num_workers"])
|
||||
]
|
||||
self.optimizer = AsyncOptimizer(self.config["optimizer"],
|
||||
self.local_evaluator,
|
||||
self.remote_evaluators)
|
||||
remote_cls.remote(
|
||||
self.env_creator, self.policy_cls,
|
||||
batch_steps=self.config["batch_size"],
|
||||
batch_mode="truncate_episodes", sample_async=True,
|
||||
tf_session_creator=session_creator,
|
||||
registry=self.registry, env_config=self.config["env_config"],
|
||||
model_config=self.config["model"], policy_config=self.config)
|
||||
for i in range(self.config["num_workers"])]
|
||||
|
||||
self.optimizer = AsyncOptimizer(
|
||||
self.config["optimizer"], self.local_evaluator,
|
||||
self.remote_evaluators)
|
||||
|
||||
def _train(self):
|
||||
self.optimizer.step()
|
||||
FilterManager.synchronize(self.local_evaluator.filters,
|
||||
self.remote_evaluators)
|
||||
res = self._fetch_metrics_from_remote_evaluators()
|
||||
return res
|
||||
|
||||
def _fetch_metrics_from_remote_evaluators(self):
|
||||
episode_rewards = []
|
||||
episode_lengths = []
|
||||
metric_lists = [
|
||||
a.get_completed_rollout_metrics.remote()
|
||||
for a in self.remote_evaluators
|
||||
]
|
||||
for metrics in metric_lists:
|
||||
for episode in ray.get(metrics):
|
||||
episode_lengths.append(episode.episode_length)
|
||||
episode_rewards.append(episode.episode_reward)
|
||||
avg_reward = (np.mean(episode_rewards)
|
||||
if episode_rewards else float('nan'))
|
||||
avg_length = (np.mean(episode_lengths)
|
||||
if episode_lengths else float('nan'))
|
||||
timesteps = np.sum(episode_lengths) if episode_lengths else 0
|
||||
|
||||
result = TrainingResult(
|
||||
episode_reward_mean=avg_reward,
|
||||
episode_len_mean=avg_length,
|
||||
timesteps_this_iter=timesteps,
|
||||
info={})
|
||||
|
||||
return result
|
||||
FilterManager.synchronize(
|
||||
self.local_evaluator.filters, self.remote_evaluators)
|
||||
return collect_metrics(self.local_evaluator, self.remote_evaluators)
|
||||
|
||||
def _stop(self):
|
||||
# workaround for https://github.com/ray-project/ray/issues/1516
|
||||
@@ -154,7 +147,10 @@ class A3CAgent(Agent):
|
||||
])
|
||||
self.local_evaluator.restore(extra_data["local_state"])
|
||||
|
||||
def compute_action(self, observation):
|
||||
def compute_action(self, observation, state=None):
|
||||
if state is None:
|
||||
state = []
|
||||
obs = self.local_evaluator.obs_filter(observation, update=False)
|
||||
action, info = self.local_evaluator.policy.compute(obs)
|
||||
return action
|
||||
return self.local_evaluator.for_policy(
|
||||
lambda p: p.compute_single_action(
|
||||
obs, state, is_training=False)[0])
|
||||
|
||||
@@ -1,119 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import pickle
|
||||
|
||||
import ray
|
||||
from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.optimizers import PolicyEvaluator
|
||||
from ray.rllib.a3c.common import get_policy_cls
|
||||
from ray.rllib.utils.filter import get_filter
|
||||
from ray.rllib.utils.sampler import AsyncSampler
|
||||
from ray.rllib.utils.process_rollout import process_rollout
|
||||
|
||||
|
||||
class A3CEvaluator(PolicyEvaluator):
    """Evaluator that samples from an environment and computes gradients.

    Attributes:
        env: Environment wrapped by the ModelCatalog preprocessor wrapper.
        policy: Policy instance used by the sampler and for gradients.
        config (dict): Configuration this evaluator was built with.
        obs_filter: Observation filter used in environment sampling.
        rew_filter: Reward filter used in rollout post-processing.
        filters (dict): Both filters, keyed "obs_filter" / "rew_filter".
        sampler: Component for interacting with the environment and
            generating rollouts.
        logdir: Directory for logging.
    """

    def __init__(
            self, registry, env_creator, config, logdir, start_sampler=True):
        """Build the environment, policy, filters and sampler.

        Args:
            registry: Passed through to the model catalog and the policy.
            env_creator: Callable invoked with ``config["env_config"]``.
            config (dict): Evaluator configuration.
            logdir: Directory for logging.
            start_sampler (bool): Start the sampler right away when it is
                asynchronous.
        """
        wrapped_env = ModelCatalog.get_preprocessor_as_wrapper(
            registry, env_creator(config["env_config"]), config["model"])
        self.env = wrapped_env
        policy_cls = get_policy_cls(config)
        # TODO(rliaw): should change this to be just env.observation_space
        self.policy = policy_cls(
            registry, wrapped_env.observation_space.shape,
            wrapped_env.action_space, config)
        self.config = config

        # Technically not needed when not remote
        self.obs_filter = get_filter(
            config["observation_filter"], wrapped_env.observation_space.shape)
        self.rew_filter = get_filter(config["reward_filter"], ())
        self.filters = {
            "obs_filter": self.obs_filter,
            "rew_filter": self.rew_filter,
        }
        self.sampler = AsyncSampler(
            wrapped_env, self.policy, self.obs_filter, config["batch_size"])
        if start_sampler and self.sampler._async:
            self.sampler.start()
        self.logdir = logdir

    def sample(self):
        """Fetch one rollout from the sampler and post-process it."""
        return process_rollout(
            self.sampler.get_data(), self.rew_filter,
            gamma=self.config["gamma"], lambda_=self.config["lambda"],
            use_gae=True)

    def get_completed_rollout_metrics(self):
        """Returns metrics on previously completed rollouts.

        Calling this clears the queue of completed rollout metrics.
        """
        return self.sampler.get_metrics()

    def compute_gradients(self, samples):
        """Return the policy gradient for ``samples`` and an empty info dict.

        The info returned by the policy is intentionally not forwarded.
        """
        gradient, _ = self.policy.compute_gradients(samples)
        return gradient, {}

    def apply_gradients(self, grads):
        """Forward ``grads`` to the policy."""
        self.policy.apply_gradients(grads)

    def get_weights(self):
        """Return the policy weights."""
        return self.policy.get_weights()

    def set_weights(self, params):
        """Load ``params`` into the policy."""
        self.policy.set_weights(params)

    def save(self):
        """Pickle the filter snapshot (flushing buffers) and the weights."""
        return pickle.dumps({
            "filters": self.get_filters(flush_after=True),
            "weights": self.get_weights(),
        })

    def restore(self, objs):
        """Restore filters and weights from a blob produced by ``save``."""
        # NOTE: pickle.loads must only be given trusted data.
        state = pickle.loads(objs)
        self.sync_filters(state["filters"])
        self.set_weights(state["weights"])

    def sync_filters(self, new_filters):
        """Changes self's filter to given and rebases any accumulated delta.

        Args:
            new_filters (dict): Filters with new state to update local copy.
        """
        assert all(k in new_filters for k in self.filters)
        for key, current in self.filters.items():
            current.sync(new_filters[key])

    def get_filters(self, flush_after=False):
        """Returns a snapshot of filters.

        Args:
            flush_after (bool): Clears the filter buffer state.

        Returns:
            return_filters (dict): Dict for serializable filters
        """
        snapshot = {}
        for key, current in self.filters.items():
            # Snapshot first, then (optionally) clear the buffer.
            snapshot[key] = current.as_serializable()
            if flush_after:
                current.clear_buffer()
        return snapshot
|
||||
|
||||
|
||||
# Ray remote (actor) version of A3CEvaluator using default resource requests.
RemoteA3CEvaluator = ray.remote(A3CEvaluator)
|
||||
# Ray remote (actor) version of A3CEvaluator that requests one GPU per actor.
GPURemoteA3CEvaluator = ray.remote(num_gpus=1)(A3CEvaluator)
|
||||
@@ -0,0 +1,103 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import tensorflow as tf
|
||||
import gym
|
||||
|
||||
from ray.rllib.utils.error import UnsupportedSpaceException
|
||||
from ray.rllib.utils.process_rollout import compute_advantages
|
||||
from ray.rllib.utils.tf_policy_graph import TFPolicyGraph
|
||||
|
||||
|
||||
class A3CTFPolicyGraph(TFPolicyGraph):
    """Base TensorFlow policy graph for A3C.

    Subclasses implement ``_setup_graph`` to define at least ``x``,
    ``logits``, ``vf`` and ``var_list`` (checked in ``__init__``). The
    constructor also reads ``action_dist``, ``loss_in``, ``state_in`` and
    ``state_out``, none of which are assigned in this class, so subclasses
    must provide them as well.
    """

    def __init__(self, ob_space, action_space, registry, config):
        """Build the graph and loss, then initialise the TF policy graph.

        Args:
            ob_space: Observation space handed to ``_setup_graph``.
            action_space: Action space; a gym Box or Discrete space.
            registry: Stored on the instance for use by subclasses.
            config (dict): Policy configuration; reads "summarize" here.
        """
        self.registry = registry
        self.local_steps = 0
        self.config = config
        self.summarize = config.get("summarize")

        self._setup_graph(ob_space, action_space)
        assert all(hasattr(self, attr)
                   for attr in ["vf", "logits", "x", "var_list"])
        print("Setting up loss")
        self.setup_loss(action_space)
        self.is_training = tf.placeholder_with_default(True, ())
        # Uses whatever session is the default when the policy is built.
        self.sess = tf.get_default_session()

        TFPolicyGraph.__init__(
            self, self.sess, obs_input=self.x,
            action_sampler=self.action_dist.sample(), loss=self.loss,
            loss_inputs=self.loss_in, is_training=self.is_training,
            state_inputs=self.state_in, state_outputs=self.state_out)

        self.sess.run(tf.global_variables_initializer())

        if self.summarize:
            bs = tf.to_float(tf.shape(self.x)[0])
            # Tag is "policy_loss": this scalar is the per-sample policy
            # loss (it was previously mislabelled "model/policy_graph").
            tf.summary.scalar("model/policy_loss", self.pi_loss / bs)
            tf.summary.scalar("model/value_loss", self.vf_loss / bs)
            tf.summary.scalar("model/entropy", self.entropy / bs)
            # NOTE(review): self._grads is not assigned in this class;
            # presumably set by TFPolicyGraph.__init__ -- confirm.
            tf.summary.scalar("model/grad_gnorm", tf.global_norm(self._grads))
            tf.summary.scalar("model/var_gnorm", tf.global_norm(self.var_list))
            self.summary_op = tf.summary.merge_all()

    def _setup_graph(self, ob_space, ac_space):
        """Define the model tensors; must be overridden by subclasses."""
        raise NotImplementedError

    def setup_loss(self, action_space):
        """Create the loss placeholders and the combined A3C loss.

        Raises:
            UnsupportedSpaceException: If ``action_space`` is neither a gym
                Box nor a gym Discrete space.
        """
        if isinstance(action_space, gym.spaces.Box):
            ac_size = action_space.shape[0]
            self.ac = tf.placeholder(tf.float32, [None, ac_size], name="ac")
        elif isinstance(action_space, gym.spaces.Discrete):
            self.ac = tf.placeholder(tf.int64, [None], name="ac")
        else:
            raise UnsupportedSpaceException(
                "Action space {} is not supported for A3C.".format(
                    action_space))
        self.adv = tf.placeholder(tf.float32, [None], name="adv")
        self.r = tf.placeholder(tf.float32, [None], name="r")

        log_prob = self.action_dist.logp(self.ac)

        # The "policy gradients" loss: its derivative is precisely the policy
        # gradient. Notice that self.ac is a placeholder that is provided
        # externally. adv will contain the advantages, as calculated in
        # compute_advantages.
        self.pi_loss = - tf.reduce_sum(log_prob * self.adv)

        delta = self.vf - self.r
        self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta))
        self.entropy = tf.reduce_sum(self.action_dist.entropy())
        self.loss = (self.pi_loss +
                     self.vf_loss * self.config["vf_loss_coeff"] +
                     self.entropy * self.config["entropy_coeff"])

    def optimizer(self):
        """Return an Adam optimizer using the configured learning rate."""
        return tf.train.AdamOptimizer(self.config["lr"])

    def gradients(self, optimizer):
        """Return (gradient, variable) pairs clipped by global norm.

        The ``optimizer`` argument is not used; gradients come straight
        from ``tf.gradients`` on the loss.
        """
        grads = tf.gradients(self.loss, self.var_list)
        self.grads, _ = tf.clip_by_global_norm(grads, self.config["grad_clip"])
        clipped_grads = list(zip(self.grads, self.var_list))
        return clipped_grads

    def extra_compute_grad_fetches(self):
        """Return the summary op as an extra fetch when summarising."""
        if self.summarize:
            return {"summary": self.summary_op}
        else:
            return {}

    def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
        """Compute advantages, bootstrapping from the value function.

        When the last step of the batch is not terminal, the value of the
        final ``new_obs`` (with the final recurrent state outputs, if any)
        is used as the bootstrap reward; otherwise 0.0 is used.
        """
        completed = sample_batch["dones"][-1]
        if completed:
            last_r = 0.0
        else:
            # One single-element list per recurrent state input.
            next_state = [
                [sample_batch["state_out_{}".format(i)][-1]]
                for i in range(len(self.state_in))
            ]
            last_r = self.value(sample_batch["new_obs"][-1], *next_state)
        return compute_advantages(
            sample_batch, last_r, self.config["gamma"], self.config["lambda"])
|
||||
@@ -0,0 +1,113 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from threading import Lock
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
from ray.rllib.models.pytorch.misc import var_to_np, convert_batch
|
||||
from ray.rllib.models.catalog import ModelCatalog
|
||||
from ray.rllib.utils.process_rollout import compute_advantages
|
||||
from ray.rllib.utils.policy_graph import PolicyGraph
|
||||
|
||||
|
||||
class SharedTorchPolicy(PolicyGraph):
    """A simple, non-recurrent PyTorch policy example.

    ``self.lock`` guards the methods that run or mutate the model
    (action computation, gradient computation/application, weight loading
    and value estimation).
    """

    def __init__(self, obs_space, action_space, registry, config):
        """Build the model and optimizer.

        Args:
            obs_space: Observation space; its ``shape`` sizes the model.
            action_space: Action space used to pick the logit dimension.
            registry: Passed to the model catalog.
            config (dict): Policy configuration ("model", "lr", ...).
        """
        self.registry = registry
        self.local_steps = 0
        self.config = config
        self.summarize = config.get("summarize")
        self.setup_graph(obs_space, action_space)
        torch.set_num_threads(2)
        self.lock = Lock()

    def setup_graph(self, obs_space, action_space):
        """Create the torch model and its Adam optimizer."""
        _, self.logit_dim = ModelCatalog.get_action_dist(action_space)
        self._model = ModelCatalog.get_torch_model(
            self.registry, obs_space.shape, self.logit_dim,
            self.config["model"])
        self.optimizer = torch.optim.Adam(
            self._model.parameters(), lr=self.config["lr"])

    def compute_single_action(self, obs, state, is_training=False):
        """Sample one action for a single observation.

        Returns:
            Tuple of (action, [], {"vf_preds": value estimate}).

        Raises:
            AssertionError: If a non-empty recurrent ``state`` is passed.
        """
        assert not state, "RNN not supported"
        with self.lock:
            ob = torch.from_numpy(obs).float().unsqueeze(0)
            logits, values = self._model(ob)
            samples = F.softmax(logits, dim=1).multinomial(1).squeeze()
            values = values.squeeze()
            return var_to_np(samples), [], {"vf_preds": var_to_np(values)}

    def compute_gradients(self, samples):
        """Run a backward pass and return the gradients as numpy arrays.

        Returns:
            Tuple of (list of per-parameter gradient arrays, {}).
        """
        with self.lock:
            self.backward(samples)
            # Tensor.numpy() shares memory with the gradient tensor, so a
            # later zero_grad() (e.g. in apply_gradients on this same
            # policy) would silently zero the returned arrays. Return
            # copies so callers own their data.
            return [p.grad.data.numpy().copy()
                    for p in self._model.parameters()], {}

    def apply_gradients(self, grads):
        """Install ``grads`` on the parameters and take an optimizer step."""
        # Hold the lock like every other method that mutates the model, so
        # a concurrent forward pass never sees half-updated parameters.
        with self.lock:
            self.optimizer.zero_grad()
            for g, p in zip(grads, self._model.parameters()):
                p.grad = torch.from_numpy(g)
            self.optimizer.step()
        return {}

    def get_weights(self):
        """Return the model state dict."""
        # !! This only returns references to the data.
        return self._model.state_dict()

    def set_weights(self, weights):
        """Load ``weights`` (a state dict) into the model."""
        with self.lock:
            self._model.load_state_dict(weights)

    def value(self, obs):
        """Return the value-branch estimate for a single observation."""
        with self.lock:
            obs = torch.from_numpy(obs).float().unsqueeze(0)
            res = self._model.hidden_layers(obs)
            res = self._model.value_branch(res)
            res = res.squeeze()
            return var_to_np(res)

    def forward(self, obs_batch, actions):
        """Run the model on a batch.

        Returns:
            Tuple of (values, log-probabilities of the given actions,
            entropy summed over the batch).
        """
        logits, values = self._model(obs_batch)
        log_probs = F.log_softmax(logits, dim=1)
        probs = F.softmax(logits, dim=1)
        action_log_probs = log_probs.gather(1, actions.view(-1, 1))
        entropy = -(log_probs * probs).sum(-1).sum()
        return values, action_log_probs, entropy

    def backward(self, sample_batch):
        """Loss is encoded here.

        Defining a new loss function would start by rewriting this function.
        Callers are expected to hold ``self.lock`` (compute_gradients does).
        """

        states, actions, advs, rs = convert_batch(sample_batch)
        values, action_log_probs, entropy = self.forward(states, actions)
        pi_err = -advs.dot(action_log_probs.reshape(-1))
        value_err = F.mse_loss(values.reshape(-1), rs)

        self.optimizer.zero_grad()

        overall_err = sum([
            pi_err,
            self.config["vf_loss_coeff"] * value_err,
            self.config["entropy_coeff"] * entropy,
        ])

        overall_err.backward()
        torch.nn.utils.clip_grad_norm_(self._model.parameters(),
                                       self.config["grad_clip"])

    def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
        """Compute advantages, bootstrapping from the value estimate.

        Uses 0.0 as the bootstrap reward when the batch ends on a terminal
        step, otherwise the value of the final ``new_obs``.
        """
        completed = sample_batch["dones"][-1]
        if completed:
            last_r = 0.0
        else:
            last_r = self.value(sample_batch["new_obs"][-1])
        return compute_advantages(
            sample_batch, last_r, self.config["gamma"], self.config["lambda"])
|
||||
@@ -8,7 +8,7 @@ def get_policy_cls(config):
|
||||
from ray.rllib.a3c.shared_model_lstm import SharedModelLSTM
|
||||
policy_cls = SharedModelLSTM
|
||||
elif config["use_pytorch"]:
|
||||
from ray.rllib.a3c.shared_torch_policy import SharedTorchPolicy
|
||||
from ray.rllib.a3c.a3c_torch_policy import SharedTorchPolicy
|
||||
policy_cls = SharedTorchPolicy
|
||||
else:
|
||||
from ray.rllib.a3c.shared_model import SharedModel
|
||||
|
||||
@@ -1,28 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
|
||||
class Policy(object):
    """The policy base class.

    The constructor accepts the common arguments but stores nothing; every
    other method is abstract and raises NotImplementedError until a
    subclass overrides it.
    """

    def __init__(self, ob_space, action_space, name="local", summarize=True):
        """Accept the shared constructor arguments without using them."""

    def apply_gradients(self, grads):
        """Apply the given gradients to the policy."""
        raise NotImplementedError

    def get_weights(self):
        """Return the policy weights."""
        raise NotImplementedError

    def set_weights(self, weights):
        """Replace the policy weights."""
        raise NotImplementedError

    def compute_gradients(self, samples):
        """Compute gradients for the given samples."""
        raise NotImplementedError

    def compute(self, observations):
        """Compute action for a _single_ observation"""
        raise NotImplementedError

    def value(self, ob):
        """Return the value estimate for an observation."""
        raise NotImplementedError
|
||||
@@ -4,30 +4,27 @@ from __future__ import print_function
|
||||
|
||||
import tensorflow as tf
|
||||
from ray.rllib.models.misc import linear, normc_initializer
|
||||
from ray.rllib.a3c.tfpolicy import TFPolicy
|
||||
from ray.rllib.a3c.a3c_tf_policy import A3CTFPolicyGraph
|
||||
from ray.rllib.models.catalog import ModelCatalog
|
||||
|
||||
|
||||
class SharedModel(TFPolicy):
|
||||
class SharedModel(A3CTFPolicyGraph):
|
||||
|
||||
other_output = ["vf_preds"]
|
||||
is_recurrent = False
|
||||
|
||||
def __init__(self, registry, ob_space, ac_space, config, **kwargs):
|
||||
def __init__(self, ob_space, ac_space, registry, config, **kwargs):
|
||||
super(SharedModel, self).__init__(
|
||||
registry, ob_space, ac_space, config, **kwargs)
|
||||
ob_space, ac_space, registry, config, **kwargs)
|
||||
|
||||
def _setup_graph(self, ob_space, ac_space):
|
||||
self.x = tf.placeholder(tf.float32, [None] + list(ob_space))
|
||||
self.x = tf.placeholder(tf.float32, [None] + list(ob_space.shape))
|
||||
dist_class, self.logit_dim = ModelCatalog.get_action_dist(ac_space)
|
||||
self._model = ModelCatalog.get_model(
|
||||
self.registry, self.x, self.logit_dim, self.config["model"])
|
||||
self.logits = self._model.outputs
|
||||
self.curr_dist = dist_class(self.logits)
|
||||
self.action_dist = dist_class(self.logits)
|
||||
self.vf = tf.reshape(linear(self._model.last_layer, 1, "value",
|
||||
normc_initializer(1.0)), [-1])
|
||||
|
||||
self.sample = self.curr_dist.sample()
|
||||
self.sample = self.action_dist.sample()
|
||||
self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
|
||||
tf.get_variable_scope().name)
|
||||
self.global_step = tf.get_variable(
|
||||
@@ -35,28 +32,20 @@ class SharedModel(TFPolicy):
|
||||
initializer=tf.constant_initializer(0, dtype=tf.int32),
|
||||
trainable=False)
|
||||
|
||||
def compute_gradients(self, samples):
|
||||
info = {}
|
||||
feed_dict = {
|
||||
self.x: samples["obs"],
|
||||
self.ac: samples["actions"],
|
||||
self.adv: samples["advantages"],
|
||||
self.r: samples["value_targets"],
|
||||
}
|
||||
self.grads = [g for g in self.grads if g is not None]
|
||||
self.local_steps += 1
|
||||
if self.summarize:
|
||||
grad, summ = self.sess.run([self.grads, self.summary_op],
|
||||
feed_dict=feed_dict)
|
||||
info['summary'] = summ
|
||||
else:
|
||||
grad = self.sess.run(self.grads, feed_dict=feed_dict)
|
||||
return grad, info
|
||||
self.state_in = []
|
||||
self.state_out = []
|
||||
|
||||
def compute(self, ob, *args):
|
||||
action, vf = self.sess.run([self.sample, self.vf],
|
||||
{self.x: [ob]})
|
||||
return action[0], {"vf_preds": vf[0]}
|
||||
def setup_loss(self, action_space):
|
||||
A3CTFPolicyGraph.setup_loss(self, action_space)
|
||||
self.loss_in = [
|
||||
("obs", self.x),
|
||||
("actions", self.ac),
|
||||
("advantages", self.adv),
|
||||
("value_targets", self.r),
|
||||
]
|
||||
|
||||
def extra_compute_action_fetches(self):
|
||||
return {"vf_preds": self.vf}
|
||||
|
||||
def value(self, ob, *args):
|
||||
vf = self.sess.run(self.vf, {self.x: [ob]})
|
||||
|
||||
@@ -5,43 +5,32 @@ from __future__ import print_function
|
||||
import tensorflow as tf
|
||||
from ray.rllib.models.misc import linear, normc_initializer
|
||||
from ray.rllib.models.catalog import ModelCatalog
|
||||
from ray.rllib.a3c.tfpolicy import TFPolicy
|
||||
from ray.rllib.a3c.a3c_tf_policy import A3CTFPolicyGraph
|
||||
from ray.rllib.models.lstm import LSTM
|
||||
|
||||
|
||||
class SharedModelLSTM(TFPolicy):
|
||||
"""
|
||||
Attributes:
|
||||
other_output (list): Other than `action`, the other return values from
|
||||
`compute_gradients`.
|
||||
is_recurrent (bool): True if is a recurrent network (requires features
|
||||
to be tracked).
|
||||
"""
|
||||
class SharedModelLSTM(A3CTFPolicyGraph):
|
||||
|
||||
other_output = ["vf_preds", "features"]
|
||||
is_recurrent = True
|
||||
|
||||
def __init__(self, registry, ob_space, ac_space, config, **kwargs):
|
||||
def __init__(self, ob_space, ac_space, registry, config, **kwargs):
|
||||
super(SharedModelLSTM, self).__init__(
|
||||
registry, ob_space, ac_space, config, **kwargs)
|
||||
ob_space, ac_space, registry, config, **kwargs)
|
||||
|
||||
def _setup_graph(self, ob_space, ac_space):
|
||||
self.x = tf.placeholder(tf.float32, [None] + list(ob_space))
|
||||
self.x = tf.placeholder(tf.float32, [None] + list(ob_space.shape))
|
||||
dist_class, self.logit_dim = ModelCatalog.get_action_dist(ac_space)
|
||||
self._model = LSTM(self.x, self.logit_dim, {})
|
||||
|
||||
self.state_init = self._model.state_init
|
||||
self.state_in = self._model.state_in
|
||||
self.state_out = self._model.state_out
|
||||
|
||||
self.logits = self._model.outputs
|
||||
self.curr_dist = dist_class(self.logits)
|
||||
self.action_dist = dist_class(self.logits)
|
||||
# with tf.variable_scope("vf"):
|
||||
# vf_model = ModelCatalog.get_model(self.x, 1)
|
||||
self.vf = tf.reshape(linear(self._model.last_layer, 1, "value",
|
||||
normc_initializer(1.0)), [-1])
|
||||
|
||||
self.sample = self.curr_dist.sample()
|
||||
self.sample = self.action_dist.sample()
|
||||
self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
|
||||
tf.get_variable_scope().name)
|
||||
self.global_step = tf.get_variable(
|
||||
@@ -49,42 +38,25 @@ class SharedModelLSTM(TFPolicy):
|
||||
initializer=tf.constant_initializer(0, dtype=tf.int32),
|
||||
trainable=False)
|
||||
|
||||
def compute_gradients(self, samples):
|
||||
"""Computing the gradient is actually model-dependent.
|
||||
def get_initial_state(self):
|
||||
return self._model.state_init
|
||||
|
||||
The LSTM needs its hidden states in order to compute the gradient
|
||||
accurately.
|
||||
"""
|
||||
features = samples["features"][0]
|
||||
feed_dict = {
|
||||
self.x: samples["obs"],
|
||||
self.ac: samples["actions"],
|
||||
self.adv: samples["advantages"],
|
||||
self.r: samples["value_targets"],
|
||||
self.state_in[0]: features[0],
|
||||
self.state_in[1]: features[1]
|
||||
}
|
||||
info = {}
|
||||
self.local_steps += 1
|
||||
if self.summarize and self.local_steps % 10 == 0:
|
||||
grad, summ = self.sess.run([self.grads, self.summary_op],
|
||||
feed_dict=feed_dict)
|
||||
info['summary'] = summ
|
||||
else:
|
||||
grad = self.sess.run(self.grads, feed_dict=feed_dict)
|
||||
return grad, info
|
||||
def setup_loss(self, action_space):
|
||||
A3CTFPolicyGraph.setup_loss(self, action_space)
|
||||
self.loss_in = [
|
||||
("obs", self.x),
|
||||
("actions", self.ac),
|
||||
("advantages", self.adv),
|
||||
("value_targets", self.r),
|
||||
("state_in_0", self.state_in[0]),
|
||||
("state_in_1", self.state_in[1]),
|
||||
]
|
||||
|
||||
def compute(self, ob, c, h):
|
||||
action, vf, c, h = self.sess.run(
|
||||
[self.sample, self.vf] + self.state_out,
|
||||
{self.x: [ob], self.state_in[0]: c, self.state_in[1]: h})
|
||||
return action[0], {"vf_preds": vf[0], "features": (c, h)}
|
||||
def extra_compute_action_fetches(self):
|
||||
return {"vf_preds": self.vf}
|
||||
|
||||
def value(self, ob, c, h):
|
||||
vf = self.sess.run(self.vf, {self.x: [ob],
|
||||
self.state_in[0]: c,
|
||||
self.state_in[1]: h})
|
||||
return vf[0]
|
||||
|
||||
def get_initial_features(self):
|
||||
return self.state_init
|
||||
|
||||
@@ -1,106 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import tensorflow as tf
|
||||
import ray
|
||||
import gym
|
||||
from ray.rllib.a3c.policy import Policy
|
||||
|
||||
|
||||
class TFPolicy(Policy):
|
||||
"""The policy base class."""
|
||||
def __init__(self, registry, ob_space, action_space, config,
|
||||
name="local", summarize=True):
|
||||
self.registry = registry
|
||||
self.local_steps = 0
|
||||
self.config = config
|
||||
self.summarize = summarize
|
||||
worker_device = "/job:localhost/replica:0/task:0/cpu:0"
|
||||
self.g = tf.Graph()
|
||||
with self.g.as_default(), tf.device(worker_device):
|
||||
with tf.variable_scope(name):
|
||||
self._setup_graph(ob_space, action_space)
|
||||
assert all(hasattr(self, attr)
|
||||
for attr in ["vf", "logits", "x", "var_list"])
|
||||
print("Setting up loss")
|
||||
self.setup_loss(action_space)
|
||||
self.setup_gradients()
|
||||
self.initialize()
|
||||
|
||||
def _setup_graph(self, ob_space, ac_space):
|
||||
raise NotImplementedError
|
||||
|
||||
def setup_loss(self, action_space):
|
||||
if isinstance(action_space, gym.spaces.Box):
|
||||
ac_size = action_space.shape[0]
|
||||
self.ac = tf.placeholder(tf.float32, [None, ac_size], name="ac")
|
||||
elif isinstance(action_space, gym.spaces.Discrete):
|
||||
self.ac = tf.placeholder(tf.int64, [None], name="ac")
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
"action space" + str(type(action_space)) +
|
||||
"currently not supported")
|
||||
self.adv = tf.placeholder(tf.float32, [None], name="adv")
|
||||
self.r = tf.placeholder(tf.float32, [None], name="r")
|
||||
|
||||
log_prob = self.curr_dist.logp(self.ac)
|
||||
|
||||
# The "policy gradients" loss: its derivative is precisely the policy
|
||||
# gradient. Notice that self.ac is a placeholder that is provided
|
||||
# externally. adv will contain the advantages, as calculated in
|
||||
# process_rollout.
|
||||
self.pi_loss = - tf.reduce_sum(log_prob * self.adv)
|
||||
|
||||
delta = self.vf - self.r
|
||||
self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta))
|
||||
self.entropy = tf.reduce_sum(self.curr_dist.entropy())
|
||||
self.loss = (self.pi_loss +
|
||||
self.vf_loss * self.config["vf_loss_coeff"] +
|
||||
self.entropy * self.config["entropy_coeff"])
|
||||
|
||||
def setup_gradients(self):
|
||||
grads = tf.gradients(self.loss, self.var_list)
|
||||
self.grads, _ = tf.clip_by_global_norm(grads, self.config["grad_clip"])
|
||||
grads_and_vars = list(zip(self.grads, self.var_list))
|
||||
opt = tf.train.AdamOptimizer(self.config["lr"])
|
||||
self._apply_gradients = opt.apply_gradients(grads_and_vars)
|
||||
|
||||
def initialize(self):
|
||||
if self.summarize:
|
||||
bs = tf.to_float(tf.shape(self.x)[0])
|
||||
tf.summary.scalar("model/policy_loss", self.pi_loss / bs)
|
||||
tf.summary.scalar("model/value_loss", self.vf_loss / bs)
|
||||
tf.summary.scalar("model/entropy", self.entropy / bs)
|
||||
tf.summary.scalar("model/grad_gnorm", tf.global_norm(self.grads))
|
||||
tf.summary.scalar("model/var_gnorm", tf.global_norm(self.var_list))
|
||||
self.summary_op = tf.summary.merge_all()
|
||||
|
||||
# TODO(rliaw): Can consider exposing these parameters
|
||||
self.sess = tf.Session(graph=self.g, config=tf.ConfigProto(
|
||||
intra_op_parallelism_threads=1, inter_op_parallelism_threads=2,
|
||||
gpu_options=tf.GPUOptions(allow_growth=True)))
|
||||
self.variables = ray.experimental.TensorFlowVariables(self.loss,
|
||||
self.sess)
|
||||
self.sess.run(tf.global_variables_initializer())
|
||||
|
||||
def apply_gradients(self, grads):
|
||||
feed_dict = {self.grads[i]: grads[i]
|
||||
for i in range(len(grads))}
|
||||
self.sess.run(self._apply_gradients, feed_dict=feed_dict)
|
||||
|
||||
def get_weights(self):
|
||||
weights = self.variables.get_weights()
|
||||
return weights
|
||||
|
||||
def set_weights(self, weights):
|
||||
self.variables.set_weights(weights)
|
||||
|
||||
def compute_gradients(self, samples):
|
||||
raise NotImplementedError
|
||||
|
||||
def compute(self, observation):
|
||||
raise NotImplementedError
|
||||
|
||||
def value(self, ob):
    """Abstract hook: not implemented on this class.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError()
|
||||
@@ -61,7 +61,7 @@ class Agent(Trainable):
|
||||
"""
|
||||
|
||||
_allow_unknown_configs = False
|
||||
_allow_unknown_subkeys = []
|
||||
_allow_unknown_subkeys = ["env_config", "model", "optimizer"]
|
||||
|
||||
@classmethod
|
||||
def resource_help(cls, config):
|
||||
|
||||
@@ -17,8 +17,7 @@ class BCEvaluator(PolicyEvaluator):
|
||||
env = ModelCatalog.get_preprocessor_as_wrapper(registry, env_creator(
|
||||
config["env_config"]), config["model"])
|
||||
self.dataset = ExperienceDataset(config["dataset_path"])
|
||||
# TODO(rliaw): should change this to be just env.observation_space
|
||||
self.policy = BCPolicy(registry, env.observation_space.shape,
|
||||
self.policy = BCPolicy(registry, env.observation_space,
|
||||
env.action_space, config)
|
||||
self.config = config
|
||||
self.logdir = logdir
|
||||
|
||||
@@ -6,30 +6,22 @@ import tensorflow as tf
|
||||
import gym
|
||||
|
||||
import ray
|
||||
from ray.rllib.a3c.policy import Policy
|
||||
from ray.rllib.models.catalog import ModelCatalog
|
||||
|
||||
|
||||
class BCPolicy(Policy):
|
||||
def __init__(self, registry, ob_space, action_space, config, name="local",
|
||||
summarize=True):
|
||||
super(BCPolicy, self).__init__(ob_space, action_space, name, summarize)
|
||||
class BCPolicy(object):
|
||||
def __init__(self, registry, obs_space, action_space, config):
|
||||
self.registry = registry
|
||||
self.local_steps = 0
|
||||
self.config = config
|
||||
self.summarize = summarize
|
||||
worker_device = "/job:localhost/replica:0/task:0/cpu:0"
|
||||
self.g = tf.Graph()
|
||||
with self.g.as_default(), tf.device(worker_device):
|
||||
with tf.variable_scope(name):
|
||||
self._setup_graph(ob_space, action_space)
|
||||
print("Setting up loss")
|
||||
self.setup_loss(action_space)
|
||||
self.setup_gradients()
|
||||
self.initialize()
|
||||
self.summarize = config.get("summarize")
|
||||
self._setup_graph(obs_space, action_space)
|
||||
self.setup_loss(action_space)
|
||||
self.setup_gradients()
|
||||
self.initialize()
|
||||
|
||||
def _setup_graph(self, ob_space, ac_space):
|
||||
self.x = tf.placeholder(tf.float32, [None] + list(ob_space))
|
||||
def _setup_graph(self, obs_space, ac_space):
|
||||
self.x = tf.placeholder(tf.float32, [None] + list(obs_space.shape))
|
||||
dist_class, self.logit_dim = ModelCatalog.get_action_dist(ac_space)
|
||||
self._model = ModelCatalog.get_model(
|
||||
self.registry, self.x, self.logit_dim, self.config["model"])
|
||||
|
||||
@@ -8,25 +8,25 @@ from ray.utils import merge_dicts
|
||||
APEX_DDPG_DEFAULT_CONFIG = merge_dicts(
|
||||
DDPG_CONFIG,
|
||||
{
|
||||
'optimizer_class': 'ApexOptimizer',
|
||||
'optimizer_config':
|
||||
"optimizer_class": "ApexOptimizer",
|
||||
"optimizer_config":
|
||||
merge_dicts(
|
||||
DDPG_CONFIG['optimizer_config'], {
|
||||
'max_weight_sync_delay': 400,
|
||||
'num_replay_buffer_shards': 4,
|
||||
'debug': False
|
||||
DDPG_CONFIG["optimizer_config"], {
|
||||
"max_weight_sync_delay": 400,
|
||||
"num_replay_buffer_shards": 4,
|
||||
"debug": False
|
||||
}),
|
||||
'n_step': 3,
|
||||
'num_workers': 32,
|
||||
'buffer_size': 2000000,
|
||||
'learning_starts': 50000,
|
||||
'train_batch_size': 512,
|
||||
'sample_batch_size': 50,
|
||||
'max_weight_sync_delay': 400,
|
||||
'target_network_update_freq': 500000,
|
||||
'timesteps_per_iteration': 25000,
|
||||
'per_worker_exploration': True,
|
||||
'worker_side_prioritization': True,
|
||||
"n_step": 3,
|
||||
"num_workers": 32,
|
||||
"buffer_size": 2000000,
|
||||
"learning_starts": 50000,
|
||||
"train_batch_size": 512,
|
||||
"sample_batch_size": 50,
|
||||
"max_weight_sync_delay": 400,
|
||||
"target_network_update_freq": 500000,
|
||||
"timesteps_per_iteration": 25000,
|
||||
"per_worker_exploration": True,
|
||||
"worker_side_prioritization": True,
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
+60
-195
@@ -2,17 +2,9 @@ from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import pickle
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
|
||||
import ray
|
||||
from ray.rllib import optimizers
|
||||
from ray.rllib.ddpg.ddpg_evaluator import DDPGEvaluator
|
||||
from ray.rllib.agent import Agent
|
||||
from ray.tune.result import TrainingResult
|
||||
from ray.rllib.dqn.common.schedules import ConstantSchedule, LinearSchedule
|
||||
from ray.rllib.dqn.dqn import DQNAgent
|
||||
from ray.rllib.ddpg.ddpg_policy_graph import DDPGPolicyGraph
|
||||
|
||||
OPTIMIZER_SHARED_CONFIGS = [
|
||||
"buffer_size", "prioritized_replay", "prioritized_replay_alpha",
|
||||
@@ -23,247 +15,120 @@ OPTIMIZER_SHARED_CONFIGS = [
|
||||
DEFAULT_CONFIG = {
|
||||
# === Model ===
|
||||
# Hidden layer sizes of the policy networks
|
||||
'actor_hiddens': [64, 64],
|
||||
"actor_hiddens": [64, 64],
|
||||
# Hidden layer sizes of the critic networks
|
||||
'critic_hiddens': [64, 64],
|
||||
"critic_hiddens": [64, 64],
|
||||
# N-step Q learning
|
||||
'n_step': 1,
|
||||
"n_step": 1,
|
||||
# Config options to pass to the model constructor
|
||||
'model': {},
|
||||
"model": {},
|
||||
# Discount factor for the MDP
|
||||
'gamma': 0.99,
|
||||
"gamma": 0.99,
|
||||
# Arguments to pass to the env creator
|
||||
'env_config': {},
|
||||
"env_config": {},
|
||||
|
||||
# === Exploration ===
|
||||
# Max num timesteps for annealing schedules. Exploration is annealed from
|
||||
# 1.0 to exploration_final_eps over this number of timesteps scaled by
|
||||
# exploration_fraction
|
||||
'schedule_max_timesteps': 100000,
|
||||
"schedule_max_timesteps": 100000,
|
||||
# Number of env steps to optimize for before returning
|
||||
'timesteps_per_iteration': 1000,
|
||||
"timesteps_per_iteration": 1000,
|
||||
# Fraction of entire training period over which the exploration rate is
|
||||
# annealed
|
||||
'exploration_fraction': 0.1,
|
||||
"exploration_fraction": 0.1,
|
||||
# Final value of random action probability
|
||||
'exploration_final_eps': 0.02,
|
||||
"exploration_final_eps": 0.02,
|
||||
# OU-noise scale
|
||||
'noise_scale': 0.1,
|
||||
"noise_scale": 0.1,
|
||||
# theta
|
||||
'exploration_theta': 0.15,
|
||||
"exploration_theta": 0.15,
|
||||
# sigma
|
||||
'exploration_sigma': 0.2,
|
||||
"exploration_sigma": 0.2,
|
||||
# Update the target network every `target_network_update_freq` steps.
|
||||
'target_network_update_freq': 0,
|
||||
"target_network_update_freq": 0,
|
||||
# Update the target by \tau * policy + (1-\tau) * target_policy
|
||||
'tau': 0.002,
|
||||
# Whether to start with random actions instead of noops.
|
||||
'random_starts': True,
|
||||
"tau": 0.002,
|
||||
|
||||
# === Replay buffer ===
|
||||
# Size of the replay buffer. Note that if async_updates is set, then
|
||||
# each worker will have a replay buffer of this size.
|
||||
'buffer_size': 50000,
|
||||
"buffer_size": 50000,
|
||||
# If True prioritized replay buffer will be used.
|
||||
'prioritized_replay': True,
|
||||
"prioritized_replay": True,
|
||||
# Alpha parameter for prioritized replay buffer.
|
||||
'prioritized_replay_alpha': 0.6,
|
||||
"prioritized_replay_alpha": 0.6,
|
||||
# Beta parameter for sampling from prioritized replay buffer.
|
||||
'prioritized_replay_beta': 0.4,
|
||||
"prioritized_replay_beta": 0.4,
|
||||
# Epsilon to add to the TD errors when updating priorities.
|
||||
'prioritized_replay_eps': 1e-6,
|
||||
"prioritized_replay_eps": 1e-6,
|
||||
# Whether to clip rewards to [-1, 1] prior to adding to the replay buffer.
|
||||
'clip_rewards': True,
|
||||
"clip_rewards": True,
|
||||
|
||||
# === Optimization ===
|
||||
# Learning rate for adam optimizer
|
||||
'actor_lr': 1e-4,
|
||||
'critic_lr': 1e-3,
|
||||
"actor_lr": 1e-4,
|
||||
"critic_lr": 1e-3,
|
||||
# If True, use huber loss instead of squared loss for critic network
|
||||
# Conventionally, no need to clip gradients if using a huber loss
|
||||
'use_huber': False,
|
||||
"use_huber": False,
|
||||
# Threshold of a huber loss
|
||||
'huber_threshold': 1.0,
|
||||
"huber_threshold": 1.0,
|
||||
# Weights for L2 regularization
|
||||
'l2_reg': 1e-6,
|
||||
"l2_reg": 1e-6,
|
||||
# If not None, clip gradients during optimization at this value
|
||||
'grad_norm_clipping': None,
|
||||
"grad_norm_clipping": None,
|
||||
# How many steps of the model to sample before learning starts.
|
||||
'learning_starts': 1500,
|
||||
"learning_starts": 1500,
|
||||
# Update the replay buffer with this many samples at once. Note that this
|
||||
# setting applies per-worker if num_workers > 1.
|
||||
'sample_batch_size': 1,
|
||||
"sample_batch_size": 1,
|
||||
# Size of a batch sampled from the replay buffer for training. Note that
|
||||
# if async_updates is set, then each worker returns gradients for a
|
||||
# batch of this size.
|
||||
'train_batch_size': 256,
|
||||
# Smooth the current average reward over this many previous episodes.
|
||||
'smoothing_num_episodes': 100,
|
||||
|
||||
# === Tensorflow ===
|
||||
# Arguments to pass to tensorflow
|
||||
'tf_session_args': {
|
||||
"device_count": {
|
||||
"CPU": 2
|
||||
},
|
||||
"log_device_placement": False,
|
||||
"allow_soft_placement": True,
|
||||
"gpu_options": {
|
||||
"allow_growth": True
|
||||
},
|
||||
"inter_op_parallelism_threads": 1,
|
||||
"intra_op_parallelism_threads": 1,
|
||||
},
|
||||
"train_batch_size": 256,
|
||||
|
||||
# === Parallelism ===
|
||||
# Whether to use a GPU for local optimization.
|
||||
"gpu": False,
|
||||
# Number of workers for collecting samples with. This only makes sense
|
||||
# to increase if your environment is particularly slow to sample, or if
|
||||
# you're using the Async or Ape-X optimizers.
|
||||
'num_workers': 0,
|
||||
# you're using the Async or Ape-X optimizers.
|
||||
"num_workers": 0,
|
||||
# Whether to allocate GPUs for workers (if > 0).
|
||||
'num_gpus_per_worker': 0,
|
||||
"num_gpus_per_worker": 0,
|
||||
# Number of CPUs to allocate per worker.
|
||||
"num_cpus_per_worker": 1,
|
||||
# Optimizer class to use.
|
||||
'optimizer_class': "LocalSyncReplayOptimizer",
|
||||
"optimizer_class": "LocalSyncReplayOptimizer",
|
||||
# Config to pass to the optimizer.
|
||||
'optimizer_config': {},
|
||||
"optimizer_config": {},
|
||||
# Whether to use a distribution of epsilons across workers for exploration.
|
||||
'per_worker_exploration': False,
|
||||
"per_worker_exploration": False,
|
||||
# Whether to compute priorities on workers.
|
||||
'worker_side_prioritization': False
|
||||
"worker_side_prioritization": False
|
||||
}
|
||||
|
||||
|
||||
class DDPGAgent(Agent):
|
||||
class DDPGAgent(DQNAgent):
|
||||
_agent_name = "DDPG"
|
||||
_allow_unknown_subkeys = [
|
||||
"model", "optimizer", "tf_session_args", "env_config"
|
||||
]
|
||||
"model", "optimizer", "tf_session_args", "env_config"]
|
||||
_default_config = DEFAULT_CONFIG
|
||||
_policy_graph = DDPGPolicyGraph
|
||||
|
||||
def _init(self):
|
||||
self.local_evaluator = DDPGEvaluator(self.registry, self.env_creator,
|
||||
self.config, self.logdir, 0)
|
||||
remote_cls = ray.remote(
|
||||
num_cpus=1,
|
||||
num_gpus=self.config["num_gpus_per_worker"])(DDPGEvaluator)
|
||||
self.remote_evaluators = [
|
||||
remote_cls.remote(self.registry, self.env_creator, self.config,
|
||||
self.logdir, i)
|
||||
for i in range(self.config["num_workers"])
|
||||
]
|
||||
|
||||
for k in OPTIMIZER_SHARED_CONFIGS:
|
||||
if k not in self.config["optimizer_config"]:
|
||||
self.config["optimizer_config"][k] = self.config[k]
|
||||
|
||||
self.optimizer = getattr(optimizers, self.config["optimizer_class"])(
|
||||
self.config["optimizer_config"], self.local_evaluator,
|
||||
self.remote_evaluators)
|
||||
|
||||
self.saver = tf.train.Saver(max_to_keep=None)
|
||||
self.last_target_update_ts = 0
|
||||
self.num_target_updates = 0
|
||||
|
||||
@property
|
||||
def global_timestep(self):
|
||||
return self.optimizer.num_steps_sampled
|
||||
|
||||
def update_target_if_needed(self):
|
||||
if self.global_timestep - self.last_target_update_ts > \
|
||||
self.config["target_network_update_freq"]:
|
||||
self.local_evaluator.update_target()
|
||||
self.last_target_update_ts = self.global_timestep
|
||||
self.num_target_updates += 1
|
||||
|
||||
def _train(self):
|
||||
start_timestep = self.global_timestep
|
||||
|
||||
while (self.global_timestep - start_timestep <
|
||||
self.config["timesteps_per_iteration"]):
|
||||
|
||||
self.optimizer.step()
|
||||
self.update_target_if_needed()
|
||||
|
||||
self.local_evaluator.set_global_timestep(self.global_timestep)
|
||||
for e in self.remote_evaluators:
|
||||
e.set_global_timestep.remote(self.global_timestep)
|
||||
|
||||
return self._train_stats(start_timestep)
|
||||
|
||||
def _train_stats(self, start_timestep):
|
||||
if self.remote_evaluators:
|
||||
stats = ray.get([e.stats.remote() for e in self.remote_evaluators])
|
||||
else:
|
||||
stats = self.local_evaluator.stats()
|
||||
if not isinstance(stats, list):
|
||||
stats = [stats]
|
||||
|
||||
mean_100ep_reward = 0.0
|
||||
mean_100ep_length = 0.0
|
||||
num_episodes = 0
|
||||
explorations = []
|
||||
|
||||
def _make_exploration_schedule(self, worker_index):
|
||||
# Override DQN's schedule to take into account `noise_scale`
|
||||
if self.config["per_worker_exploration"]:
|
||||
# Return stats from workers with the lowest 20% of exploration
|
||||
test_stats = stats[-int(max(1, len(stats) * 0.2)):]
|
||||
assert self.config["num_workers"] > 1, \
|
||||
"This requires multiple workers"
|
||||
return ConstantSchedule(
|
||||
self.config["noise_scale"] * 0.4 **
|
||||
(1 + worker_index / float(self.config["num_workers"] - 1) * 7))
|
||||
else:
|
||||
test_stats = stats
|
||||
|
||||
for s in test_stats:
|
||||
mean_100ep_reward += s["mean_100ep_reward"] / len(test_stats)
|
||||
mean_100ep_length += s["mean_100ep_length"] / len(test_stats)
|
||||
|
||||
for s in stats:
|
||||
num_episodes += s["num_episodes"]
|
||||
explorations.append(s["exploration"])
|
||||
|
||||
opt_stats = self.optimizer.stats()
|
||||
|
||||
result = TrainingResult(
|
||||
episode_reward_mean=mean_100ep_reward,
|
||||
episode_len_mean=mean_100ep_length,
|
||||
episodes_total=num_episodes,
|
||||
timesteps_this_iter=self.global_timestep - start_timestep,
|
||||
info=dict({
|
||||
"min_exploration": min(explorations),
|
||||
"max_exploration": max(explorations),
|
||||
"num_target_updates": self.num_target_updates,
|
||||
}, **opt_stats))
|
||||
|
||||
return result
|
||||
|
||||
def _stop(self):
|
||||
# workaround for https://github.com/ray-project/ray/issues/1516
|
||||
for ev in self.remote_evaluators:
|
||||
ev.__ray_terminate__.remote()
|
||||
|
||||
def _save(self, checkpoint_dir):
|
||||
checkpoint_path = self.saver.save(
|
||||
self.local_evaluator.sess,
|
||||
os.path.join(checkpoint_dir, "checkpoint"),
|
||||
global_step=self.iteration)
|
||||
extra_data = [
|
||||
self.local_evaluator.save(),
|
||||
ray.get([e.save.remote() for e in self.remote_evaluators]),
|
||||
self.optimizer.save(), self.num_target_updates,
|
||||
self.last_target_update_ts
|
||||
]
|
||||
pickle.dump(extra_data, open(checkpoint_path + ".extra_data", "wb"))
|
||||
return checkpoint_path
|
||||
|
||||
def _restore(self, checkpoint_path):
|
||||
self.saver.restore(self.local_evaluator.sess, checkpoint_path)
|
||||
extra_data = pickle.load(open(checkpoint_path + ".extra_data", "rb"))
|
||||
self.local_evaluator.restore(extra_data[0])
|
||||
ray.get([
|
||||
e.restore.remote(d)
|
||||
for (d, e) in zip(extra_data[1], self.remote_evaluators)
|
||||
])
|
||||
self.optimizer.restore(extra_data[2])
|
||||
self.num_target_updates = extra_data[3]
|
||||
self.last_target_update_ts = extra_data[4]
|
||||
|
||||
def compute_action(self, observation):
|
||||
return self.local_evaluator.ddpg_graph.act(self.local_evaluator.sess,
|
||||
np.array(observation)[None],
|
||||
0.0)[0]
|
||||
return LinearSchedule(
|
||||
schedule_timesteps=int(self.config["exploration_fraction"] *
|
||||
self.config["schedule_max_timesteps"]),
|
||||
initial_p=self.config["noise_scale"] * 1.0,
|
||||
final_p=self.config["noise_scale"] *
|
||||
self.config["exploration_final_eps"])
|
||||
|
||||
@@ -1,186 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from gym.spaces import Box
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
|
||||
import ray
|
||||
from ray.rllib.utils.error import UnsupportedSpaceException
|
||||
from ray.rllib.ddpg import models
|
||||
from ray.rllib.dqn.common.schedules import ConstantSchedule, LinearSchedule
|
||||
from ray.rllib.optimizers import SampleBatch, PolicyEvaluator
|
||||
from ray.rllib.utils.compression import pack
|
||||
from ray.rllib.dqn.dqn_evaluator import adjust_nstep
|
||||
from ray.rllib.dqn.common.wrappers import wrap_dqn
|
||||
|
||||
|
||||
class DDPGEvaluator(PolicyEvaluator):
    """The base DDPG Evaluator.

    Holds one wrapped environment, a TensorFlow session and a
    ``models.DDPGGraph``. It steps the environment with exploration
    noise to produce sample batches and forwards gradient, target-update
    and weight operations to the graph.
    """

    # Attributes captured by save() and reinstated by restore(), in the
    # positional order of the saved list.
    _CHECKPOINT_FIELDS = (
        "exploration", "episode_rewards", "episode_lengths",
        "saved_mean_reward", "obs", "global_timestep", "local_timestep")

    def __init__(self, registry, env_creator, config, logdir, worker_index):
        """Create the environment, session, graph and exploration schedule.

        Args:
            registry: passed to ``wrap_dqn`` and ``models.DDPGGraph``.
            env_creator: callable invoked with ``config["env_config"]``.
            config: evaluator configuration dict.
            logdir: passed to ``models.DDPGGraph``.
            worker_index: index used to pick this worker's constant noise
                scale when ``config["per_worker_exploration"]`` is set.

        Raises:
            UnsupportedSpaceException: if the wrapped environment's action
                space is not a ``Box``.
        """
        self.env = wrap_dqn(
            registry, env_creator(config["env_config"]), config["model"],
            config["random_starts"])
        self.config = config

        # Only Box action spaces are handled, e.g. Pendulum-v0 where
        # action_space.low is [-2.0], high is [2.0], and an action is
        # taken by calling env.step([3.5]).
        if not isinstance(self.env.action_space, Box):
            raise UnsupportedSpaceException(
                "Action space {} is not supported for DDPG.".format(
                    self.env.action_space))

        self.sess = tf.Session(
            config=tf.ConfigProto(**config["tf_session_args"]))
        self.ddpg_graph = models.DDPGGraph(
            registry, self.env, config, logdir)

        # Use either a different `eps` per worker, or a linear schedule.
        self.exploration = self._make_exploration(config, worker_index)

        # Initialize the parameters and copy them to the target network.
        self.sess.run(tf.global_variables_initializer())
        # Passing 1.0 makes this first sync hard instead of soft.
        self.ddpg_graph.update_target(self.sess, 1.0)
        self.global_timestep = 0
        self.local_timestep = 0

        # Note that this encompasses both the policy and Q-value networks
        # and their corresponding target networks.
        self.variables = ray.experimental.TensorFlowVariables(
            tf.group(self.ddpg_graph.q_tp0, self.ddpg_graph.q_tp1),
            self.sess)

        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None

        self.obs = self.env.reset()

    @staticmethod
    def _make_exploration(config, worker_index):
        """Return the noise-scale schedule selected by ``config``."""
        if not config["per_worker_exploration"]:
            anneal_steps = int(config["exploration_fraction"] *
                               config["schedule_max_timesteps"])
            return LinearSchedule(
                schedule_timesteps=anneal_steps,
                initial_p=config["noise_scale"] * 1.0,
                final_p=config["noise_scale"] *
                config["exploration_final_eps"])

        assert config["num_workers"] > 1, "This requires multiple workers"
        exponent = 1 + worker_index / float(config["num_workers"] - 1) * 7
        return ConstantSchedule(config["noise_scale"] * 0.4 ** exponent)

    def set_global_timestep(self, global_timestep):
        """Record the global timestep used to evaluate the schedule."""
        self.global_timestep = global_timestep

    def update_target(self):
        """Ask the graph to update its target networks."""
        self.ddpg_graph.update_target(self.sess)

    def sample(self):
        """Step the environment and return one batch of transitions.

        Takes ``sample_batch_size + n_step - 1`` environment steps, applies
        the n-step adjustment when ``n_step > 1``, and optionally replaces
        the unit weights with worker-side priorities.

        Returns:
            A ``SampleBatch`` whose ``count`` equals
            ``config["sample_batch_size"]`` (asserted).
        """
        n_step = self.config["n_step"]
        num_env_steps = self.config["sample_batch_size"] + n_step - 1
        transitions = [
            self._step(self.global_timestep) for _ in range(num_env_steps)]
        obs = [t[0] for t in transitions]
        actions = [t[1] for t in transitions]
        rewards = [t[2] for t in transitions]
        new_obs = [t[3] for t in transitions]
        dones = [t[4] for t in transitions]

        # N-step Q adjustments
        if n_step > 1:
            # Adjust for steps lost from truncation
            self.local_timestep -= (n_step - 1)
            adjust_nstep(n_step, self.config["gamma"], obs, actions,
                         rewards, new_obs, dones)

        batch = SampleBatch({
            "obs": [pack(np.array(o)) for o in obs],
            "actions": actions,
            "rewards": rewards,
            "new_obs": [pack(np.array(o)) for o in new_obs],
            "dones": dones,
            "weights": np.ones_like(rewards)
        })
        assert (batch.count == self.config["sample_batch_size"])

        # Prioritize on the worker side
        if self.config["worker_side_prioritization"]:
            td_errors = self.ddpg_graph.compute_td_error(
                self.sess, obs, batch["actions"], batch["rewards"],
                new_obs, batch["dones"], batch["weights"])
            batch.data["weights"] = (
                np.abs(td_errors) + self.config["prioritized_replay_eps"])

        return batch

    def _replay_columns(self, samples):
        """Return the session plus the six batch columns the graph takes."""
        return (self.sess, samples["obs"], samples["actions"],
                samples["rewards"], samples["new_obs"], samples["dones"],
                samples["weights"])

    def compute_gradients(self, samples):
        """Return ``(grads, {"td_error": ...})`` computed by the graph."""
        td_err, grads = self.ddpg_graph.compute_gradients(
            *self._replay_columns(samples))
        return grads, {"td_error": td_err}

    def apply_gradients(self, grads):
        """Hand ``grads`` to the graph for application."""
        self.ddpg_graph.apply_gradients(self.sess, grads)

    def compute_apply(self, samples):
        """Compute and apply in one graph call; return the TD error."""
        td_error = self.ddpg_graph.compute_apply(
            *self._replay_columns(samples))
        return {"td_error": td_error}

    def get_weights(self):
        """Return the weights reported by ``self.variables``."""
        return self.variables.get_weights()

    def set_weights(self, weights):
        """Load ``weights`` through ``self.variables``."""
        self.variables.set_weights(weights)

    def _step(self, global_timestep):
        """Takes a single step, and returns the result of the step."""
        noise_scale = self.exploration.value(global_timestep)
        action = self.ddpg_graph.act(
            self.sess, np.array(self.obs)[None], noise_scale)[0]
        previous_obs = self.obs
        new_obs, rew, done, _ = self.env.step(action)
        transition = (previous_obs, action, rew, new_obs, float(done))

        self.episode_rewards[-1] += rew
        self.episode_lengths[-1] += 1
        if done:
            self.obs = self.env.reset()
            self.episode_rewards.append(0.0)
            self.episode_lengths.append(0.0)
            # reset OU noise for each episode
            self.ddpg_graph.reset_noise(self.sess)
        else:
            self.obs = new_obs

        self.local_timestep += 1
        return transition

    def stats(self):
        """Return smoothed episode statistics and the exploration value.

        The means cover up to ``smoothing_num_episodes`` entries and
        exclude the last (in-progress) entry of each list.
        """
        recent = slice(-(self.config["smoothing_num_episodes"] + 1), -1)
        mean_reward = round(np.mean(self.episode_rewards[recent]), 5)
        mean_length = round(np.mean(self.episode_lengths[recent]), 5)
        return {
            "mean_100ep_reward": mean_reward,
            "mean_100ep_length": mean_length,
            "num_episodes": len(self.episode_rewards),
            "exploration": self.exploration.value(self.global_timestep),
            "local_timestep": self.local_timestep,
        }

    def save(self):
        """Return the checkpointable state as a positional list."""
        return [getattr(self, field) for field in self._CHECKPOINT_FIELDS]

    def restore(self, data):
        """Reinstate state from a list produced by ``save``."""
        for position, field in enumerate(self._CHECKPOINT_FIELDS):
            setattr(self, field, data[position])
|
||||
@@ -0,0 +1,327 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from gym.spaces import Box
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
import tensorflow.contrib.layers as layers
|
||||
|
||||
import ray
|
||||
from ray.rllib.dqn.dqn_policy_graph import _huber_loss, _minimize_and_clip, \
|
||||
_scope_vars, _postprocess_dqn
|
||||
from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.utils.error import UnsupportedSpaceException
|
||||
from ray.rllib.utils.tf_policy_graph import TFPolicyGraph
|
||||
|
||||
|
||||
# TensorFlow variable-scope names used when building the DDPG graph.
A_SCOPE = "a_func"  # action output / exploration-noise ops
P_SCOPE = "p_func"  # policy network
P_TARGET_SCOPE = "target_p_func"  # target policy network
Q_SCOPE = "q_func"  # Q network
Q_TARGET_SCOPE = "target_q_func"  # target Q network
|
||||
|
||||
|
||||
def _build_p_network(registry, inputs, dim_actions, config):
    """Build the policy network for the given observation tensor.

    Maps an observation (i.e., state) to an action where each entry
    takes a value in (0, 1) due to the final sigmoid activation.

    Args:
        registry: passed to ``ModelCatalog.get_model``.
        inputs: observation tensor fed to the catalog model.
        dim_actions: number of outputs of the final layer.
        config: dict providing ``"model"`` and ``"actor_hiddens"``.

    Returns:
        Tensor of shape [batch_size, dim_actions].
    """
    frontend = ModelCatalog.get_model(registry, inputs, 1, config["model"])

    net = frontend.last_layer
    for width in config["actor_hiddens"]:
        net = layers.fully_connected(
            net, num_outputs=width, activation_fn=tf.nn.relu)

    # Use sigmoid layer to bound values within (0, 1)
    return layers.fully_connected(
        net, num_outputs=dim_actions, activation_fn=tf.nn.sigmoid)
|
||||
|
||||
|
||||
def _build_action_network(p_values, low_action, high_action, stochastic, eps,
                          theta, sigma):
    """Scale policy outputs to the action range, optionally adding noise.

    Acts as a stochastic policy for inference but a deterministic policy
    for training; the batch-size issue is therefore ignored when the
    stochastic action is constructed.

    Args:
        p_values: policy outputs, shape [None, dim_action].
        low_action: array of per-dimension lower action bounds.
        high_action: array of per-dimension upper action bounds.
        stochastic: boolean tensor selecting the noisy branch.
        eps: scale applied to the exploration noise.
        theta: mean-reversion coefficient of the noise update.
        sigma: coefficient of the Gaussian sample in the noise update.

    Returns:
        Tensor of actions: noisy when ``stochastic`` is true, otherwise
        the deterministic rescaled policy output.
    """
    action_range = high_action - low_action
    # shape is [None, dim_action]
    deterministic_actions = action_range * p_values + low_action

    # Non-trainable variable holding the running noise state.
    noise_state = tf.get_variable(
        name="ornstein_uhlenbeck",
        dtype=tf.float32,
        initializer=low_action.size * [.0],
        trainable=False)
    gaussian = tf.random_normal(
        shape=[low_action.size], mean=0.0, stddev=1.0)
    noise_delta = theta * (.0 - noise_state) + sigma * gaussian
    # NOTE(review): this update op is created outside the tf.cond
    # branches, so it presumably also runs when `stochastic` is False.
    # Confirm against the TF1 tf.cond semantics.
    noise_value = tf.assign_add(noise_state, noise_delta)
    stochastic_actions = (
        deterministic_actions + eps * action_range * noise_value)

    return tf.cond(stochastic, lambda: stochastic_actions,
                   lambda: deterministic_actions)
|
||||
|
||||
|
||||
def _build_q_network(registry, inputs, action_inputs, config):
    """Build the Q network scoring (observation, action) pairs.

    Args:
        registry: passed to ``ModelCatalog.get_model``.
        inputs: observation tensor fed to the catalog model.
        action_inputs: action tensor concatenated (axis 1) with the
            catalog model's last layer.
        config: dict providing ``"model"`` and ``"critic_hiddens"``.

    Returns:
        Tensor with one linear output per row.
    """
    frontend = ModelCatalog.get_model(registry, inputs, 1, config["model"])

    layer_widths = config["critic_hiddens"]

    net = tf.concat([frontend.last_layer, action_inputs], axis=1)
    for width in layer_widths:
        net = layers.fully_connected(
            net, num_outputs=width, activation_fn=tf.nn.relu)

    return layers.fully_connected(net, num_outputs=1, activation_fn=None)
|
||||
|
||||
|
||||
class DDPGPolicyGraph(TFPolicyGraph):
|
||||
def __init__(self, observation_space, action_space, registry, config):
    """Build the DDPG graph and register it with ``TFPolicyGraph``.

    Args:
        observation_space: space whose ``shape`` sizes the observation
            placeholders.
        action_space: must be a ``Box``; its ``shape``, ``low`` and
            ``high`` define the action placeholder and action scaling.
        registry: passed through to the network builders.
        config: dict of DDPG settings read here (learning rates,
            exploration theta/sigma, gamma, n_step, use_huber,
            huber_threshold, l2_reg, tau).

    Raises:
        UnsupportedSpaceException: if ``action_space`` is not a ``Box``.
    """
    if not isinstance(action_space, Box):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DDPG.".format(
                action_space))

    self.config = config
    self.cur_epsilon = 1.0
    num_action_dims = action_space.shape[0]
    action_low = action_space.low
    action_high = action_space.high
    obs_batch_shape = (None, ) + observation_space.shape
    self.actor_optimizer = tf.train.AdamOptimizer(
        learning_rate=config["actor_lr"])
    self.critic_optimizer = tf.train.AdamOptimizer(
        learning_rate=config["critic_lr"])

    # Action inputs
    self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
    self.eps = tf.placeholder(tf.float32, (), name="eps")
    self.cur_observations = tf.placeholder(
        tf.float32, shape=obs_batch_shape)

    # Actor: P (policy) network
    with tf.variable_scope(P_SCOPE) as scope:
        sampled_p = _build_p_network(
            registry, self.cur_observations, num_action_dims, config)
        self.p_func_vars = _scope_vars(scope.name)

    # Action outputs
    with tf.variable_scope(A_SCOPE):
        self.output_actions = _build_action_network(
            sampled_p, action_low, action_high, self.stochastic, self.eps,
            config["exploration_theta"], config["exploration_sigma"])

    # Op that zeroes the noise variable created by the action network.
    with tf.variable_scope(A_SCOPE, reuse=True):
        noise_var = tf.get_variable(name="ornstein_uhlenbeck")
        self.reset_noise_op = tf.assign(noise_var, num_action_dims * [.0])

    # Replay inputs
    self.obs_t = tf.placeholder(
        tf.float32, shape=obs_batch_shape, name="observation")
    self.act_t = tf.placeholder(
        tf.float32, shape=(None, ) + action_space.shape, name="action")
    self.rew_t = tf.placeholder(tf.float32, [None], name="reward")
    self.obs_tp1 = tf.placeholder(tf.float32, shape=obs_batch_shape)
    self.done_mask = tf.placeholder(tf.float32, [None], name="done")
    self.importance_weights = tf.placeholder(
        tf.float32, [None], name="weight")

    # p network evaluation
    with tf.variable_scope(P_SCOPE, reuse=True) as scope:
        self.p_t = _build_p_network(
            registry, self.obs_t, num_action_dims, config)

    # target p network evaluation
    with tf.variable_scope(P_TARGET_SCOPE) as scope:
        target_p = _build_p_network(
            registry, self.obs_tp1, num_action_dims, config)
        target_p_vars = _scope_vars(scope.name)

    # Action outputs (deterministic, with zero noise scale)
    with tf.variable_scope(A_SCOPE, reuse=True):
        never_stochastic = tf.constant(value=False, dtype=tf.bool)
        no_noise = tf.constant(value=.0, dtype=tf.float32)
        actions_t = _build_action_network(
            self.p_t, action_low, action_high, never_stochastic,
            no_noise, config["exploration_theta"],
            config["exploration_sigma"])

        actions_tp1 = _build_action_network(
            target_p, action_low, action_high, never_stochastic,
            no_noise, config["exploration_theta"],
            config["exploration_sigma"])

    # q network evaluation
    with tf.variable_scope(Q_SCOPE) as scope:
        q_replay = _build_q_network(
            registry, self.obs_t, self.act_t, config)
        self.q_func_vars = _scope_vars(scope.name)
    with tf.variable_scope(Q_SCOPE, reuse=True):
        q_policy = _build_q_network(
            registry, self.obs_t, actions_t, config)

    # target q network evaluation
    with tf.variable_scope(Q_TARGET_SCOPE) as scope:
        q_target = _build_q_network(
            registry, self.obs_tp1, actions_tp1, config)
        target_q_vars = _scope_vars(scope.name)

    # Drop the trailing axis of the Q outputs.
    q_replay_flat = tf.squeeze(q_replay, axis=len(q_replay.shape) - 1)

    q_target_flat = tf.squeeze(
        input=q_target, axis=len(q_target.shape) - 1)
    q_target_masked = (1.0 - self.done_mask) * q_target_flat

    # compute RHS of bellman equation
    bellman_target = (
        self.rew_t + config["gamma"]**config["n_step"] * q_target_masked)

    # compute the error (potentially clipped)
    self.td_error = q_replay_flat - tf.stop_gradient(bellman_target)
    if config.get("use_huber"):
        per_sample_loss = _huber_loss(
            self.td_error, config.get("huber_threshold"))
    else:
        per_sample_loss = 0.5 * tf.square(self.td_error)

    self.loss = tf.reduce_mean(self.importance_weights * per_sample_loss)

    # for policy gradient
    self.actor_loss = -1.0 * tf.reduce_mean(q_policy)

    def _with_l2_penalty(loss, variables):
        # Add the L2 term of every variable whose name lacks "bias".
        for weight in variables:
            if "bias" not in weight.name:
                loss += config["l2_reg"] * 0.5 * tf.nn.l2_loss(weight)
        return loss

    if config["l2_reg"] is not None:
        self.actor_loss = _with_l2_penalty(
            self.actor_loss, self.p_func_vars)
        self.loss = _with_l2_penalty(self.loss, self.q_func_vars)

    # update_target_fn will be called periodically to copy Q network to
    # target Q network
    self.tau_value = config.get("tau")
    self.tau = tf.placeholder(tf.float32, (), name="tau")

    def _soft_update_ops(online_vars, target_vars):
        # Pair variables by sorted name and blend online into target.
        paired = zip(
            sorted(online_vars, key=lambda v: v.name),
            sorted(target_vars, key=lambda v: v.name))
        return [
            target.assign(self.tau * online + (1.0 - self.tau) * target)
            for online, target in paired]

    sync_ops = _soft_update_ops(self.q_func_vars, target_q_vars)
    sync_ops.extend(_soft_update_ops(self.p_func_vars, target_p_vars))
    self.update_target_expr = tf.group(*sync_ops)

    self.sess = tf.get_default_session()
    self.loss_inputs = [
        ("obs", self.obs_t),
        ("actions", self.act_t),
        ("rewards", self.rew_t),
        ("new_obs", self.obs_tp1),
        ("dones", self.done_mask),
        ("weights", self.importance_weights),
    ]
    self.is_training = tf.placeholder_with_default(True, ())
    TFPolicyGraph.__init__(
        self, self.sess, obs_input=self.cur_observations,
        action_sampler=self.output_actions, loss=self.loss,
        loss_inputs=self.loss_inputs, is_training=self.is_training)
    self.sess.run(tf.global_variables_initializer())

    # Note that this encompasses both the policy and Q-value networks and
    # their corresponding target networks
    self.variables = ray.experimental.TensorFlowVariables(
        tf.group(q_policy, q_target), self.sess)

    # Hard initial update
    self.update_target(tau=1.0)
|
||||
|
||||
def gradients(self, optimizer):
    """Return the actor and critic (gradient, variable) pairs as one list.

    The ``optimizer`` argument is not used; the actor and critic optimizers
    stored on ``self`` compute the gradients of ``self.actor_loss`` and
    ``self.loss`` respectively. When ``config["grad_norm_clipping"]`` is set,
    the gradients are computed through ``_minimize_and_clip`` with that
    clip value.

    Returns:
        Actor pairs followed by critic pairs, without the entries whose
        gradient is None.
    """
    clip_val = self.config["grad_norm_clipping"]
    if clip_val is None:
        actor_pairs = self.actor_optimizer.compute_gradients(
            self.actor_loss, var_list=self.p_func_vars)
        critic_pairs = self.critic_optimizer.compute_gradients(
            self.loss, var_list=self.q_func_vars)
    else:
        actor_pairs = _minimize_and_clip(
            self.actor_optimizer,
            self.actor_loss,
            var_list=self.p_func_vars,
            clip_val=clip_val)
        critic_pairs = _minimize_and_clip(
            self.critic_optimizer,
            self.loss,
            var_list=self.q_func_vars,
            clip_val=clip_val)

    # Variables that received no gradient are dropped from both lists.
    kept = []
    for pairs in (actor_pairs, critic_pairs):
        kept.extend((g, v) for (g, v) in pairs if g is not None)
    return kept
|
||||
|
||||
def extra_compute_action_feed_dict(self):
    """Return the extra placeholder feeds used when computing actions.

    The ``stochastic`` placeholder is always fed True and the ``eps``
    placeholder is fed the current epsilon stored on this object.
    """
    feeds = {}
    feeds[self.stochastic] = True
    feeds[self.eps] = self.cur_epsilon
    return feeds
|
||||
|
||||
def extra_compute_grad_fetches(self):
    """Return the extra fetches for gradient computation: the TD error."""
    fetches = {}
    fetches["td_error"] = self.td_error
    return fetches
|
||||
|
||||
def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
    """Post-process a sample batch by delegating to ``_postprocess_dqn``.

    ``other_agent_batches`` is accepted but not used.
    """
    processed = _postprocess_dqn(self, sample_batch)
    return processed
|
||||
|
||||
def compute_td_error(self, obs_t, act_t, rew_t, obs_tp1, done_mask,
                     importance_weights):
    """Evaluate the TD error tensor for one batch of transitions.

    Each observation in ``obs_t`` and ``obs_tp1`` is converted with
    ``np.array`` before being fed; the other arguments are fed unchanged
    to their placeholders.

    Returns:
        The result of running ``self.td_error`` in ``self.sess``.
    """
    feed = {}
    feed[self.obs_t] = [np.array(ob) for ob in obs_t]
    feed[self.act_t] = act_t
    feed[self.rew_t] = rew_t
    feed[self.obs_tp1] = [np.array(ob) for ob in obs_tp1]
    feed[self.done_mask] = done_mask
    feed[self.importance_weights] = importance_weights
    return self.sess.run(self.td_error, feed_dict=feed)
|
||||
|
||||
def reset_noise(self, sess):
    """Run the noise-reset op in the given session."""
    noise_reset = self.reset_noise_op
    sess.run(noise_reset)
|
||||
|
||||
# support both hard and soft sync
|
||||
def update_target(self, tau=None):
    """Run the target-network update op with the given mixing factor.

    Args:
        tau: Value fed to the ``tau`` placeholder. When None, the
            configured ``self.tau_value`` is used.

    Returns:
        The result of running ``self.update_target_expr`` in ``self.sess``.
    """
    # Compare against None explicitly: `tau or self.tau_value` would
    # silently replace an explicit tau=0.0 with the configured value.
    tau_to_feed = self.tau_value if tau is None else tau
    return self.sess.run(
        self.update_target_expr,
        feed_dict={self.tau: tau_to_feed})
|
||||
|
||||
def set_epsilon(self, epsilon):
    """Store ``epsilon`` as the current exploration epsilon."""
    setattr(self, "cur_epsilon", epsilon)
|
||||
|
||||
def get_weights(self):
    """Return the weights held by ``self.variables``."""
    weights = self.variables.get_weights()
    return weights
|
||||
|
||||
def set_weights(self, weights):
    """Load ``weights`` into ``self.variables``."""
    tracked_vars = self.variables
    tracked_vars.set_weights(weights)
|
||||
|
||||
def get_state(self):
    """Return ``[base TFPolicyGraph state, current epsilon]``."""
    base_state = TFPolicyGraph.get_state(self)
    return [base_state, self.cur_epsilon]
|
||||
|
||||
def set_state(self, state):
    """Restore state from a ``[base state, epsilon]`` pair.

    ``state[0]`` is handed to ``TFPolicyGraph.set_state`` and ``state[1]``
    becomes the current epsilon.
    """
    TFPolicyGraph.set_state(self, state[0])
    epsilon = state[1]
    self.set_epsilon(epsilon)
|
||||
@@ -1,391 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
|
||||
import tensorflow as tf
|
||||
import tensorflow.contrib.layers as layers
|
||||
|
||||
from ray.rllib.models import ModelCatalog
|
||||
|
||||
|
||||
def _build_p_network(registry, inputs, dim_actions, config):
    """Build the actor (policy) network.

    Maps an observation (i.e., state) to ``dim_actions`` outputs, each of
    which lies in (0, 1) because the final layer uses a sigmoid.
    """
    base_model = ModelCatalog.get_model(registry, inputs, 1, config["model"])

    net = base_model.last_layer
    for layer_size in config["actor_hiddens"]:
        net = layers.fully_connected(
            net, num_outputs=layer_size, activation_fn=tf.nn.relu)

    # The sigmoid output layer bounds every entry within (0, 1); the result
    # has shape [batch_size, dim_actions].
    return layers.fully_connected(
        net, num_outputs=dim_actions, activation_fn=tf.nn.sigmoid)
|
||||
|
||||
|
||||
# As a stochastic policy for inference, but a deterministic policy for training
|
||||
# thus ignore batch_size issue when constructing a stochastic action
|
||||
def _build_action_network(p_values, low_action, high_action, stochastic, eps,
                          theta, sigma):
    """Turn policy outputs into actions, optionally with exploration noise.

    The deterministic action rescales ``p_values`` from (0, 1) into
    [low_action, high_action]. The stochastic action adds
    ``eps * (high_action - low_action) * noise``, where the noise state is
    kept in the non-trainable ``ornstein_uhlenbeck`` variable and is advanced
    by ``theta * (0 - state) + sigma * N(0, 1)``.

    Returns:
        A ``tf.cond`` that selects the stochastic actions when ``stochastic``
        is true and the deterministic actions otherwise.
    """
    action_range = high_action - low_action
    # shape is [None, dim_action]
    plain_actions = action_range * p_values + low_action

    noise_state = tf.get_variable(
        name="ornstein_uhlenbeck",
        dtype=tf.float32,
        initializer=low_action.size * [.0],
        trainable=False)
    gaussian = tf.random_normal(
        shape=[low_action.size], mean=0.0, stddev=1.0)
    # NOTE(review): this update op is built outside the tf.cond branches, so
    # it presumably runs even when `stochastic` is False -- confirm.
    noise_value = tf.assign_add(
        noise_state,
        theta * (.0 - noise_state) + sigma * gaussian)
    noisy_actions = plain_actions + eps * action_range * noise_value

    return tf.cond(stochastic, lambda: noisy_actions,
                   lambda: plain_actions)
|
||||
|
||||
|
||||
def _build_q_network(registry, inputs, action_inputs, config):
    """Build the critic (Q) network.

    The last layer of the catalog model for ``inputs`` is concatenated with
    ``action_inputs`` along axis 1, passed through the ReLU layers listed in
    ``config["critic_hiddens"]``, and reduced to a single linear output.
    """
    base_model = ModelCatalog.get_model(registry, inputs, 1, config["model"])

    layer_sizes = config["critic_hiddens"]

    net = tf.concat([base_model.last_layer, action_inputs], axis=1)
    for layer_size in layer_sizes:
        net = layers.fully_connected(
            net, num_outputs=layer_size, activation_fn=tf.nn.relu)

    # One unit, no activation: the raw Q-value estimate.
    return layers.fully_connected(net, num_outputs=1, activation_fn=None)
|
||||
|
||||
|
||||
def _huber_loss(x, delta=1.0):
    """Element-wise Huber loss of ``x`` with threshold ``delta``.

    Reference: https://en.wikipedia.org/wiki/Huber_loss
    """
    within_delta = tf.abs(x) < delta
    quadratic_part = tf.square(x) * 0.5
    linear_part = delta * (tf.abs(x) - 0.5 * delta)
    return tf.where(within_delta, quadratic_part, linear_part)
|
||||
|
||||
|
||||
def _minimize_and_clip(optimizer, objective, var_list, clip_val=10):
|
||||
"""Minimized `objective` using `optimizer` w.r.t. variables in
|
||||
`var_list` while ensure the norm of the gradients for each
|
||||
variable is clipped to `clip_val`
|
||||
"""
|
||||
gradients = optimizer.compute_gradients(objective, var_list=var_list)
|
||||
for i, (grad, var) in enumerate(gradients):
|
||||
if grad is not None:
|
||||
gradients[i] = (tf.clip_by_norm(grad, clip_val), var)
|
||||
return gradients
|
||||
|
||||
|
||||
def _scope_vars(scope, trainable_only=False):
    """
    Get variables inside a scope
    The scope can be specified as a string

    Parameters
    ----------
    scope: str or VariableScope
        scope in which the variables reside.
    trainable_only: bool
        whether or not to return only the variables that were marked as
        trainable.

    Returns
    -------
    vars: [tf.Variable]
        list of variables in `scope`.
    """
    if trainable_only:
        collection = tf.GraphKeys.TRAINABLE_VARIABLES
    else:
        # GLOBAL_VARIABLES is the current name of the collection formerly
        # exposed through the deprecated tf.GraphKeys.VARIABLES alias.
        collection = tf.GraphKeys.GLOBAL_VARIABLES
    scope_name = scope if isinstance(scope, str) else scope.name
    return tf.get_collection(collection, scope=scope_name)
|
||||
|
||||
|
||||
class ModelAndLoss(object):
    """Holds the model and loss function.

    Both graphs are necessary in order for the multi-gpu SGD implementation
    to create towers on each device.

    After construction the object exposes the policy outputs (``p_t``,
    ``p_tp1``), the Q-value outputs (``q_t``, ``q_tp0``, ``q_tp1``), the
    variable lists (``q_func_vars``, ``target_p_func_vars``,
    ``target_q_func_vars``), and the ``td_error``, ``loss`` (critic) and
    ``actor_loss`` tensors.
    """

    def __init__(self, registry, dim_actions, low_action, high_action, config,
                 obs_t, act_t, rew_t, obs_tp1, done_mask, importance_weights):
        # p network evaluation
        # reuse=True: the "p_func" variables must already exist in the graph.
        with tf.variable_scope("p_func", reuse=True) as scope:
            self.p_t = _build_p_network(registry, obs_t, dim_actions, config)

        # target p network evaluation
        with tf.variable_scope("target_p_func") as scope:
            self.p_tp1 = _build_p_network(registry, obs_tp1, dim_actions,
                                          config)
            self.target_p_func_vars = _scope_vars(scope.name)

        # Action outputs
        # Built with a constant False `stochastic` flag and zero eps, so the
        # deterministic branch of the action network is selected.
        with tf.variable_scope("a_func", reuse=True):
            deterministic_flag = tf.constant(value=False, dtype=tf.bool)
            zero_eps = tf.constant(value=.0, dtype=tf.float32)
            output_actions = _build_action_network(
                self.p_t, low_action, high_action, deterministic_flag,
                zero_eps, config["exploration_theta"],
                config["exploration_sigma"])

            output_actions_estimated = _build_action_network(
                self.p_tp1, low_action, high_action, deterministic_flag,
                zero_eps, config["exploration_theta"],
                config["exploration_sigma"])

        # q network evaluation
        with tf.variable_scope("q_func") as scope:
            self.q_t = _build_q_network(registry, obs_t, act_t, config)
            self.q_func_vars = _scope_vars(scope.name)
        # Same Q network (shared variables), evaluated on the policy's own
        # actions; this output feeds the actor loss below.
        with tf.variable_scope("q_func", reuse=True):
            self.q_tp0 = _build_q_network(registry, obs_t, output_actions,
                                          config)

        # target q network evaluation
        with tf.variable_scope("target_q_func") as scope:
            self.q_tp1 = _build_q_network(registry, obs_tp1,
                                          output_actions_estimated, config)
            self.target_q_func_vars = _scope_vars(scope.name)

        # Drop the trailing axis of the Q outputs.
        q_t_selected = tf.squeeze(self.q_t, axis=len(self.q_t.shape) - 1)

        q_tp1_best = tf.squeeze(
            input=self.q_tp1, axis=len(self.q_tp1.shape) - 1)
        # Zero out the bootstrap value where done_mask is 1.
        q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = (
            rew_t + config["gamma"]**config["n_step"] * q_tp1_best_masked)

        # compute the error (potentially clipped)
        self.td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        if config.get("use_huber"):
            errors = _huber_loss(self.td_error, config.get("huber_threshold"))
        else:
            errors = 0.5 * tf.square(self.td_error)

        weighted_error = tf.reduce_mean(importance_weights * errors)

        self.loss = weighted_error

        # for policy gradient
        # Minimizing the negated mean Q-value maximizes Q under the policy.
        self.actor_loss = -1.0 * tf.reduce_mean(self.q_tp0)
|
||||
|
||||
|
||||
class DDPGGraph(object):
    """TensorFlow graph for DDPG: actor, critic, targets and train ops.

    The constructor builds the action-sampling network, the replay
    placeholders, the losses (through ``ModelAndLoss``), the gradient and
    train ops for the actor and critic, and the target-network update op.
    The remaining methods run those ops in a caller-supplied session.
    """

    def __init__(self, registry, env, config, logdir):
        """Build the graph for ``env`` from ``config``.

        ``logdir`` is accepted but not used in this constructor.
        """
        self.env = env
        dim_actions = env.action_space.shape[0]
        low_action = env.action_space.low
        high_action = env.action_space.high
        actor_optimizer = tf.train.AdamOptimizer(
            learning_rate=config["actor_lr"])
        critic_optimizer = tf.train.AdamOptimizer(
            learning_rate=config["critic_lr"])

        # Action inputs
        self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
        self.eps = tf.placeholder(tf.float32, (), name="eps")
        self.cur_observations = tf.placeholder(
            tf.float32, shape=(None, ) + env.observation_space.shape)

        # Actor: P (policy) network
        p_scope_name = "p_func"
        with tf.variable_scope(p_scope_name) as scope:
            p_values = _build_p_network(registry, self.cur_observations,
                                        dim_actions, config)
            p_func_vars = _scope_vars(scope.name)

        # Action outputs
        a_scope_name = "a_func"
        with tf.variable_scope(a_scope_name):
            self.output_actions = _build_action_network(
                p_values, low_action, high_action, self.stochastic, self.eps,
                config["exploration_theta"], config["exploration_sigma"])

        # Re-open the scope to fetch the noise variable created above and
        # build an op that resets it to zeros.
        with tf.variable_scope(a_scope_name, reuse=True):
            exploration_sample = tf.get_variable(name="ornstein_uhlenbeck")
            self.reset_noise_op = tf.assign(exploration_sample,
                                            dim_actions * [.0])

        # Replay inputs
        self.obs_t = tf.placeholder(
            tf.float32,
            shape=(None, ) + env.observation_space.shape,
            name="observation")
        self.act_t = tf.placeholder(
            tf.float32, shape=(None, ) + env.action_space.shape, name="action")
        self.rew_t = tf.placeholder(tf.float32, [None], name="reward")
        self.obs_tp1 = tf.placeholder(
            tf.float32, shape=(None, ) + env.observation_space.shape)
        self.done_mask = tf.placeholder(tf.float32, [None], name="done")
        self.importance_weights = tf.placeholder(
            tf.float32, [None], name="weight")

        def build_loss(obs_t, act_t, rew_t, obs_tp1, done_mask,
                       importance_weights):
            return ModelAndLoss(registry, dim_actions, low_action, high_action,
                                config, obs_t, act_t, rew_t, obs_tp1,
                                done_mask, importance_weights)

        self.loss_inputs = [
            ("obs", self.obs_t),
            ("actions", self.act_t),
            ("rewards", self.rew_t),
            ("new_obs", self.obs_tp1),
            ("dones", self.done_mask),
            ("weights", self.importance_weights),
        ]

        loss_obj = build_loss(self.obs_t, self.act_t, self.rew_t, self.obs_tp1,
                              self.done_mask, self.importance_weights)

        # Kept so the loss can be rebuilt on other placeholders later.
        self.build_loss = build_loss

        actor_loss = loss_obj.actor_loss
        weighted_error = loss_obj.loss
        q_func_vars = loss_obj.q_func_vars
        target_p_func_vars = loss_obj.target_p_func_vars
        target_q_func_vars = loss_obj.target_q_func_vars
        self.p_t = loss_obj.p_t
        self.q_t = loss_obj.q_t
        self.q_tp0 = loss_obj.q_tp0
        self.q_tp1 = loss_obj.q_tp1
        self.td_error = loss_obj.td_error

        # Optional L2 regularization on every non-bias variable.
        if config["l2_reg"] is not None:
            for var in p_func_vars:
                if "bias" not in var.name:
                    actor_loss += config["l2_reg"] * 0.5 * tf.nn.l2_loss(var)
            for var in q_func_vars:
                if "bias" not in var.name:
                    weighted_error += config["l2_reg"] * 0.5 * tf.nn.l2_loss(
                        var)

        # compute optimization op (potentially with gradient clipping)
        if config["grad_norm_clipping"] is not None:
            self.actor_grads_and_vars = _minimize_and_clip(
                actor_optimizer,
                actor_loss,
                var_list=p_func_vars,
                clip_val=config["grad_norm_clipping"])
            self.critic_grads_and_vars = _minimize_and_clip(
                critic_optimizer,
                weighted_error,
                var_list=q_func_vars,
                clip_val=config["grad_norm_clipping"])
        else:
            self.actor_grads_and_vars = actor_optimizer.compute_gradients(
                actor_loss, var_list=p_func_vars)
            self.critic_grads_and_vars = critic_optimizer.compute_gradients(
                weighted_error, var_list=q_func_vars)
        # Drop variables that received no gradient.
        self.actor_grads_and_vars = [(g, v)
                                     for (g, v) in self.actor_grads_and_vars
                                     if g is not None]
        self.critic_grads_and_vars = [(g, v)
                                      for (g, v) in self.critic_grads_and_vars
                                      if g is not None]
        self.grads_and_vars = (
            self.actor_grads_and_vars + self.critic_grads_and_vars)
        self.grads = [g for (g, v) in self.grads_and_vars]
        self.actor_train_expr = actor_optimizer.apply_gradients(
            self.actor_grads_and_vars)
        self.critic_train_expr = critic_optimizer.apply_gradients(
            self.critic_grads_and_vars)

        # update_target_fn will be called periodically to copy Q network to
        # target Q network
        self.tau_value = config.get("tau")
        self.tau = tf.placeholder(tf.float32, (), name="tau")
        update_target_expr = []
        # Variables are paired by sorted name; each target variable becomes
        # tau * var + (1 - tau) * var_target.
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(
                var_target.assign(self.tau * var +
                                  (1.0 - self.tau) * var_target))
        for var, var_target in zip(
                sorted(p_func_vars, key=lambda v: v.name),
                sorted(target_p_func_vars, key=lambda v: v.name)):
            update_target_expr.append(
                var_target.assign(self.tau * var +
                                  (1.0 - self.tau) * var_target))
        self.update_target_expr = tf.group(*update_target_expr)

    # support both hard and soft sync
    def update_target(self, sess, tau=None):
        """Run the target update op; ``tau=None`` uses the configured tau."""
        # Compare against None explicitly: `tau or self.tau_value` would
        # silently replace an explicit tau=0.0 with the configured value.
        tau_to_feed = self.tau_value if tau is None else tau
        return sess.run(
            self.update_target_expr,
            feed_dict={self.tau: tau_to_feed})

    def act(self, sess, obs, eps, stochastic=True):
        """Compute actions for ``obs`` with the given ``eps`` and flag."""
        return sess.run(
            self.output_actions,
            feed_dict={
                self.cur_observations: obs,
                self.stochastic: stochastic,
                self.eps: eps
            })

    def compute_gradients(self, sess, obs_t, act_t, rew_t, obs_tp1, done_mask,
                          importance_weights):
        """Return ``(td_error, grads)`` for one batch of transitions."""
        td_err, grads = sess.run(
            [self.td_error, self.grads],
            feed_dict={
                self.obs_t: obs_t,
                self.act_t: act_t,
                self.rew_t: rew_t,
                self.obs_tp1: obs_tp1,
                self.done_mask: done_mask,
                self.importance_weights: importance_weights
            })
        return td_err, grads

    def compute_td_error(self, sess, obs_t, act_t, rew_t, obs_tp1, done_mask,
                         importance_weights):
        """Return the TD error for one batch of transitions.

        Each observation is converted with ``np.array`` before being fed.
        """
        td_err = sess.run(
            self.td_error,
            feed_dict={
                self.obs_t: [np.array(ob) for ob in obs_t],
                self.act_t: act_t,
                self.rew_t: rew_t,
                self.obs_tp1: [np.array(ob) for ob in obs_tp1],
                self.done_mask: done_mask,
                self.importance_weights: importance_weights
            })
        return td_err

    def apply_gradients(self, sess, grads):
        """Apply externally computed ``grads`` with both optimizers.

        ``grads`` must line up one-to-one with ``self.grads``; each value is
        fed in place of the corresponding gradient tensor.
        """
        assert len(grads) == len(self.grads_and_vars)
        feed_dict = {ph: g for (g, ph) in zip(grads, self.grads)}
        sess.run(
            [self.critic_train_expr, self.actor_train_expr],
            feed_dict=feed_dict)

    def compute_apply(self, sess, obs_t, act_t, rew_t, obs_tp1, done_mask,
                      importance_weights):
        """Run one critic and actor train step; return the TD error."""
        td_err, _, _ = sess.run(
            [self.td_error, self.critic_train_expr, self.actor_train_expr],
            feed_dict={
                self.obs_t: obs_t,
                self.act_t: act_t,
                self.rew_t: rew_t,
                self.obs_tp1: obs_tp1,
                self.done_mask: done_mask,
                self.importance_weights: importance_weights
            })
        return td_err

    def reset_noise(self, sess):
        """Reset the exploration noise variable to zeros."""
        sess.run(self.reset_noise_op)
|
||||
@@ -9,7 +9,7 @@ from ray.rllib.ddpg2.models import DDPGModel
|
||||
from ray.rllib.models.catalog import ModelCatalog
|
||||
from ray.rllib.optimizers import PolicyEvaluator
|
||||
from ray.rllib.utils.filter import NoFilter
|
||||
from ray.rllib.utils.process_rollout import process_rollout
|
||||
from ray.rllib.utils.process_rollout import compute_advantages
|
||||
from ray.rllib.utils.sampler import SyncSampler
|
||||
|
||||
|
||||
@@ -34,9 +34,7 @@ class DDPGEvaluator(PolicyEvaluator):
|
||||
|
||||
# since each sample is one step, no discounting needs to be applied;
|
||||
# this does not involve config["gamma"]
|
||||
samples = process_rollout(
|
||||
rollout, NoFilter(),
|
||||
gamma=1.0, use_gae=False)
|
||||
samples = compute_advantages(rollout, 0.0, gamma=1.0, use_gae=False)
|
||||
|
||||
return samples
|
||||
|
||||
|
||||
@@ -227,7 +227,7 @@ class DDPGActorCritic():
|
||||
self.critic_vars.set_weights(critic_weights)
|
||||
self.actor_vars.set_weights(actor_weights)
|
||||
|
||||
def compute(self, ob):
|
||||
def compute_single_action(self, ob, h, is_training):
|
||||
"""Returns action, given state."""
|
||||
flattened_ob = np.reshape(ob, [-1, np.prod(ob.shape)])
|
||||
action = self.sess.run(self.output_action, {self.obs: flattened_ob})
|
||||
@@ -235,7 +235,10 @@ class DDPGActorCritic():
|
||||
action += self.epsilon * self.rand_process.sample()
|
||||
if (self.epsilon > 0):
|
||||
self.epsilon -= self.config["noise_epsilon"]
|
||||
return action[0], {}
|
||||
return action[0], [], {}
|
||||
|
||||
def value(self, *args):
    """Return a constant value estimate of 0, ignoring all arguments."""
    constant_value = 0
    return constant_value
|
||||
|
||||
def get_initial_state(self):
    """Return the initial state: a new empty list."""
    return list()
|
||||
|
||||
@@ -9,26 +9,26 @@ from ray.utils import merge_dicts
|
||||
APEX_DEFAULT_CONFIG = merge_dicts(
|
||||
DQN_CONFIG,
|
||||
{
|
||||
'optimizer_class': 'ApexOptimizer',
|
||||
'optimizer_config':
|
||||
"optimizer_class": "ApexOptimizer",
|
||||
"optimizer_config":
|
||||
merge_dicts(
|
||||
DQN_CONFIG['optimizer_config'], {
|
||||
'max_weight_sync_delay': 400,
|
||||
'num_replay_buffer_shards': 4,
|
||||
'debug': False
|
||||
DQN_CONFIG["optimizer_config"], {
|
||||
"max_weight_sync_delay": 400,
|
||||
"num_replay_buffer_shards": 4,
|
||||
"debug": False
|
||||
}),
|
||||
'n_step': 3,
|
||||
'gpu': True,
|
||||
'num_workers': 32,
|
||||
'buffer_size': 2000000,
|
||||
'learning_starts': 50000,
|
||||
'train_batch_size': 512,
|
||||
'sample_batch_size': 50,
|
||||
'max_weight_sync_delay': 400,
|
||||
'target_network_update_freq': 500000,
|
||||
'timesteps_per_iteration': 25000,
|
||||
'per_worker_exploration': True,
|
||||
'worker_side_prioritization': True,
|
||||
"n_step": 3,
|
||||
"gpu": True,
|
||||
"num_workers": 32,
|
||||
"buffer_size": 2000000,
|
||||
"learning_starts": 50000,
|
||||
"train_batch_size": 512,
|
||||
"sample_batch_size": 50,
|
||||
"max_weight_sync_delay": 400,
|
||||
"target_network_update_freq": 500000,
|
||||
"timesteps_per_iteration": 25000,
|
||||
"per_worker_exploration": True,
|
||||
"worker_side_prioritization": True,
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
+97
-117
@@ -5,14 +5,13 @@ from __future__ import print_function
|
||||
import pickle
|
||||
import os
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
|
||||
import ray
|
||||
from ray.rllib import optimizers
|
||||
from ray.rllib.dqn.dqn_evaluator import DQNEvaluator
|
||||
from ray.rllib.dqn.common.schedules import ConstantSchedule, LinearSchedule
|
||||
from ray.rllib.dqn.dqn_policy_graph import DQNPolicyGraph
|
||||
from ray.rllib.utils.common_policy_evaluator import CommonPolicyEvaluator, \
|
||||
collect_metrics
|
||||
from ray.rllib.agent import Agent
|
||||
from ray.tune.result import TrainingResult
|
||||
from ray.tune.trial import Resources
|
||||
|
||||
|
||||
@@ -24,101 +23,84 @@ OPTIMIZER_SHARED_CONFIGS = [
|
||||
DEFAULT_CONFIG = {
|
||||
# === Model ===
|
||||
# Whether to use dueling dqn
|
||||
'dueling': True,
|
||||
"dueling": True,
|
||||
# Whether to use double dqn
|
||||
'double_q': True,
|
||||
"double_q": True,
|
||||
# Hidden layer sizes of the state and action value networks
|
||||
'hiddens': [256],
|
||||
"hiddens": [256],
|
||||
# N-step Q learning
|
||||
'n_step': 1,
|
||||
"n_step": 1,
|
||||
# Config options to pass to the model constructor
|
||||
'model': {},
|
||||
"model": {},
|
||||
# Discount factor for the MDP
|
||||
'gamma': 0.99,
|
||||
"gamma": 0.99,
|
||||
# Arguments to pass to the env creator
|
||||
'env_config': {},
|
||||
"env_config": {},
|
||||
|
||||
# === Exploration ===
|
||||
# Max num timesteps for annealing schedules. Exploration is annealed from
|
||||
# 1.0 to exploration_final_eps over this number of timesteps scaled by
|
||||
# exploration_fraction
|
||||
'schedule_max_timesteps': 100000,
|
||||
"schedule_max_timesteps": 100000,
|
||||
# Number of env steps to optimize for before returning
|
||||
'timesteps_per_iteration': 1000,
|
||||
"timesteps_per_iteration": 1000,
|
||||
# Fraction of entire training period over which the exploration rate is
|
||||
# annealed
|
||||
'exploration_fraction': 0.1,
|
||||
"exploration_fraction": 0.1,
|
||||
# Final value of random action probability
|
||||
'exploration_final_eps': 0.02,
|
||||
"exploration_final_eps": 0.02,
|
||||
# Update the target network every `target_network_update_freq` steps.
|
||||
'target_network_update_freq': 500,
|
||||
# Whether to start with random actions instead of noops.
|
||||
'random_starts': True,
|
||||
"target_network_update_freq": 500,
|
||||
|
||||
# === Replay buffer ===
|
||||
# Size of the replay buffer. Note that if async_updates is set, then
|
||||
# each worker will have a replay buffer of this size.
|
||||
'buffer_size': 50000,
|
||||
"buffer_size": 50000,
|
||||
# If True prioritized replay buffer will be used.
|
||||
'prioritized_replay': True,
|
||||
"prioritized_replay": True,
|
||||
# Alpha parameter for prioritized replay buffer.
|
||||
'prioritized_replay_alpha': 0.6,
|
||||
"prioritized_replay_alpha": 0.6,
|
||||
# Beta parameter for sampling from prioritized replay buffer.
|
||||
'prioritized_replay_beta': 0.4,
|
||||
"prioritized_replay_beta": 0.4,
|
||||
# Epsilon to add to the TD errors when updating priorities.
|
||||
'prioritized_replay_eps': 1e-6,
|
||||
"prioritized_replay_eps": 1e-6,
|
||||
# Whether to clip rewards to [-1, 1] prior to adding to the replay buffer.
|
||||
'clip_rewards': True,
|
||||
"clip_rewards": True,
|
||||
|
||||
# === Optimization ===
|
||||
# Learning rate for adam optimizer
|
||||
'lr': 5e-4,
|
||||
"lr": 5e-4,
|
||||
# If not None, clip gradients during optimization at this value
|
||||
'grad_norm_clipping': 40,
|
||||
"grad_norm_clipping": 40,
|
||||
# How many steps of the model to sample before learning starts.
|
||||
'learning_starts': 1000,
|
||||
"learning_starts": 1000,
|
||||
# Update the replay buffer with this many samples at once. Note that
|
||||
# this setting applies per-worker if num_workers > 1.
|
||||
'sample_batch_size': 4,
|
||||
"sample_batch_size": 4,
|
||||
# Size of a batch sampled from the replay buffer for training. Note that
|
||||
# if async_updates is set, then each worker returns gradients for a
|
||||
# batch of this size.
|
||||
'train_batch_size': 32,
|
||||
# Smooth the current average reward over this many previous episodes.
|
||||
'smoothing_num_episodes': 100,
|
||||
|
||||
# === Tensorflow ===
|
||||
# Arguments to pass to tensorflow
|
||||
'tf_session_args': {
|
||||
"device_count": {"CPU": 2},
|
||||
"log_device_placement": False,
|
||||
"allow_soft_placement": True,
|
||||
"gpu_options": {
|
||||
"allow_growth": True
|
||||
},
|
||||
"inter_op_parallelism_threads": 1,
|
||||
"intra_op_parallelism_threads": 1,
|
||||
},
|
||||
"train_batch_size": 32,
|
||||
|
||||
# === Parallelism ===
|
||||
# Whether to use a GPU for local optimization.
|
||||
'gpu': False,
|
||||
"gpu": False,
|
||||
# Number of workers for collecting samples with. This only makes sense
|
||||
# to increase if your environment is particularly slow to sample, or if
|
||||
# you're using the Async or Ape-X optimizers.
|
||||
'num_workers': 0,
|
||||
# you're using the Async or Ape-X optimizers.
|
||||
"num_workers": 0,
|
||||
# Whether to allocate GPUs for workers (if > 0).
|
||||
'num_gpus_per_worker': 0,
|
||||
"num_gpus_per_worker": 0,
|
||||
# Whether to allocate CPUs for workers (if > 0).
|
||||
'num_cpus_per_worker': 1,
|
||||
"num_cpus_per_worker": 1,
|
||||
# Optimizer class to use.
|
||||
'optimizer_class': "LocalSyncReplayOptimizer",
|
||||
"optimizer_class": "LocalSyncReplayOptimizer",
|
||||
# Config to pass to the optimizer.
|
||||
'optimizer_config': {},
|
||||
"optimizer_config": {},
|
||||
# Whether to use a distribution of epsilons across workers for exploration.
|
||||
'per_worker_exploration': False,
|
||||
"per_worker_exploration": False,
|
||||
# Whether to compute priorities on workers.
|
||||
'worker_side_prioritization': False
|
||||
"worker_side_prioritization": False
|
||||
}
|
||||
|
||||
|
||||
@@ -127,6 +109,7 @@ class DQNAgent(Agent):
|
||||
_allow_unknown_subkeys = [
|
||||
"model", "optimizer", "tf_session_args", "env_config"]
|
||||
_default_config = DEFAULT_CONFIG
|
||||
_policy_graph = DQNPolicyGraph
|
||||
|
||||
@classmethod
|
||||
def default_resource_request(cls, config):
|
||||
@@ -137,16 +120,31 @@ class DQNAgent(Agent):
|
||||
extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])
|
||||
|
||||
def _init(self):
|
||||
self.local_evaluator = DQNEvaluator(
|
||||
self.registry, self.env_creator, self.config, self.logdir, 0)
|
||||
remote_cls = ray.remote(
|
||||
adjusted_batch_size = (
|
||||
self.config["sample_batch_size"] + self.config["n_step"] - 1)
|
||||
self.local_evaluator = CommonPolicyEvaluator(
|
||||
self.env_creator, self._policy_graph,
|
||||
batch_steps=adjusted_batch_size,
|
||||
batch_mode="pack_episodes", preprocessor_pref="deepmind",
|
||||
compress_observations=True,
|
||||
registry=self.registry, env_config=self.config["env_config"],
|
||||
model_config=self.config["model"], policy_config=self.config)
|
||||
remote_cls = CommonPolicyEvaluator.as_remote(
|
||||
num_cpus=self.config["num_cpus_per_worker"],
|
||||
num_gpus=self.config["num_gpus_per_worker"])(
|
||||
DQNEvaluator)
|
||||
num_gpus=self.config["num_gpus_per_worker"])
|
||||
self.remote_evaluators = [
|
||||
remote_cls.remote(
|
||||
self.registry, self.env_creator, self.config, self.logdir,
|
||||
i)
|
||||
self.env_creator, self._policy_graph,
|
||||
batch_steps=adjusted_batch_size,
|
||||
batch_mode="pack_episodes", preprocessor_pref="deepmind",
|
||||
compress_observations=True,
|
||||
registry=self.registry, env_config=self.config["env_config"],
|
||||
model_config=self.config["model"], policy_config=self.config)
|
||||
for _ in range(self.config["num_workers"])]
|
||||
|
||||
self.exploration0 = self._make_exploration_schedule(0)
|
||||
self.explorations = [
|
||||
self._make_exploration_schedule(i)
|
||||
for i in range(self.config["num_workers"])]
|
||||
|
||||
for k in OPTIMIZER_SHARED_CONFIGS:
|
||||
@@ -157,10 +155,25 @@ class DQNAgent(Agent):
|
||||
self.config["optimizer_config"], self.local_evaluator,
|
||||
self.remote_evaluators)
|
||||
|
||||
self.saver = tf.train.Saver(max_to_keep=None)
|
||||
self.last_target_update_ts = 0
|
||||
self.num_target_updates = 0
|
||||
|
||||
def _make_exploration_schedule(self, worker_index):
|
||||
# Use either a different `eps` per worker, or a linear schedule.
|
||||
if self.config["per_worker_exploration"]:
|
||||
assert self.config["num_workers"] > 1, \
|
||||
"This requires multiple workers"
|
||||
return ConstantSchedule(
|
||||
0.4 ** (
|
||||
1 + worker_index / float(
|
||||
self.config["num_workers"] - 1) * 7))
|
||||
return LinearSchedule(
|
||||
schedule_timesteps=int(
|
||||
self.config["exploration_fraction"] *
|
||||
self.config["schedule_max_timesteps"]),
|
||||
initial_p=1.0,
|
||||
final_p=self.config["exploration_final_eps"])
|
||||
|
||||
@property
def global_timestep(self):
    """Number of steps sampled so far, as reported by the optimizer."""
    steps_sampled = self.optimizer.num_steps_sampled
    return steps_sampled
|
||||
@@ -168,7 +181,7 @@ class DQNAgent(Agent):
|
||||
def update_target_if_needed(self):
|
||||
if self.global_timestep - self.last_target_update_ts > \
|
||||
self.config["target_network_update_freq"]:
|
||||
self.local_evaluator.update_target()
|
||||
self.local_evaluator.for_policy(lambda p: p.update_target())
|
||||
self.last_target_update_ts = self.global_timestep
|
||||
self.num_target_updates += 1
|
||||
|
||||
@@ -177,58 +190,25 @@ class DQNAgent(Agent):
|
||||
|
||||
while (self.global_timestep - start_timestep <
|
||||
self.config["timesteps_per_iteration"]):
|
||||
|
||||
self.optimizer.step()
|
||||
self.update_target_if_needed()
|
||||
|
||||
self.local_evaluator.set_global_timestep(self.global_timestep)
|
||||
for e in self.remote_evaluators:
|
||||
e.set_global_timestep.remote(self.global_timestep)
|
||||
exp_vals = [self.exploration0.value(self.global_timestep)]
|
||||
self.local_evaluator.for_policy(
|
||||
lambda p: p.set_epsilon(exp_vals[0]))
|
||||
for i, e in enumerate(self.remote_evaluators):
|
||||
exp_val = self.explorations[i].value(self.global_timestep)
|
||||
e.for_policy.remote(lambda p: p.set_epsilon(exp_val))
|
||||
exp_vals.append(exp_val)
|
||||
|
||||
return self._train_stats(start_timestep)
|
||||
|
||||
def _train_stats(self, start_timestep):
|
||||
if self.remote_evaluators:
|
||||
stats = ray.get([
|
||||
e.stats.remote() for e in self.remote_evaluators])
|
||||
else:
|
||||
stats = self.local_evaluator.stats()
|
||||
if not isinstance(stats, list):
|
||||
stats = [stats]
|
||||
|
||||
mean_100ep_reward = 0.0
|
||||
mean_100ep_length = 0.0
|
||||
num_episodes = 0
|
||||
explorations = []
|
||||
|
||||
if self.config["per_worker_exploration"]:
|
||||
# Return stats from workers with the lowest 20% of exploration
|
||||
test_stats = stats[-int(max(1, len(stats)*0.2)):]
|
||||
else:
|
||||
test_stats = stats
|
||||
|
||||
for s in test_stats:
|
||||
mean_100ep_reward += s["mean_100ep_reward"] / len(test_stats)
|
||||
mean_100ep_length += s["mean_100ep_length"] / len(test_stats)
|
||||
|
||||
for s in stats:
|
||||
num_episodes += s["num_episodes"]
|
||||
explorations.append(s["exploration"])
|
||||
|
||||
opt_stats = self.optimizer.stats()
|
||||
|
||||
result = TrainingResult(
|
||||
episode_reward_mean=mean_100ep_reward,
|
||||
episode_len_mean=mean_100ep_length,
|
||||
episodes_total=num_episodes,
|
||||
timesteps_this_iter=self.global_timestep - start_timestep,
|
||||
result = collect_metrics(
|
||||
self.local_evaluator, self.remote_evaluators)
|
||||
return result._replace(
|
||||
info=dict({
|
||||
"min_exploration": min(explorations),
|
||||
"max_exploration": max(explorations),
|
||||
"min_exploration": min(exp_vals),
|
||||
"max_exploration": max(exp_vals),
|
||||
"num_target_updates": self.num_target_updates,
|
||||
}, **opt_stats))
|
||||
|
||||
return result
|
||||
}, **self.optimizer.stats()))
|
||||
|
||||
def _stop(self):
|
||||
# workaround for https://github.com/ray-project/ray/issues/1516
|
||||
@@ -236,10 +216,8 @@ class DQNAgent(Agent):
|
||||
ev.__ray_terminate__.remote()
|
||||
|
||||
def _save(self, checkpoint_dir):
|
||||
checkpoint_path = self.saver.save(
|
||||
self.local_evaluator.sess,
|
||||
os.path.join(checkpoint_dir, "checkpoint"),
|
||||
global_step=self.iteration)
|
||||
checkpoint_path = os.path.join(
|
||||
checkpoint_dir, "checkpoint-{}".format(self.iteration))
|
||||
extra_data = [
|
||||
self.local_evaluator.save(),
|
||||
ray.get([e.save.remote() for e in self.remote_evaluators]),
|
||||
@@ -250,7 +228,6 @@ class DQNAgent(Agent):
|
||||
return checkpoint_path
|
||||
|
||||
def _restore(self, checkpoint_path):
|
||||
self.saver.restore(self.local_evaluator.sess, checkpoint_path)
|
||||
extra_data = pickle.load(open(checkpoint_path + ".extra_data", "rb"))
|
||||
self.local_evaluator.restore(extra_data[0])
|
||||
ray.get([
|
||||
@@ -260,6 +237,9 @@ class DQNAgent(Agent):
|
||||
self.num_target_updates = extra_data[3]
|
||||
self.last_target_update_ts = extra_data[4]
|
||||
|
||||
def compute_action(self, observation):
    """Return the action chosen by the local evaluator's DQN graph.

    The observation is converted to an array and given a leading batch
    axis of size one; ``act`` is called with an epsilon of 0.0 and the
    first element of its result is returned.

    Args:
        observation: A single observation accepted by ``np.array``.
    """
    return self.local_evaluator.dqn_graph.act(
        self.local_evaluator.sess, np.array(observation)[None], 0.0)[0]
|
||||
def compute_action(self, observation, state=None):
    """Compute a single action with the local evaluator's policy.

    Args:
        observation: Observation forwarded to ``compute_single_action``.
        state: Optional state forwarded alongside the observation; an
            empty list is used when omitted.

    Returns:
        The first element of the value returned by the policy's
        ``compute_single_action`` (called with ``is_training=False``).
    """
    if state is None:
        state = []
    return self.local_evaluator.for_policy(
        lambda p: p.compute_single_action(
            observation, state, is_training=False)[0])
|
||||
|
||||
@@ -1,207 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from gym.spaces import Discrete
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
|
||||
import ray
|
||||
from ray.rllib.utils.error import UnsupportedSpaceException
|
||||
from ray.rllib.dqn import models
|
||||
from ray.rllib.dqn.common.wrappers import wrap_dqn
|
||||
from ray.rllib.dqn.common.schedules import ConstantSchedule, LinearSchedule
|
||||
from ray.rllib.optimizers import SampleBatch, PolicyEvaluator
|
||||
from ray.rllib.utils.compression import pack
|
||||
|
||||
|
||||
def adjust_nstep(n_step, gamma, obs, actions, rewards, new_obs, dones):
    """Rewrites the given trajectory fragments to encode n-step rewards.

    reward[i] = (
        reward[i] * gamma**0 +
        reward[i+1] * gamma**1 +
        ... +
        reward[i+n_step-1] * gamma**(n_step-1))

    The ith new_obs is also adjusted to point to the (i+n_step-1)'th new obs.

    If the episode finishes, the reward will be truncated and the done flag
    of the terminating step is copied onto step i, so that the rewritten
    transition is itself marked terminal. After this rewrite, all the
    arrays will be shortened by (n_step - 1).

    All list arguments are modified in place; nothing is returned.
    """
    for i in range(len(rewards) - n_step + 1):
        if dones[i]:
            continue  # episode end
        for j in range(1, n_step):
            new_obs[i] = new_obs[i + j]
            rewards[i] += gamma ** j * rewards[i + j]
            if dones[i + j]:
                # The window hit an episode end: the n-step transition is
                # terminal, otherwise new_obs[i] would be bootstrapped from.
                dones[i] = dones[i + j]
                break  # episode end
    # truncate ends of the trajectory (never let the cut-off go negative,
    # which would trim from the tail instead of emptying the arrays)
    new_len = max(0, len(obs) - n_step + 1)
    for arr in [obs, actions, rewards, new_obs, dones]:
        del arr[new_len:]
|
||||
|
||||
|
||||
class DQNEvaluator(PolicyEvaluator):
    """The DQN Evaluator.

    Owns a wrapped environment, a TensorFlow session and a ``DQNGraph``,
    steps the environment to produce sample batches, and exposes the
    gradient / weight operations of the graph.

    TODO(rliaw): Support observation/reward filters?"""

    def __init__(self, registry, env_creator, config, logdir, worker_index):
        """Build the environment, session, graph and exploration schedule.

        Raises:
            UnsupportedSpaceException: if the wrapped environment's action
                space is not ``Discrete``.
        """
        env = env_creator(config["env_config"])
        env = wrap_dqn(registry, env, config["model"], config["random_starts"])
        self.env = env
        self.config = config

        if not isinstance(env.action_space, Discrete):
            raise UnsupportedSpaceException(
                "Action space {} is not supported for DQN.".format(
                    env.action_space))

        tf_config = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=tf_config)
        self.dqn_graph = models.DQNGraph(registry, env, config, logdir)

        # Use either a different `eps` per worker, or a linear schedule.
        if config["per_worker_exploration"]:
            assert config["num_workers"] > 1, "This requires multiple workers"
            self.exploration = ConstantSchedule(
                0.4 ** (
                    1 + worker_index / float(config["num_workers"] - 1) * 7))
        else:
            self.exploration = LinearSchedule(
                schedule_timesteps=int(
                    config["exploration_fraction"] *
                    config["schedule_max_timesteps"]),
                initial_p=1.0,
                final_p=config["exploration_final_eps"])

        # Initialize the parameters and copy them to the target network.
        self.sess.run(tf.global_variables_initializer())
        self.dqn_graph.update_target(self.sess)
        self.global_timestep = 0
        self.local_timestep = 0

        # Note that this encompasses both the Q and target network
        self.variables = ray.experimental.TensorFlowVariables(
            tf.group(self.dqn_graph.q_t, self.dqn_graph.q_tp1), self.sess)

        # The last entry of each list tracks the episode still in progress.
        self.episode_rewards = [0.0]
        self.episode_lengths = [0.0]
        self.saved_mean_reward = None

        self.obs = self.env.reset()

    def set_global_timestep(self, global_timestep):
        """Record the global timestep used to evaluate the exploration."""
        self.global_timestep = global_timestep

    def update_target(self):
        """Run the graph's target-network update in this session."""
        self.dqn_graph.update_target(self.sess)

    def sample(self):
        """Step the environment and return one ``SampleBatch``.

        ``sample_batch_size + n_step - 1`` steps are taken so that, after
        the n-step rewrite truncates the trajectory, the batch holds
        exactly ``sample_batch_size`` rows (asserted below).
        """
        obs, actions, rewards, new_obs, dones = [], [], [], [], []
        for _ in range(
                self.config["sample_batch_size"] + self.config["n_step"] - 1):
            ob, act, rew, ob1, done = self._step(self.global_timestep)
            obs.append(ob)
            actions.append(act)
            rewards.append(rew)
            new_obs.append(ob1)
            dones.append(done)

        # N-step Q adjustments
        if self.config["n_step"] > 1:
            # Adjust for steps lost from truncation
            self.local_timestep -= (self.config["n_step"] - 1)
            adjust_nstep(
                self.config["n_step"], self.config["gamma"],
                obs, actions, rewards, new_obs, dones)

        batch = SampleBatch({
            "obs": [pack(np.array(o)) for o in obs], "actions": actions,
            "rewards": rewards,
            "new_obs": [pack(np.array(o)) for o in new_obs], "dones": dones,
            "weights": np.ones_like(rewards)})
        assert (batch.count == self.config["sample_batch_size"])

        # Prioritize on the worker side
        if self.config["worker_side_prioritization"]:
            # The unpacked observation lists are passed here, not the
            # packed copies stored in the batch.
            td_errors = self.dqn_graph.compute_td_error(
                self.sess, obs, batch["actions"], batch["rewards"],
                new_obs, batch["dones"], batch["weights"])
            new_priorities = (
                np.abs(td_errors) + self.config["prioritized_replay_eps"])
            batch.data["weights"] = new_priorities

        return batch

    def compute_gradients(self, samples):
        """Return ``(grads, {"td_error": ...})`` for the given samples."""
        td_err, grads = self.dqn_graph.compute_gradients(
            self.sess, samples["obs"], samples["actions"], samples["rewards"],
            samples["new_obs"], samples["dones"], samples["weights"])
        return grads, {"td_error": td_err}

    def apply_gradients(self, grads):
        """Apply previously computed gradients through the graph."""
        self.dqn_graph.apply_gradients(self.sess, grads)

    def compute_apply(self, samples):
        """Compute and apply gradients in one call; return the TD error."""
        td_error = self.dqn_graph.compute_apply(
            self.sess, samples["obs"], samples["actions"], samples["rewards"],
            samples["new_obs"], samples["dones"], samples["weights"])
        return {"td_error": td_error}

    def get_weights(self):
        """Return the weights tracked by ``self.variables``."""
        return self.variables.get_weights()

    def set_weights(self, weights):
        """Load ``weights`` into the variables tracked by this evaluator."""
        self.variables.set_weights(weights)

    def _step(self, global_timestep):
        """Takes a single step, and returns the result of the step."""
        action = self.dqn_graph.act(
            self.sess, np.array(self.obs)[None],
            self.exploration.value(global_timestep))[0]
        new_obs, rew, done, _ = self.env.step(action)
        ret = (self.obs, action, rew, new_obs, float(done))
        self.obs = new_obs
        self.episode_rewards[-1] += rew
        self.episode_lengths[-1] += 1
        if done:
            # Start a new episode and open fresh accumulator slots for it.
            self.obs = self.env.reset()
            self.episode_rewards.append(0.0)
            self.episode_lengths.append(0.0)
        self.local_timestep += 1
        return ret

    def stats(self):
        """Return a dict of smoothed episode statistics and counters.

        The means cover up to ``smoothing_num_episodes`` entries and skip
        the last list element (the episode still in progress).
        """
        n = self.config["smoothing_num_episodes"] + 1
        mean_100ep_reward = round(np.mean(self.episode_rewards[-n:-1]), 5)
        mean_100ep_length = round(np.mean(self.episode_lengths[-n:-1]), 5)
        exploration = self.exploration.value(self.global_timestep)
        return {
            "mean_100ep_reward": mean_100ep_reward,
            "mean_100ep_length": mean_100ep_length,
            "num_episodes": len(self.episode_rewards),
            "exploration": exploration,
            "local_timestep": self.local_timestep,
        }

    def save(self):
        """Return the evaluator state as a list, in ``restore`` order."""
        return [
            self.exploration,
            self.episode_rewards,
            self.episode_lengths,
            self.saved_mean_reward,
            self.obs,
            self.global_timestep,
            self.local_timestep]

    def restore(self, data):
        """Restore the state produced by ``save`` (same positional order)."""
        self.exploration = data[0]
        self.episode_rewards = data[1]
        self.episode_lengths = data[2]
        self.saved_mean_reward = data[3]
        self.obs = data[4]
        self.global_timestep = data[5]
        self.local_timestep = data[6]
|
||||
@@ -2,13 +2,240 @@ from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from gym.spaces import Discrete
|
||||
import numpy as np
|
||||
|
||||
import tensorflow as tf
|
||||
import tensorflow.contrib.layers as layers
|
||||
|
||||
from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.optimizers.multi_gpu_impl import TOWER_SCOPE_NAME
|
||||
from ray.rllib.optimizers.sample_batch import SampleBatch
|
||||
from ray.rllib.utils.error import UnsupportedSpaceException
|
||||
from ray.rllib.utils.tf_policy_graph import TFPolicyGraph
|
||||
|
||||
|
||||
Q_SCOPE = "q_func"
|
||||
Q_TARGET_SCOPE = "target_q_func"
|
||||
|
||||
|
||||
def adjust_nstep(n_step, gamma, obs, actions, rewards, new_obs, dones):
    """Rewrites the given trajectory fragments to encode n-step rewards.

    reward[i] = (
        reward[i] * gamma**0 +
        reward[i+1] * gamma**1 +
        ... +
        reward[i+n_step-1] * gamma**(n_step-1))

    The ith new_obs is also adjusted to point to the (i+n_step-1)'th new obs.

    If the episode finishes inside the window, the reward sum is truncated
    there and step i inherits the done flag of the terminating step, so the
    rewritten transition is marked terminal. After this rewrite, all the
    arrays will be shortened by (n_step - 1).

    The list arguments are mutated in place and nothing is returned.
    """
    num_rewritten = len(rewards) - n_step + 1
    for i in range(num_rewritten):
        if dones[i]:
            continue  # episode end
        for j in range(1, n_step):
            new_obs[i] = new_obs[i + j]
            rewards[i] += gamma ** j * rewards[i + j]
            if dones[i + j]:
                # Without this the target would bootstrap from the
                # observation that follows a terminal step.
                dones[i] = dones[i + j]
                break  # episode end
    # truncate ends of the trajectory; clamp so a trajectory shorter than
    # n_step is emptied rather than sliced with a negative index
    new_len = max(0, len(obs) - n_step + 1)
    for arr in [obs, actions, rewards, new_obs, dones]:
        del arr[new_len:]
|
||||
|
||||
|
||||
class DQNPolicyGraph(TFPolicyGraph):
    """TensorFlow policy graph for DQN.

    Builds the action-selection network, the Q / target-Q networks used
    for the loss, and the op that copies Q variables onto the target
    network, then hands the pieces to ``TFPolicyGraph``.
    """

    def __init__(self, observation_space, action_space, registry, config):
        """Construct all graph pieces in the default TensorFlow session.

        Raises:
            UnsupportedSpaceException: if ``action_space`` is not
                ``Discrete``.
        """
        if not isinstance(action_space, Discrete):
            raise UnsupportedSpaceException(
                "Action space {} is not supported for DQN.".format(
                    action_space))

        self.config = config
        self.cur_epsilon = 1.0
        num_actions = action_space.n

        # Action inputs
        self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
        self.eps = tf.placeholder(tf.float32, (), name="eps")
        self.cur_observations = tf.placeholder(
            tf.float32, shape=(None,) + observation_space.shape)

        # Action Q network
        with tf.variable_scope(Q_SCOPE) as scope:
            q_values = _build_q_network(
                registry, self.cur_observations, num_actions, config)
            self.q_func_vars = _scope_vars(scope.name)

        # Action outputs
        self.output_actions = _build_action_network(
            q_values,
            self.cur_observations,
            num_actions,
            self.stochastic,
            self.eps)

        # Replay inputs
        self.obs_t = tf.placeholder(
            tf.float32, shape=(None,) + observation_space.shape)
        self.act_t = tf.placeholder(tf.int32, [None], name="action")
        self.rew_t = tf.placeholder(tf.float32, [None], name="reward")
        self.obs_tp1 = tf.placeholder(
            tf.float32, shape=(None,) + observation_space.shape)
        self.done_mask = tf.placeholder(tf.float32, [None], name="done")
        self.importance_weights = tf.placeholder(
            tf.float32, [None], name="weight")

        # q network evaluation
        with tf.variable_scope(Q_SCOPE, reuse=True):
            q_t = _build_q_network(
                registry, self.obs_t, num_actions, config)

        # target q network evaluation
        with tf.variable_scope(Q_TARGET_SCOPE) as scope:
            q_tp1 = _build_q_network(
                registry, self.obs_tp1, num_actions, config)
            self.target_q_func_vars = _scope_vars(scope.name)

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(
            q_t * tf.one_hot(self.act_t, num_actions), 1)

        # compute estimate of best possible value starting from state at t + 1
        if config["double_q"]:
            with tf.variable_scope(Q_SCOPE, reuse=True):
                q_tp1_using_online_net = _build_q_network(
                    registry, self.obs_tp1, num_actions, config)
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(
                q_tp1 * tf.one_hot(
                    q_tp1_best_using_online_net, num_actions), 1)
        else:
            q_tp1_best = tf.reduce_max(q_tp1, 1)
        q_tp1_best_masked = (1.0 - self.done_mask) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = (
            self.rew_t +
            config["gamma"] ** config["n_step"] * q_tp1_best_masked)

        # compute the error (potentially clipped)
        self.td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        self.loss = tf.reduce_mean(
            self.importance_weights * _huber_loss(self.td_error))

        # update_target_fn will be called periodically to copy Q network to
        # target Q network
        update_target_expr = []
        for var, var_target in zip(
                sorted(self.q_func_vars, key=lambda v: v.name),
                sorted(self.target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        self.update_target_expr = tf.group(*update_target_expr)

        # initialize TFPolicyGraph
        self.sess = tf.get_default_session()
        self.loss_inputs = [
            ("obs", self.obs_t),
            ("actions", self.act_t),
            ("rewards", self.rew_t),
            ("new_obs", self.obs_tp1),
            ("dones", self.done_mask),
            ("weights", self.importance_weights),
        ]
        self.is_training = tf.placeholder_with_default(True, ())
        TFPolicyGraph.__init__(
            self, self.sess, obs_input=self.cur_observations,
            action_sampler=self.output_actions, loss=self.loss,
            loss_inputs=self.loss_inputs, is_training=self.is_training)
        self.sess.run(tf.global_variables_initializer())

    def optimizer(self):
        """Return an Adam optimizer using the configured learning rate."""
        return tf.train.AdamOptimizer(learning_rate=self.config["lr"])

    def gradients(self, optimizer):
        """Return (gradient, variable) pairs for the Q-network variables.

        Gradients go through ``_minimize_and_clip`` when
        ``grad_norm_clipping`` is set; pairs whose gradient is ``None``
        are dropped.
        """
        if self.config["grad_norm_clipping"] is not None:
            grads_and_vars = _minimize_and_clip(
                optimizer, self.loss, var_list=self.q_func_vars,
                clip_val=self.config["grad_norm_clipping"])
        else:
            grads_and_vars = optimizer.compute_gradients(
                self.loss, var_list=self.q_func_vars)
        grads_and_vars = [
            (g, v) for (g, v) in grads_and_vars if g is not None]
        return grads_and_vars

    def extra_compute_action_feed_dict(self):
        """Feed the stochastic flag and current epsilon to action ops."""
        return {
            self.stochastic: True,
            self.eps: self.cur_epsilon,
        }

    def extra_compute_grad_fetches(self):
        """Fetch the TD error alongside the gradients."""
        return {
            "td_error": self.td_error,
        }

    def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
        """Delegate to ``_postprocess_dqn``; other batches are unused."""
        return _postprocess_dqn(self, sample_batch)

    def compute_td_error(
            self, obs_t, act_t, rew_t, obs_tp1, done_mask, importance_weights):
        """Run the TD-error tensor for the given transition columns."""
        td_err = self.sess.run(
            self.td_error,
            feed_dict={
                self.obs_t: [np.array(ob) for ob in obs_t],
                self.act_t: act_t,
                self.rew_t: rew_t,
                self.obs_tp1: [np.array(ob) for ob in obs_tp1],
                self.done_mask: done_mask,
                self.importance_weights: importance_weights
            })
        return td_err

    def update_target(self):
        """Copy the Q-network variables onto the target network."""
        return self.sess.run(self.update_target_expr)

    def set_epsilon(self, epsilon):
        """Set the epsilon fed to the action network."""
        self.cur_epsilon = epsilon

    def get_state(self):
        """Return ``[base-class state, current epsilon]``."""
        return [TFPolicyGraph.get_state(self), self.cur_epsilon]

    def set_state(self, state):
        """Restore a value produced by ``get_state``."""
        TFPolicyGraph.set_state(self, state[0])
        self.set_epsilon(state[1])
|
||||
|
||||
|
||||
def _postprocess_dqn(policy_graph, sample_batch):
    """Build the DQN training batch from a raw sample batch.

    Copies the transition columns into lists, applies the n-step rewrite
    when ``n_step > 1``, attaches unit weights and, if
    ``worker_side_prioritization`` is enabled, replaces the weights with
    ``|td_error| + prioritized_replay_eps``.

    Args:
        policy_graph: Object exposing ``config`` and ``compute_td_error``.
        sample_batch: Batch providing ``columns([...])`` for the keys
            "obs", "actions", "rewards", "new_obs" and "dones".

    Returns:
        A new ``SampleBatch`` whose row count equals
        ``config["sample_batch_size"]`` (asserted).
    """
    obs, actions, rewards, new_obs, dones = [
        list(x) for x in sample_batch.columns(
            ["obs", "actions", "rewards", "new_obs", "dones"])]

    # N-step Q adjustments
    if policy_graph.config["n_step"] > 1:
        adjust_nstep(
            policy_graph.config["n_step"], policy_graph.config["gamma"],
            obs, actions, rewards, new_obs, dones)

    batch = SampleBatch({
        "obs": obs, "actions": actions, "rewards": rewards,
        "new_obs": new_obs, "dones": dones,
        "weights": np.ones_like(rewards)})
    assert batch.count == policy_graph.config["sample_batch_size"], \
        (batch.count, policy_graph.config["sample_batch_size"])

    # Prioritize on the worker side
    if policy_graph.config["worker_side_prioritization"]:
        td_errors = policy_graph.compute_td_error(
            batch["obs"], batch["actions"], batch["rewards"],
            batch["new_obs"], batch["dones"], batch["weights"])
        new_priorities = (
            np.abs(td_errors) + policy_graph.config["prioritized_replay_eps"])
        batch.data["weights"] = new_priorities

    return batch
|
||||
|
||||
|
||||
def _build_q_network(registry, inputs, num_actions, config):
|
||||
@@ -98,205 +325,3 @@ def _scope_vars(scope, trainable_only=False):
|
||||
tf.GraphKeys.TRAINABLE_VARIABLES
|
||||
if trainable_only else tf.GraphKeys.VARIABLES,
|
||||
scope=scope if isinstance(scope, str) else scope.name)
|
||||
|
||||
|
||||
class ModelAndLoss(object):
    """Holds the model and loss function.

    Both graphs are necessary in order for the multi-gpu SGD implementation
    to create towers on each device.

    Attributes set by the constructor: ``q_t``, ``q_tp1``,
    ``target_q_func_vars``, ``td_error`` and ``loss``.
    """

    def __init__(
            self, registry, num_actions, config,
            obs_t, act_t, rew_t, obs_tp1, done_mask, importance_weights):
        # q network evaluation
        with tf.variable_scope("q_func", reuse=True):
            self.q_t = _build_q_network(registry, obs_t, num_actions, config)

        # target q network evaluation
        with tf.variable_scope("target_q_func") as scope:
            self.q_tp1 = _build_q_network(
                registry, obs_tp1, num_actions, config)
            self.target_q_func_vars = _scope_vars(scope.name)

        # q scores for actions which we know were selected in the given state.
        q_t_selected = tf.reduce_sum(
            self.q_t * tf.one_hot(act_t, num_actions), 1)

        # compute estimate of best possible value starting from state at t + 1
        if config["double_q"]:
            with tf.variable_scope("q_func", reuse=True):
                q_tp1_using_online_net = _build_q_network(
                    registry, obs_tp1, num_actions, config)
            q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
            q_tp1_best = tf.reduce_sum(
                self.q_tp1 * tf.one_hot(
                    q_tp1_best_using_online_net, num_actions), 1)
        else:
            q_tp1_best = tf.reduce_max(self.q_tp1, 1)
        # Zero out the bootstrap value where the done mask is 1.
        q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best

        # compute RHS of bellman equation
        q_t_selected_target = (
            rew_t + config["gamma"] ** config["n_step"] * q_tp1_best_masked)

        # compute the error (potentially clipped)
        self.td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
        errors = _huber_loss(self.td_error)

        weighted_error = tf.reduce_mean(importance_weights * errors)

        self.loss = weighted_error
|
||||
|
||||
|
||||
class DQNGraph(object):
    """Builds the DQN action, loss, gradient and target-update ops.

    Every method that runs an op takes the TensorFlow session as its
    first argument; the graph object itself does not hold a session.
    """

    def __init__(self, registry, env, config, logdir):
        self.env = env
        num_actions = env.action_space.n
        optimizer = tf.train.AdamOptimizer(learning_rate=config["lr"])

        # Action inputs
        self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
        self.eps = tf.placeholder(tf.float32, (), name="eps")
        self.cur_observations = tf.placeholder(
            tf.float32, shape=(None,) + env.observation_space.shape)

        # Action Q network
        q_scope_name = TOWER_SCOPE_NAME + "/q_func"
        with tf.variable_scope(q_scope_name) as scope:
            q_values = _build_q_network(
                registry, self.cur_observations, num_actions, config)
            q_func_vars = _scope_vars(scope.name)

        # Action outputs
        self.output_actions = _build_action_network(
            q_values,
            self.cur_observations,
            num_actions,
            self.stochastic,
            self.eps)

        # Replay inputs
        self.obs_t = tf.placeholder(
            tf.float32, shape=(None,) + env.observation_space.shape)
        self.act_t = tf.placeholder(tf.int32, [None], name="action")
        self.rew_t = tf.placeholder(tf.float32, [None], name="reward")
        self.obs_tp1 = tf.placeholder(
            tf.float32, shape=(None,) + env.observation_space.shape)
        self.done_mask = tf.placeholder(tf.float32, [None], name="done")
        self.importance_weights = tf.placeholder(
            tf.float32, [None], name="weight")

        def build_loss(
                obs_t, act_t, rew_t, obs_tp1, done_mask, importance_weights):
            return ModelAndLoss(
                registry,
                num_actions, config,
                obs_t, act_t, rew_t, obs_tp1, done_mask, importance_weights)

        self.loss_inputs = [
            ("obs", self.obs_t),
            ("actions", self.act_t),
            ("rewards", self.rew_t),
            ("new_obs", self.obs_tp1),
            ("dones", self.done_mask),
            ("weights", self.importance_weights),
        ]

        with tf.variable_scope(TOWER_SCOPE_NAME):
            loss_obj = build_loss(
                self.obs_t, self.act_t, self.rew_t, self.obs_tp1,
                self.done_mask, self.importance_weights)

        # Kept as an attribute so the loss can be rebuilt on other inputs.
        self.build_loss = build_loss

        weighted_error = loss_obj.loss
        target_q_func_vars = loss_obj.target_q_func_vars
        self.q_t = loss_obj.q_t
        self.q_tp1 = loss_obj.q_tp1
        self.td_error = loss_obj.td_error

        # compute optimization op (potentially with gradient clipping)
        if config["grad_norm_clipping"] is not None:
            self.grads_and_vars = _minimize_and_clip(
                optimizer, weighted_error, var_list=q_func_vars,
                clip_val=config["grad_norm_clipping"])
        else:
            self.grads_and_vars = optimizer.compute_gradients(
                weighted_error, var_list=q_func_vars)
        self.grads_and_vars = [
            (g, v) for (g, v) in self.grads_and_vars if g is not None]
        self.grads = [g for (g, v) in self.grads_and_vars]
        self.train_expr = optimizer.apply_gradients(self.grads_and_vars)

        # update_target_fn will be called periodically to copy Q network to
        # target Q network
        update_target_expr = []
        for var, var_target in zip(
                sorted(q_func_vars, key=lambda v: v.name),
                sorted(target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(var_target.assign(var))
        self.update_target_expr = tf.group(*update_target_expr)

    def update_target(self, sess):
        """Copy the Q-network variables onto the target network."""
        return sess.run(self.update_target_expr)

    def act(self, sess, obs, eps, stochastic=True):
        """Run the action op for a batch of observations."""
        return sess.run(
            self.output_actions,
            feed_dict={
                self.cur_observations: obs,
                self.stochastic: stochastic,
                self.eps: eps,
            })

    def compute_gradients(
            self, sess, obs_t, act_t, rew_t, obs_tp1, done_mask,
            importance_weights):
        """Return ``(td_error, grads)`` for the given transitions."""
        td_err, grads = sess.run(
            [self.td_error, self.grads],
            feed_dict={
                self.obs_t: obs_t,
                self.act_t: act_t,
                self.rew_t: rew_t,
                self.obs_tp1: obs_tp1,
                self.done_mask: done_mask,
                self.importance_weights: importance_weights
            })
        return td_err, grads

    def compute_td_error(
            self, sess, obs_t, act_t, rew_t, obs_tp1, done_mask,
            importance_weights):
        """Return the TD error; observations are converted with np.array."""
        td_err = sess.run(
            self.td_error,
            feed_dict={
                self.obs_t: [np.array(ob) for ob in obs_t],
                self.act_t: act_t,
                self.rew_t: rew_t,
                self.obs_tp1: [np.array(ob) for ob in obs_tp1],
                self.done_mask: done_mask,
                self.importance_weights: importance_weights
            })
        return td_err

    def apply_gradients(self, sess, grads):
        """Apply externally supplied gradient values.

        The values are fed in place of the gradient tensors, so ``grads``
        must have one entry per (gradient, variable) pair.
        """
        assert len(grads) == len(self.grads_and_vars)
        feed_dict = {ph: g for (g, ph) in zip(grads, self.grads)}
        sess.run(self.train_expr, feed_dict=feed_dict)

    def compute_apply(
            self, sess, obs_t, act_t, rew_t, obs_tp1, done_mask,
            importance_weights):
        """Run one training step and return the TD error."""
        td_err, _ = sess.run(
            [self.td_error, self.train_expr],
            feed_dict={
                self.obs_t: obs_t,
                self.act_t: act_t,
                self.rew_t: rew_t,
                self.obs_tp1: obs_tp1,
                self.done_mask: done_mask,
                self.importance_weights: importance_weights
            })
        return td_err
|
||||
@@ -35,8 +35,8 @@ class LSTM(Model):
|
||||
lstm = rnn.rnn_cell.BasicLSTMCell(size, state_is_tuple=True)
|
||||
step_size = tf.shape(self.x)[:1]
|
||||
|
||||
c_init = np.zeros((1, lstm.state_size.c), np.float32)
|
||||
h_init = np.zeros((1, lstm.state_size.h), np.float32)
|
||||
c_init = np.zeros(lstm.state_size.c, np.float32)
|
||||
h_init = np.zeros(lstm.state_size.h, np.float32)
|
||||
self.state_init = [c_init, h_init]
|
||||
c_in = tf.placeholder(tf.float32, [1, lstm.state_size.c])
|
||||
h_in = tf.placeholder(tf.float32, [1, lstm.state_size.h])
|
||||
|
||||
@@ -7,18 +7,14 @@ import numpy as np
|
||||
import torch
|
||||
|
||||
|
||||
def convert_batch(trajectory, has_features=False):
|
||||
def convert_batch(trajectory):
|
||||
"""Convert trajectory from numpy to PT variable"""
|
||||
states = torch.from_numpy(trajectory["obs"]).float()
|
||||
acs = torch.from_numpy(trajectory["actions"])
|
||||
advs = torch.from_numpy(
|
||||
trajectory["advantages"].copy()).float().reshape(-1)
|
||||
rs = torch.from_numpy(trajectory["rewards"]).float().reshape(-1)
|
||||
if has_features:
|
||||
features = [torch.from_numpy(f) for f in trajectory["features"]]
|
||||
else:
|
||||
features = trajectory["features"]
|
||||
return states, acs, advs, rs, features
|
||||
return states, acs, advs, rs
|
||||
|
||||
|
||||
def var_to_np(var):
|
||||
|
||||
@@ -43,7 +43,7 @@ class LocalSyncParallelOptimizer(object):
|
||||
processed.
|
||||
build_loss: Function that takes the specified inputs and returns an
|
||||
object with a 'loss' property that is a scalar Tensor. For example,
|
||||
ray.rllib.ppo.ProximalPolicyLoss.
|
||||
ray.rllib.ppo.ProximalPolicyGraph.
|
||||
logdir: Directory to place debugging output in.
|
||||
grad_norm_clipping: None or int stdev to clip grad norms by
|
||||
"""
|
||||
|
||||
@@ -38,18 +38,24 @@ class PolicyOptimizer(object):
|
||||
|
||||
Args:
|
||||
evaluator_cls (class): Python class of the evaluators to create.
|
||||
evaluator_args (list): List of constructor args for the evaluators.
|
||||
evaluator_args (list|dict): Constructor args for the evaluators.
|
||||
num_workers (int): Number of remote evaluators to create in
|
||||
addition to a local evaluator. This can be zero or greater.
|
||||
optimizer_config (dict): Keyword arguments to pass to the
|
||||
optimizer class constructor.
|
||||
"""
|
||||
|
||||
local_evaluator = evaluator_cls(*evaluator_args)
|
||||
remote_cls = ray.remote(**evaluator_resources)(evaluator_cls)
|
||||
remote_evaluators = [
|
||||
remote_cls.remote(*evaluator_args)
|
||||
for _ in range(num_workers)]
|
||||
if isinstance(evaluator_args, list):
|
||||
local_evaluator = evaluator_cls(*evaluator_args)
|
||||
remote_evaluators = [
|
||||
remote_cls.remote(*evaluator_args)
|
||||
for _ in range(num_workers)]
|
||||
else:
|
||||
local_evaluator = evaluator_cls(**evaluator_args)
|
||||
remote_evaluators = [
|
||||
remote_cls.remote(**evaluator_args)
|
||||
for _ in range(num_workers)]
|
||||
return cls(optimizer_config, local_evaluator, remote_evaluators)
|
||||
|
||||
def __init__(self, config, local_evaluator, remote_evaluators):
|
||||
|
||||
@@ -2,17 +2,22 @@ from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import collections
|
||||
import numpy as np
|
||||
|
||||
|
||||
def arrayify(s):
    """Convert ``s`` to a numpy array, recursing into plain lists.

    Values whose exact type is int, float, str or ``np.ndarray`` are
    returned unchanged. A plain ``list`` is converted element by element
    and the results are stacked with ``np.array``. Anything else is passed
    straight to ``np.array``.
    """
    # Exact-type checks are deliberate: subclasses (e.g. bool) fall through
    # to the np.array() branch.
    if type(s) in [int, float, str, np.ndarray]:
        return s
    elif type(s) is list:
        # recursive call to convert LazyFrames to arrays
        return np.array([arrayify(x) for x in s])
    else:
        return np.array(s)
|
||||
class SampleBatchBuilder(object):
    """Util to build a SampleBatch incrementally."""

    def __init__(self):
        # Maps each column name to the list of values appended so far.
        self.buffers = collections.defaultdict(list)

    def add_values(self, **values):
        """Append one value to each named column.

        Args:
            **values: Column name -> value to append to that column.
        """
        for k, v in values.items():
            self.buffers[k].append(v)

    def build(self):
        """Return a SampleBatch with every column converted by np.array.

        The buffers are left untouched by this call.
        """
        return SampleBatch({k: np.array(v) for k, v in self.buffers.items()})
|
||||
|
||||
|
||||
class SampleBatch(object):
|
||||
|
||||
+23
-32
@@ -2,13 +2,11 @@ from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
|
||||
import ray
|
||||
from ray.rllib.optimizers import LocalSyncOptimizer
|
||||
from ray.rllib.pg.pg_evaluator import PGEvaluator
|
||||
from ray.rllib.agent import Agent
|
||||
from ray.tune.result import TrainingResult
|
||||
from ray.rllib.optimizers import LocalSyncOptimizer
|
||||
from ray.rllib.pg.pg_policy_graph import PGPolicyGraph
|
||||
from ray.rllib.utils.common_policy_evaluator import CommonPolicyEvaluator, \
|
||||
collect_metrics
|
||||
from ray.tune.trial import Resources
|
||||
|
||||
|
||||
@@ -33,7 +31,6 @@ DEFAULT_CONFIG = {
|
||||
|
||||
|
||||
class PGAgent(Agent):
|
||||
|
||||
"""Simple policy gradient agent.
|
||||
|
||||
This is an example agent to show how to implement algorithms in RLlib.
|
||||
@@ -50,34 +47,28 @@ class PGAgent(Agent):
|
||||
|
||||
def _init(self):
|
||||
self.optimizer = LocalSyncOptimizer.make(
|
||||
evaluator_cls=PGEvaluator,
|
||||
evaluator_args=[self.registry, self.env_creator, self.config],
|
||||
evaluator_cls=CommonPolicyEvaluator,
|
||||
evaluator_args={
|
||||
"env_creator": self.env_creator,
|
||||
"policy_graph": PGPolicyGraph,
|
||||
"batch_steps": self.config["batch_size"],
|
||||
"batch_mode": "truncate_episodes",
|
||||
"registry": self.registry,
|
||||
"model_config": self.config["model"],
|
||||
"env_config": self.config["env_config"],
|
||||
"policy_config": self.config,
|
||||
},
|
||||
num_workers=self.config["num_workers"],
|
||||
optimizer_config=self.config["optimizer"])
|
||||
|
||||
def _train(self):
|
||||
self.optimizer.step()
|
||||
return collect_metrics(
|
||||
self.optimizer.local_evaluator, self.optimizer.remote_evaluators)
|
||||
|
||||
episode_rewards = []
|
||||
episode_lengths = []
|
||||
metric_lists = [a.get_completed_rollout_metrics.remote()
|
||||
for a in self.optimizer.remote_evaluators]
|
||||
for metrics in metric_lists:
|
||||
for episode in ray.get(metrics):
|
||||
episode_lengths.append(episode.episode_length)
|
||||
episode_rewards.append(episode.episode_reward)
|
||||
avg_reward = np.mean(episode_rewards)
|
||||
avg_length = np.mean(episode_lengths)
|
||||
timesteps = np.sum(episode_lengths)
|
||||
|
||||
result = TrainingResult(
|
||||
episode_reward_mean=avg_reward,
|
||||
episode_len_mean=avg_length,
|
||||
timesteps_this_iter=timesteps,
|
||||
info={})
|
||||
|
||||
return result
|
||||
|
||||
def compute_action(self, obs):
|
||||
action, info = self.optimizer.local_evaluator.policy.compute(obs)
|
||||
return action
|
||||
def compute_action(self, observation, state=None):
    """Compute a single action with the local evaluator's default policy.

    Arguments:
        observation (obj): Observation to compute an action for.
        state (list): Optional state handed to the policy's
            compute_single_action(); an empty list is used when omitted.

    Returns:
        The first element of the policy's compute_single_action() result.
    """
    if state is None:
        state = []
    # _init() only stores the optimizer on this agent, so the local
    # evaluator is reached through it, the same way _train() does.
    return self.optimizer.local_evaluator.for_policy(
        lambda p: p.compute_single_action(
            observation, state, is_training=False)[0])
|
||||
|
||||
@@ -1,56 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from ray.rllib.models.catalog import ModelCatalog
|
||||
from ray.rllib.optimizers import PolicyEvaluator
|
||||
from ray.rllib.pg.policy import PGPolicy
|
||||
from ray.rllib.utils.filter import NoFilter
|
||||
from ray.rllib.utils.process_rollout import process_rollout
|
||||
from ray.rllib.utils.sampler import SyncSampler
|
||||
|
||||
|
||||
class PGEvaluator(PolicyEvaluator):
    """Evaluator for simple policy gradient."""

    def __init__(self, registry, env_creator, config):
        """Build the wrapped env, the PGPolicy and a synchronous sampler.

        Arguments:
            registry: Passed through to ModelCatalog and PGPolicy.
            env_creator (func): Called with config["env_config"] to create
                the env.
            config (dict): Supplies "env_config", "model", "batch_size" and
                "horizon" here, and "gamma" in sample().
        """
        raw_env = env_creator(config["env_config"])
        self.env = ModelCatalog.get_preprocessor_as_wrapper(
            registry, raw_env, config["model"])
        self.config = config

        obs_space = self.env.observation_space
        act_space = self.env.action_space
        self.policy = PGPolicy(registry, obs_space, act_space, config)
        self.sampler = SyncSampler(
            self.env, self.policy, NoFilter(), config["batch_size"],
            horizon=config["horizon"])

    def sample(self):
        """Fetch a rollout from the sampler and run process_rollout on it."""
        return process_rollout(
            self.sampler.get_data(), NoFilter(),
            gamma=self.config["gamma"], use_gae=False)

    def get_completed_rollout_metrics(self):
        """Returns metrics on previously completed rollouts.

        Calling this clears the queue of completed rollout metrics.
        """
        completed = self.sampler.get_metrics()
        return completed

    def compute_gradients(self, samples):
        """Returns the gradient w.r.t. samples and an empty info dict."""
        # The info returned by the policy is dropped on purpose.
        gradient, _ = self.policy.compute_gradients(samples)
        return gradient, {}

    def apply_gradients(self, grads):
        """Applies gradients to evaluator weights."""
        self.policy.apply_gradients(grads)

    def get_weights(self):
        """Returns model weights."""
        weights = self.policy.get_weights()
        return weights

    def set_weights(self, weights):
        """Sets model weights."""
        outcome = self.policy.set_weights(weights)
        return outcome
|
||||
@@ -0,0 +1,45 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
from ray.rllib.models.catalog import ModelCatalog
|
||||
from ray.rllib.utils.process_rollout import compute_advantages
|
||||
from ray.rllib.utils.tf_policy_graph import TFPolicyGraph
|
||||
|
||||
|
||||
class PGPolicyGraph(TFPolicyGraph):
    """TF policy graph whose loss is -mean(logp(action) * advantage)."""

    def __init__(self, obs_space, action_space, registry, config):
        """Build the model, the loss and initialize the TFPolicyGraph base.

        Arguments:
            obs_space: Space whose ``shape`` sizes the observation input.
            action_space: Space used to pick the action distribution and
                the action placeholder.
            registry: Passed through to ModelCatalog.get_model().
            config (dict): Reads "model" here and "gamma" in
                postprocess_trajectory().
        """
        self.config = config

        # Policy network: observations in, one logit per action out.
        obs_shape = [None] + list(obs_space.shape)
        self.x = tf.placeholder(tf.float32, shape=obs_shape)
        dist_class, self.logit_dim = ModelCatalog.get_action_dist(action_space)
        self.model = ModelCatalog.get_model(
            registry, self.x, self.logit_dim, options=self.config["model"])
        self.dist = dist_class(self.model.outputs)

        # Policy loss.
        self.ac = ModelCatalog.get_action_placeholder(action_space)
        self.adv = tf.placeholder(tf.float32, [None], name="adv")
        weighted_logp = self.dist.logp(self.ac) * self.adv
        self.loss = -tf.reduce_mean(weighted_logp)

        # Base-class setup; uses whatever session is the current default.
        self.sess = tf.get_default_session()
        self.loss_in = [
            ("obs", self.x),
            ("actions", self.ac),
            ("advantages", self.adv),
        ]
        self.is_training = tf.placeholder_with_default(True, ())
        TFPolicyGraph.__init__(
            self,
            self.sess,
            obs_input=self.x,
            action_sampler=self.dist.sample(),
            loss=self.loss,
            loss_inputs=self.loss_in,
            is_training=self.is_training)
        self.sess.run(tf.global_variables_initializer())

    def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
        """Run compute_advantages on the batch with config["gamma"], no GAE.

        ``other_agent_batches`` is accepted but not used.
        """
        gamma = self.config["gamma"]
        return compute_advantages(sample_batch, 0.0, gamma, use_gae=False)
|
||||
@@ -1,82 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
import ray
|
||||
from ray.rllib.models.catalog import ModelCatalog
|
||||
|
||||
|
||||
class PGPolicy():
|
||||
|
||||
other_output = []
|
||||
is_recurrent = False
|
||||
|
||||
def __init__(self, registry, ob_space, ac_space, config):
|
||||
self.config = config
|
||||
self.registry = registry
|
||||
with tf.variable_scope("local"):
|
||||
self._setup_graph(ob_space, ac_space)
|
||||
print("Setting up loss")
|
||||
self._setup_loss(ac_space)
|
||||
self._setup_gradients()
|
||||
self.initialize()
|
||||
|
||||
def _setup_graph(self, ob_space, ac_space):
|
||||
self.x = tf.placeholder(tf.float32, shape=[None]+list(ob_space.shape))
|
||||
dist_class, self.logit_dim = ModelCatalog.get_action_dist(ac_space)
|
||||
self.model = ModelCatalog.get_model(
|
||||
self.registry, self.x, self.logit_dim,
|
||||
options=self.config["model"])
|
||||
self.action_logits = self.model.outputs # logit for each action
|
||||
self.dist = dist_class(self.action_logits)
|
||||
self.sample = self.dist.sample()
|
||||
self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
|
||||
tf.get_variable_scope().name)
|
||||
|
||||
def _setup_loss(self, action_space):
|
||||
self.ac = ModelCatalog.get_action_placeholder(action_space)
|
||||
self.adv = tf.placeholder(tf.float32, [None], name="adv")
|
||||
|
||||
log_prob = self.dist.logp(self.ac)
|
||||
|
||||
# policy loss
|
||||
self.loss = -tf.reduce_mean(log_prob * self.adv)
|
||||
|
||||
def _setup_gradients(self):
|
||||
self.grads = tf.gradients(self.loss, self.var_list)
|
||||
grads_and_vars = list(zip(self.grads, self.var_list))
|
||||
opt = tf.train.AdamOptimizer(self.config["lr"])
|
||||
self._apply_gradients = opt.apply_gradients(grads_and_vars)
|
||||
|
||||
def initialize(self):
|
||||
self.sess = tf.Session()
|
||||
self.variables = ray.experimental.TensorFlowVariables(
|
||||
self.loss, self.sess)
|
||||
self.sess.run(tf.global_variables_initializer())
|
||||
|
||||
def compute_gradients(self, samples):
|
||||
info = {}
|
||||
feed_dict = {
|
||||
self.x: samples["obs"],
|
||||
self.ac: samples["actions"],
|
||||
self.adv: samples["advantages"],
|
||||
}
|
||||
self.grads = [g for g in self.grads if g is not None]
|
||||
grad = self.sess.run(self.grads, feed_dict=feed_dict)
|
||||
return grad, info
|
||||
|
||||
def apply_gradients(self, grads):
|
||||
feed_dict = dict(zip(self.grads, grads))
|
||||
self.sess.run(self._apply_gradients, feed_dict=feed_dict)
|
||||
|
||||
def get_weights(self):
|
||||
return self.variables.get_weights()
|
||||
|
||||
def set_weights(self, weights):
|
||||
self.variables.set_weights(weights)
|
||||
|
||||
def compute(self, ob, *args):
|
||||
action = self.sess.run(self.sample, {self.x: [ob]})
|
||||
return action[0], {}
|
||||
@@ -7,7 +7,7 @@ import tensorflow as tf
|
||||
from ray.rllib.models import ModelCatalog
|
||||
|
||||
|
||||
class ProximalPolicyLoss(object):
|
||||
class ProximalPolicyGraph(object):
|
||||
|
||||
other_output = ["vf_preds", "logprobs"]
|
||||
is_recurrent = False
|
||||
@@ -82,11 +82,14 @@ class ProximalPolicyLoss(object):
|
||||
self.policy_results = [
|
||||
self.sampler, self.curr_logits, tf.constant("NA")]
|
||||
|
||||
def compute(self, observation):
|
||||
def compute_single_action(self, observation, features, is_training=False):
|
||||
action, logprobs, vf = self.sess.run(
|
||||
self.policy_results,
|
||||
feed_dict={self.observations: [observation]})
|
||||
return action[0], {"vf_preds": vf[0], "logprobs": logprobs[0]}
|
||||
return action[0], [], {"vf_preds": vf[0], "logprobs": logprobs[0]}
|
||||
|
||||
def get_initial_state(self):
|
||||
return []
|
||||
|
||||
def loss(self):
|
||||
return self.loss
|
||||
|
||||
@@ -172,7 +172,7 @@ class PPOAgent(Agent):
|
||||
batch_index = 0
|
||||
num_batches = (
|
||||
int(tuples_per_device) // int(model.per_device_batch_size))
|
||||
loss, policy_loss, vf_loss, kl, entropy = [], [], [], [], []
|
||||
loss, policy_graph, vf_loss, kl, entropy = [], [], [], [], []
|
||||
permutation = np.random.permutation(num_batches)
|
||||
# Prepare to drop into the debugger
|
||||
if self.iteration == config["tf_debug_iteration"]:
|
||||
@@ -181,26 +181,26 @@ class PPOAgent(Agent):
|
||||
full_trace = (
|
||||
i == 0 and self.iteration == 0 and
|
||||
batch_index == config["full_trace_nth_sgd_batch"])
|
||||
batch_loss, batch_policy_loss, batch_vf_loss, batch_kl, \
|
||||
batch_loss, batch_policy_graph, batch_vf_loss, batch_kl, \
|
||||
batch_entropy = model.run_sgd_minibatch(
|
||||
permutation[batch_index] * model.per_device_batch_size,
|
||||
self.kl_coeff, full_trace,
|
||||
self.file_writer)
|
||||
loss.append(batch_loss)
|
||||
policy_loss.append(batch_policy_loss)
|
||||
policy_graph.append(batch_policy_graph)
|
||||
vf_loss.append(batch_vf_loss)
|
||||
kl.append(batch_kl)
|
||||
entropy.append(batch_entropy)
|
||||
batch_index += 1
|
||||
loss = np.mean(loss)
|
||||
policy_loss = np.mean(policy_loss)
|
||||
policy_graph = np.mean(policy_graph)
|
||||
vf_loss = np.mean(vf_loss)
|
||||
kl = np.mean(kl)
|
||||
entropy = np.mean(entropy)
|
||||
sgd_end = time.time()
|
||||
print(
|
||||
"{:>15}{:15.5e}{:15.5e}{:15.5e}{:15.5e}{:15.5e}".format(
|
||||
i, loss, policy_loss, vf_loss, kl, entropy))
|
||||
i, loss, policy_graph, vf_loss, kl, entropy))
|
||||
|
||||
values = []
|
||||
if i == config["num_sgd_iter"] - 1:
|
||||
@@ -299,4 +299,5 @@ class PPOAgent(Agent):
|
||||
def compute_action(self, observation):
|
||||
observation = self.local_evaluator.obs_filter(
|
||||
observation, update=False)
|
||||
return self.local_evaluator.common_policy.compute(observation)[0]
|
||||
return self.local_evaluator.common_policy.compute_single_action(
|
||||
observation, [], False)[0]
|
||||
|
||||
@@ -16,8 +16,8 @@ from ray.rllib.optimizers.multi_gpu_impl import LocalSyncParallelOptimizer
|
||||
from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.utils.sampler import SyncSampler
|
||||
from ray.rllib.utils.filter import get_filter, MeanStdFilter
|
||||
from ray.rllib.utils.process_rollout import process_rollout
|
||||
from ray.rllib.ppo.loss import ProximalPolicyLoss
|
||||
from ray.rllib.utils.process_rollout import compute_advantages
|
||||
from ray.rllib.ppo.loss import ProximalPolicyGraph
|
||||
|
||||
|
||||
# TODO(rliaw): Move this onto LocalMultiGPUOptimizer
|
||||
@@ -86,7 +86,7 @@ class PPOEvaluator(PolicyEvaluator):
|
||||
self.per_device_batch_size = int(self.batch_size / len(devices))
|
||||
|
||||
def build_loss(obs, vtargets, advs, acts, plog, pvf_preds):
|
||||
return ProximalPolicyLoss(
|
||||
return ProximalPolicyGraph(
|
||||
self.env.observation_space, self.env.action_space,
|
||||
obs, vtargets, advs, acts, plog, pvf_preds, self.logit_dim,
|
||||
self.kl_coeff, self.distribution_class, self.config,
|
||||
@@ -190,8 +190,9 @@ class PPOEvaluator(PolicyEvaluator):
|
||||
|
||||
while num_steps_so_far < self.config["min_steps_per_task"]:
|
||||
rollout = self.sampler.get_data()
|
||||
samples = process_rollout(
|
||||
rollout, self.rew_filter, self.config["gamma"],
|
||||
last_r = 0.0 # note: not needed since we don't truncate rollouts
|
||||
samples = compute_advantages(
|
||||
rollout, last_r, self.config["gamma"],
|
||||
self.config["lambda"], use_gae=self.config["use_gae"])
|
||||
num_steps_so_far += samples.count
|
||||
all_samples.append(samples)
|
||||
|
||||
@@ -17,18 +17,19 @@ def get_mean_action(alg, obs):
|
||||
return np.mean(out)
|
||||
|
||||
|
||||
ray.init()
|
||||
ray.init(num_cpus=10)
|
||||
|
||||
CONFIGS = {
|
||||
"ES": {"episodes_per_batch": 10, "timesteps_per_batch": 100},
|
||||
"ES": {"episodes_per_batch": 10, "timesteps_per_batch": 100,
|
||||
"num_workers": 2},
|
||||
"DQN": {},
|
||||
"DDPG": {"noise_scale": 0.0},
|
||||
"PPO": {"num_sgd_iter": 5, "timesteps_per_batch": 1000},
|
||||
"A3C": {"use_lstm": False},
|
||||
"DDPG": {"noise_scale": 0.0, "timesteps_per_iteration": 100},
|
||||
"PPO": {"num_sgd_iter": 5, "timesteps_per_batch": 1000, "num_workers": 2},
|
||||
"A3C": {"use_lstm": False, "num_workers": 1},
|
||||
}
|
||||
|
||||
|
||||
def test(use_object_store, alg_name):
|
||||
def test(use_object_store, alg_name, failures):
|
||||
cls = get_agent_class(alg_name)
|
||||
if alg_name == "DDPG":
|
||||
alg1 = cls(config=CONFIGS[name], env="Pendulum-v0")
|
||||
@@ -55,12 +56,15 @@ def test(use_object_store, alg_name):
|
||||
a1 = get_mean_action(alg1, obs)
|
||||
a2 = get_mean_action(alg2, obs)
|
||||
print("Checking computed actions", alg1, obs, a1, a2)
|
||||
assert abs(a1 - a2) < .1, (a1, a2)
|
||||
if abs(a1 - a2) > .1:
|
||||
failures.append((alg_name, [a1, a2]))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
failures = []
|
||||
for use_object_store in [False, True]:
|
||||
for name in ["ES", "DQN", "DDPG", "PPO", "A3C"]:
|
||||
test(use_object_store, name)
|
||||
test(use_object_store, name, failures)
|
||||
|
||||
assert not failures, failures
|
||||
print("All checkpoint restore tests passed!")
|
||||
|
||||
@@ -0,0 +1,133 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import gym
|
||||
import time
|
||||
import unittest
|
||||
|
||||
import ray
|
||||
from ray.rllib.utils.common_policy_evaluator import CommonPolicyEvaluator
|
||||
from ray.rllib.utils.policy_graph import PolicyGraph
|
||||
from ray.rllib.utils.process_rollout import compute_advantages
|
||||
|
||||
|
||||
class MockPolicyGraph(PolicyGraph):
    """Stub policy graph: constant action 0, fixed postprocessing values."""

    def compute_actions(self, obs_batch, state_batches, is_training=False):
        """Return action 0 per observation, no state outputs, no extras."""
        actions = [0 for _ in range(len(obs_batch))]
        return actions, [], {}

    def postprocess_trajectory(self, batch):
        """Run compute_advantages(batch, 100.0, 0.9) without GAE."""
        processed = compute_advantages(batch, 100.0, 0.9, use_gae=False)
        return processed
|
||||
|
||||
|
||||
class TestCommonPolicyEvaluator(unittest.TestCase):
    """Checks CommonPolicyEvaluator batch modes and filter handling."""

    def _make_async_filter_evaluator(self):
        """Build an async evaluator using a ConcurrentMeanStdFilter."""
        return CommonPolicyEvaluator(
            env_creator=lambda _: gym.make("CartPole-v0"),
            policy_graph=MockPolicyGraph,
            sample_async=True,
            observation_filter="ConcurrentMeanStdFilter")

    def testBasic(self):
        """A default evaluator returns a batch with the expected columns."""
        ev = CommonPolicyEvaluator(
            env_creator=lambda _: gym.make("CartPole-v0"),
            policy_graph=MockPolicyGraph)
        batch = ev.sample()
        for key in ["obs", "actions", "rewards", "dones", "advantages"]:
            self.assertIn(key, batch)
        self.assertGreater(batch["advantages"][0], 1)

    def testPackEpisodes(self):
        """pack_episodes batches contain exactly batch_steps transitions."""
        for batch_size in [1, 10, 100, 1000]:
            ev = CommonPolicyEvaluator(
                env_creator=lambda _: gym.make("CartPole-v0"),
                policy_graph=MockPolicyGraph,
                batch_steps=batch_size,
                batch_mode="pack_episodes")
            batch = ev.sample()
            self.assertEqual(batch.count, batch_size)

    def testTruncateEpisodes(self):
        """truncate_episodes batches stop at batch_steps or earlier."""
        ev = CommonPolicyEvaluator(
            env_creator=lambda _: gym.make("CartPole-v0"),
            policy_graph=MockPolicyGraph,
            batch_steps=2,
            batch_mode="truncate_episodes")
        batch = ev.sample()
        self.assertEqual(batch.count, 2)
        ev = CommonPolicyEvaluator(
            env_creator=lambda _: gym.make("CartPole-v0"),
            policy_graph=MockPolicyGraph,
            batch_steps=1000,
            batch_mode="truncate_episodes")
        # Sample from the second evaluator; without this the assertion
        # below would only re-check the 2-step batch collected above.
        batch = ev.sample()
        self.assertLess(batch.count, 200)

    def testCompleteEpisodes(self):
        """complete_episodes batches exceed batch_steps and end on done."""
        ev = CommonPolicyEvaluator(
            env_creator=lambda _: gym.make("CartPole-v0"),
            policy_graph=MockPolicyGraph,
            batch_steps=2,
            batch_mode="complete_episodes")
        # Check two consecutive batches.
        for _ in range(2):
            batch = ev.sample()
            self.assertGreater(batch.count, 2)
            self.assertTrue(batch["dones"][-1])

    def testFilterSync(self):
        """After sampling, the observation filter has non-zero counts."""
        ev = self._make_async_filter_evaluator()
        self.sample_and_flush(ev)

    def testGetFilters(self):
        """Filter counts do not shrink between flush_after=False reads."""
        ev = self._make_async_filter_evaluator()
        self.sample_and_flush(ev)
        filters = ev.get_filters(flush_after=False)
        time.sleep(2)
        filters2 = ev.get_filters(flush_after=False)
        obs_f = filters["obs_filter"]
        obs_f2 = filters2["obs_filter"]
        self.assertGreaterEqual(obs_f2.rs.n, obs_f.rs.n)
        self.assertGreaterEqual(obs_f2.buffer.n, obs_f.buffer.n)

    def testSyncFilter(self):
        """sync_filters adopts the given filter's running count."""
        ev = self._make_async_filter_evaluator()
        self.sample_and_flush(ev)

        # Current State
        filters = ev.get_filters(flush_after=False)
        obs_f = filters["obs_filter"]

        self.assertLessEqual(obs_f.buffer.n, 20)

        new_obsf = obs_f.copy()
        new_obsf.rs._n = 100
        ev.sync_filters({"obs_filter": new_obsf})
        filters = ev.get_filters(flush_after=False)
        obs_f = filters["obs_filter"]
        self.assertGreaterEqual(obs_f.rs.n, 100)
        self.assertLessEqual(obs_f.buffer.n, 20)

    def sample_and_flush(self, ev):
        """Sample once, flush the filters and return the obs filter.

        Waits two seconds first, then asserts that both the running stats
        and the buffer of the returned filter snapshot are non-empty.
        """
        time.sleep(2)
        ev.sample()
        filters = ev.get_filters(flush_after=True)
        obs_f = filters["obs_filter"]
        self.assertNotEqual(obs_f.rs.n, 0)
        self.assertNotEqual(obs_f.buffer.n, 0)
        return obs_f
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
ray.init()
|
||||
unittest.main(verbosity=2)
|
||||
@@ -3,19 +3,11 @@ from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import gym
|
||||
import shutil
|
||||
import tempfile
|
||||
import time
|
||||
|
||||
import ray
|
||||
from ray.rllib.a3c import DEFAULT_CONFIG
|
||||
from ray.rllib.a3c.a3c_evaluator import A3CEvaluator
|
||||
from ray.rllib.dqn.dqn_evaluator import adjust_nstep
|
||||
from ray.tune.registry import get_registry
|
||||
from ray.rllib.dqn.dqn_policy_graph import adjust_nstep
|
||||
|
||||
|
||||
class DQNEvaluatorTest(unittest.TestCase):
|
||||
class DQNTest(unittest.TestCase):
|
||||
def testNStep(self):
|
||||
obs = [1, 2, 3, 4, 5, 6, 7]
|
||||
actions = ["a", "b", "a", "a", "a", "b", "a"]
|
||||
@@ -30,70 +22,5 @@ class DQNEvaluatorTest(unittest.TestCase):
|
||||
self.assertEqual(dones, [1, 0, 0, 0, 0])
|
||||
|
||||
|
||||
class A3CEvaluatorTest(unittest.TestCase):
    """Filter get/sync tests against a single A3CEvaluator."""

    def setUp(self):
        """Start ray and build an evaluator logging to a temp directory."""
        ray.init(num_cpus=1)
        config = DEFAULT_CONFIG.copy()
        config.update({
            "num_workers": 1,
            "observation_filter": "ConcurrentMeanStdFilter",
            "reward_filter": "MeanStdFilter",
            "batch_size": 2,
        })
        self._temp_dir = tempfile.mkdtemp("a3c_evaluator_test")
        registry = get_registry()
        self.e = A3CEvaluator(
            registry,
            lambda config: gym.make("CartPole-v0"),
            config,
            logdir=self._temp_dir)

    def tearDown(self):
        """Shut ray down and delete the temp directory."""
        ray.worker.cleanup()
        shutil.rmtree(self._temp_dir)

    def sample_and_flush(self):
        """Sample once, flush the filters and return (obs, rew) filters."""
        evaluator = self.e
        time.sleep(2)
        evaluator.sample()
        snapshot = evaluator.get_filters(flush_after=True)
        obs_f = snapshot["obs_filter"]
        rew_f = snapshot["rew_filter"]
        # Same assertion order as before: obs rs, obs buffer, rew rs, rew
        # buffer.
        for flt in (obs_f, rew_f):
            self.assertNotEqual(flt.rs.n, 0)
            self.assertNotEqual(flt.buffer.n, 0)
        return obs_f, rew_f

    def testGetFilters(self):
        """Show that `flush_after=False` does not affect the buffer."""
        evaluator = self.e
        self.sample_and_flush()
        first = evaluator.get_filters(flush_after=False)["obs_filter"]
        second = evaluator.get_filters(flush_after=False)["obs_filter"]
        self.assertGreaterEqual(second.rs.n, first.rs.n)
        self.assertGreaterEqual(second.buffer.n, first.buffer.n)

    def testSyncFilter(self):
        """Show that sync_filters rebases own buffer over input."""
        evaluator = self.e
        self.sample_and_flush()

        # Current state
        current = evaluator.get_filters(flush_after=False)
        obs_f = current["obs_filter"]
        rew_f = current["rew_filter"]

        self.assertLessEqual(obs_f.buffer.n, 20)

        new_obsf = obs_f.copy()
        new_obsf.rs._n = 100
        evaluator.sync_filters({"obs_filter": new_obsf, "rew_filter": rew_f})
        obs_f = evaluator.get_filters(flush_after=False)["obs_filter"]
        self.assertGreaterEqual(obs_f.rs.n, 100)
        self.assertLessEqual(obs_f.buffer.n, 20)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main(verbosity=2)
|
||||
|
||||
@@ -36,32 +36,6 @@ OBSERVATION_SPACES_TO_TEST = {
|
||||
Box(0.0, 1.0, (5,), dtype=np.float32)]),
|
||||
}
|
||||
|
||||
# (alg, action_space, obs_space)
|
||||
KNOWN_FAILURES = [
|
||||
# TODO(ekl) multiagent support for a3c
|
||||
("A3C", "implicit_tuple", "atari"),
|
||||
("A3C", "implicit_tuple", "atari_ram"),
|
||||
("A3C", "implicit_tuple", "discrete"),
|
||||
("A3C", "implicit_tuple", "image"),
|
||||
("A3C", "implicit_tuple", "mixed_tuple"),
|
||||
("A3C", "implicit_tuple", "simple_tuple"),
|
||||
("A3C", "implicit_tuple", "vector"),
|
||||
("A3C", "mixed_tuple", "atari"),
|
||||
("A3C", "mixed_tuple", "atari_ram"),
|
||||
("A3C", "mixed_tuple", "discrete"),
|
||||
("A3C", "mixed_tuple", "image"),
|
||||
("A3C", "mixed_tuple", "mixed_tuple"),
|
||||
("A3C", "mixed_tuple", "simple_tuple"),
|
||||
("A3C", "mixed_tuple", "vector"),
|
||||
("A3C", "simple_tuple", "atari"),
|
||||
("A3C", "simple_tuple", "atari_ram"),
|
||||
("A3C", "simple_tuple", "discrete"),
|
||||
("A3C", "simple_tuple", "image"),
|
||||
("A3C", "simple_tuple", "mixed_tuple"),
|
||||
("A3C", "simple_tuple", "simple_tuple"),
|
||||
("A3C", "simple_tuple", "vector"),
|
||||
]
|
||||
|
||||
|
||||
def make_stub_env(action_space, obs_space):
|
||||
class StubEnv(gym.Env):
|
||||
@@ -135,19 +109,13 @@ class ModelSupportedSpaces(unittest.TestCase):
|
||||
{"num_workers": 1, "optimizer": {}},
|
||||
stats)
|
||||
num_unexpected_errors = 0
|
||||
num_unexpected_success = 0
|
||||
for (alg, a_name, o_name), stat in sorted(stats.items()):
|
||||
if stat in ["ok", "unsupported"]:
|
||||
if (alg, a_name, o_name) in KNOWN_FAILURES:
|
||||
num_unexpected_success += 1
|
||||
else:
|
||||
if (alg, a_name, o_name) not in KNOWN_FAILURES:
|
||||
num_unexpected_errors += 1
|
||||
if stat not in ["ok", "unsupported"]:
|
||||
num_unexpected_errors += 1
|
||||
print(
|
||||
alg, "action_space", a_name, "obs_space", o_name,
|
||||
"result", stat)
|
||||
self.assertEqual(num_unexpected_errors, 0)
|
||||
self.assertEqual(num_unexpected_success, 0)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -13,7 +13,6 @@ mountaincarcontinuous-ddpg:
|
||||
tau: 0.01
|
||||
l2_reg: 0.00001
|
||||
buffer_size: 50000
|
||||
random_starts: False
|
||||
clip_rewards: False
|
||||
learning_starts: 1000
|
||||
#model:
|
||||
|
||||
@@ -6,6 +6,5 @@ pendulum-ddpg:
|
||||
episode_reward_mean: -160
|
||||
config:
|
||||
use_huber: True
|
||||
random_starts: False
|
||||
clip_rewards: False
|
||||
exploration_fraction: 0.1
|
||||
|
||||
@@ -0,0 +1,10 @@
|
||||
cartpole-a3c:
|
||||
env: CartPole-v0
|
||||
run: A3C
|
||||
stop:
|
||||
episode_reward_mean: 200
|
||||
time_total_s: 600
|
||||
config:
|
||||
num_workers: 1
|
||||
gamma: 0.95
|
||||
use_pytorch: true
|
||||
@@ -5,5 +5,5 @@ cartpole-a3c:
|
||||
episode_reward_mean: 200
|
||||
time_total_s: 600
|
||||
config:
|
||||
num_workers: 4
|
||||
num_workers: 1
|
||||
gamma: 0.95
|
||||
|
||||
@@ -7,4 +7,3 @@ cartpole-dqn:
|
||||
config:
|
||||
n_step: 3
|
||||
gamma: 0.95
|
||||
smoothing_num_episodes: 10
|
||||
|
||||
@@ -0,0 +1,8 @@
|
||||
cartpole-pg:
|
||||
env: CartPole-v0
|
||||
run: PG
|
||||
stop:
|
||||
episode_reward_mean: 200
|
||||
time_total_s: 300
|
||||
config:
|
||||
num_workers: 1
|
||||
@@ -6,7 +6,5 @@ pendulum-ddpg:
|
||||
time_total_s: 900
|
||||
config:
|
||||
use_huber: True
|
||||
random_starts: False
|
||||
clip_rewards: False
|
||||
exploration_fraction: 0.1
|
||||
smoothing_num_episodes: 10
|
||||
|
||||
@@ -0,0 +1,278 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import pickle
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
|
||||
import ray
|
||||
from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.optimizers.policy_evaluator import PolicyEvaluator
|
||||
from ray.rllib.utils.atari_wrappers import wrap_deepmind
|
||||
from ray.rllib.utils.compression import pack
|
||||
from ray.rllib.utils.filter import get_filter
|
||||
from ray.rllib.utils.sampler import AsyncSampler, SyncSampler
|
||||
from ray.rllib.utils.tf_policy_graph import TFPolicyGraph
|
||||
from ray.tune.registry import get_registry
|
||||
from ray.tune.result import TrainingResult
|
||||
|
||||
|
||||
def collect_metrics(local_evaluator, remote_evaluators):
    """Gathers episode metrics from CommonPolicyEvaluator instances.

    Arguments:
        local_evaluator: Evaluator whose sampler metrics are read directly.
        remote_evaluators (list): Remote evaluators; their sampler metrics
            are fetched through apply.remote() and ray.get().

    Returns:
        TrainingResult with the max / min / mean episode reward, the mean
        episode length, the number of episodes and the summed episode
        lengths. When no episodes were reported, the reward and length
        statistics are NaN.
    """
    pending = [
        a.apply.remote(lambda ev: ev.sampler.get_metrics())
        for a in remote_evaluators]
    metric_lists = ray.get(pending)
    metric_lists.append(local_evaluator.sampler.get_metrics())

    episode_rewards = []
    episode_lengths = []
    for metrics in metric_lists:
        for episode in metrics:
            episode_lengths.append(episode.episode_length)
            episode_rewards.append(episode.episode_reward)

    if episode_rewards:
        min_reward = min(episode_rewards)
        max_reward = max(episode_rewards)
        avg_reward = np.mean(episode_rewards)
        avg_length = np.mean(episode_lengths)
    else:
        # np.mean() of an empty list also yields NaN but emits a
        # RuntimeWarning; report NaN directly instead.
        min_reward = max_reward = float('nan')
        avg_reward = avg_length = float('nan')
    timesteps = np.sum(episode_lengths)

    return TrainingResult(
        episode_reward_max=max_reward,
        episode_reward_min=min_reward,
        episode_reward_mean=avg_reward,
        episode_len_mean=avg_length,
        episodes_total=len(episode_lengths),
        timesteps_this_iter=timesteps)
|
||||
|
||||
|
||||
class CommonPolicyEvaluator(PolicyEvaluator):
|
||||
"""Policy evaluator implementation that operates on a rllib.PolicyGraph.
|
||||
|
||||
TODO: vector env
|
||||
TODO: multi-agent
|
||||
TODO: consumer buffering for multi-agent
|
||||
TODO: complete episode batch mode
|
||||
|
||||
Examples:
|
||||
# Create a policy evaluator and using it to collect experiences.
|
||||
>>> evaluator = CommonPolicyEvaluator(
|
||||
env_creator=lambda _: gym.make("CartPole-v0"),
|
||||
policy_graph=PGPolicyGraph)
|
||||
>>> print(evaluator.sample().keys())
|
||||
{"obs": [[...]], "actions": [[...]], "rewards": [[...]],
|
||||
"dones": [[...]], "new_obs": [[...]]}
|
||||
|
||||
# Creating policy evaluators using optimizer_cls.make().
|
||||
>>> optimizer = LocalSyncOptimizer.make(
|
||||
evaluator_cls=CommonPolicyEvaluator,
|
||||
evaluator_args={
|
||||
"env_creator": lambda _: gym.make("CartPole-v0"),
|
||||
"policy_graph": PGPolicyGraph,
|
||||
},
|
||||
num_workers=10)
|
||||
>>> for _ in range(10): optimizer.step()
|
||||
"""
|
||||
|
||||
@classmethod
def as_remote(cls, num_cpus=None, num_gpus=None):
    """Return this class wrapped by ray.remote() with the given resources."""
    make_remote = ray.remote(num_cpus=num_cpus, num_gpus=num_gpus)
    return make_remote(cls)
|
||||
|
||||
def __init__(
        self,
        env_creator,
        policy_graph,
        tf_session_creator=None,
        batch_steps=100,
        batch_mode="truncate_episodes",
        preprocessor_pref="rllib",
        sample_async=False,
        compress_observations=False,
        observation_filter="NoFilter",
        registry=None,
        env_config=None,
        model_config=None,
        policy_config=None):
    """Initialize a policy evaluator.

    Arguments:
        env_creator (func): Function that returns a gym.Env given an
            env config dict.
        policy_graph (class): A class implementing rllib.PolicyGraph or
            rllib.TFPolicyGraph.
        tf_session_creator (func): A function that returns a TF session.
            This is optional and only useful with TFPolicyGraph.
        batch_steps (int): The target number of env transitions to include
            in each sample batch returned from this evaluator.
        batch_mode (str): One of the following choices:
            complete_episodes: each batch will be at least batch_steps
                in size, and will include one or more complete episodes.
            truncate_episodes: each batch will be around batch_steps
                in size, and include transitions from one episode only.
            pack_episodes: each batch will be exactly batch_steps in
                size, and may include transitions from multiple episodes.
        preprocessor_pref (str): Whether to prefer RLlib preprocessors
            ("rllib") or deepmind ("deepmind") when applicable.
        sample_async (bool): Whether to compute samples asynchronously in
            the background, which improves throughput but can cause samples
            to be slightly off-policy.
        compress_observations (bool): If true, compress the observations
            returned.
        observation_filter (str): Name of observation filter to use.
        registry (tune.Registry): User-registered objects. Pass in the
            value from tune.registry.get_registry() if you're having
            trouble resolving things like custom envs.
        env_config (dict): Config to pass to the env creator.
        model_config (dict): Config to use when creating the policy model.
        policy_config (dict): Config to pass to the policy.

    Raises:
        ValueError: If batch_mode is not one of the choices above.
        NotImplementedError: If the wrapped env has a ``vector_reset``
            attribute (vector envs are not supported yet).
    """

    registry = registry or get_registry()
    env_config = env_config or {}
    policy_config = policy_config or {}
    model_config = model_config or {}

    # Validated once, up front, with an explicit raise so the check also
    # runs when assertions are disabled.
    if batch_mode not in (
            "complete_episodes", "truncate_episodes", "pack_episodes"):
        raise ValueError(
            "Unsupported batch_mode: {!r}".format(batch_mode))

    self.env_creator = env_creator
    self.policy_graph = policy_graph
    self.batch_steps = batch_steps
    self.batch_mode = batch_mode
    self.compress_observations = compress_observations

    self.env = env_creator(env_config)
    is_atari = hasattr(self.env.unwrapped, "ale")
    if is_atari and "custom_preprocessor" not in model_config and \
            preprocessor_pref == "deepmind":
        self.env = wrap_deepmind(self.env, dim=model_config.get("dim", 80))
    else:
        self.env = ModelCatalog.get_preprocessor_as_wrapper(
            registry, self.env, model_config)

    self.vectorized = hasattr(self.env, "vector_reset")

    def build_policy():
        # Same constructor call for TF and non-TF policy graphs.
        return policy_graph(
            self.env.observation_space, self.env.action_space,
            registry, policy_config)

    if issubclass(policy_graph, TFPolicyGraph):
        # TF policies are built in a fresh graph with the evaluator's
        # session installed as the default session.
        with tf.Graph().as_default():
            if tf_session_creator:
                self.sess = tf_session_creator()
            else:
                self.sess = tf.Session(config=tf.ConfigProto(
                    gpu_options=tf.GPUOptions(allow_growth=True)))
            with self.sess.as_default():
                policy = build_policy()
    else:
        policy = build_policy()
    self.policy_map = {
        "default": policy
    }

    self.obs_filter = get_filter(
        observation_filter, self.env.observation_space.shape)
    self.filters = {"obs_filter": self.obs_filter}

    if self.vectorized:
        raise NotImplementedError("Vector envs not yet supported")

    # Named so it does not shadow the imported pack() used by sample().
    pack_episodes = batch_mode == "pack_episodes"
    if batch_mode == "complete_episodes":
        # Only the sampler's step budget is overridden; self.batch_steps
        # keeps the caller's value. Presumably the large budget lets the
        # episode end, not the step count, finish a batch -- confirm
        # against the sampler.
        batch_steps = 999999
    if sample_async:
        self.sampler = AsyncSampler(
            self.env, self.policy_map["default"], self.obs_filter,
            batch_steps, pack=pack_episodes)
        self.sampler.start()
    else:
        self.sampler = SyncSampler(
            self.env, self.policy_map["default"], self.obs_filter,
            batch_steps, pack=pack_episodes)
|
||||
|
||||
def sample(self):
    """Collect one batch of experiences from the sampler and postprocess it.

    The raw batch returned by ``self.sampler.get_data()`` is passed through
    the "default" policy's ``postprocess_trajectory``. If
    ``self.compress_observations`` is truthy, each entry of the "obs" and
    "new_obs" columns is replaced by ``pack(entry)``.

    Return:
        SampleBatch from evaluating the current policies.
    """
    default_policy = self.policy_map["default"]
    batch = default_policy.postprocess_trajectory(self.sampler.get_data())

    if not self.compress_observations:
        return batch

    # Compress the observation columns entry by entry, "obs" first.
    for column in ("obs", "new_obs"):
        batch[column] = [pack(o) for o in batch[column]]
    return batch
|
||||
|
||||
def apply(self, func):
    """Call ``func`` with this evaluator instance and return its result.

    Args:
        func (callable): Function taking the evaluator as its only argument.

    Returns:
        Whatever ``func`` returns.
    """
    outcome = func(self)
    return outcome
|
||||
|
||||
def for_policy(self, func):
    """Call ``func`` with this evaluator's default policy.

    Args:
        func (callable): Function taking the policy stored under the
            "default" key of ``self.policy_map``.

    Returns:
        Whatever ``func`` returns.
    """
    default_policy = self.policy_map["default"]
    return func(default_policy)
|
||||
|
||||
def sync_filters(self, new_filters):
    """Sync every local filter with its counterpart in ``new_filters``.

    Each filter in ``self.filters`` has its ``sync`` method called with the
    entry of the same name from ``new_filters``. Extra keys in
    ``new_filters`` are ignored.

    Args:
        new_filters (dict): Filters with new state to update local copy.
            Must contain every key present in ``self.filters``.
    """
    missing = [name for name in self.filters if name not in new_filters]
    assert not missing
    for name, local_filter in self.filters.items():
        local_filter.sync(new_filters[name])
|
||||
|
||||
def get_filters(self, flush_after=False):
    """Return a serializable snapshot of every filter.

    Args:
        flush_after (bool): If true, ``clear_buffer()`` is called on each
            filter right after its snapshot has been taken.

    Returns:
        snapshots (dict): Maps each filter name to the value returned by
            that filter's ``as_serializable()``.
    """
    snapshots = {}
    for name in self.filters:
        current = self.filters[name]
        # Snapshot first, then (optionally) clear, one filter at a time.
        snapshots[name] = current.as_serializable()
        if flush_after:
            current.clear_buffer()
    return snapshots
|
||||
|
||||
def get_weights(self):
    """Return the weights reported by the default policy."""
    default_policy = self.policy_map["default"]
    return default_policy.get_weights()
|
||||
|
||||
def set_weights(self, weights):
    """Forward ``weights`` to the default policy's ``set_weights``.

    Returns:
        Whatever the policy's ``set_weights`` returns.
    """
    default_policy = self.policy_map["default"]
    return default_policy.set_weights(weights)
|
||||
|
||||
def compute_gradients(self, samples):
    """Delegate gradient computation for ``samples`` to the default policy.

    Returns:
        Whatever the policy's ``compute_gradients`` returns.
    """
    default_policy = self.policy_map["default"]
    return default_policy.compute_gradients(samples)
|
||||
|
||||
def apply_gradients(self, grads):
    """Delegate application of ``grads`` to the default policy.

    Returns:
        Whatever the policy's ``apply_gradients`` returns.
    """
    default_policy = self.policy_map["default"]
    return default_policy.apply_gradients(grads)
|
||||
|
||||
def compute_apply(self, samples):
    """Run the default policy's fused compute-and-apply step on ``samples``.

    The policy's ``compute_apply`` must return a two-element result; only
    the first element (the gradient-side fetches) is returned and the
    second (the apply-side fetches) is discarded.
    """
    default_policy = self.policy_map["default"]
    grad_fetch, _ = default_policy.compute_apply(samples)
    return grad_fetch
|
||||
|
||||
def save(self):
    """Serialize the filters and the default policy's state with pickle.

    Filters are snapshotted via ``get_filters(flush_after=True)`` before
    the policy state is read.

    Returns:
        bytes: Pickled dict with the keys "filters" and "state".
    """
    checkpoint = {"filters": self.get_filters(flush_after=True)}
    checkpoint["state"] = self.policy_map["default"].get_state()
    return pickle.dumps(checkpoint)
|
||||
|
||||
def restore(self, objs):
    """Restore filters and default-policy state from a pickled checkpoint.

    Args:
        objs (bytes): Pickled dict with the keys "filters" and "state".
    """
    # NOTE(review): pickle.loads executes arbitrary code on malicious
    # input; only pass checkpoints from a trusted source.
    checkpoint = pickle.loads(objs)
    self.sync_filters(checkpoint["filters"])
    default_policy = self.policy_map["default"]
    default_policy.set_state(checkpoint["state"])
|
||||
@@ -0,0 +1,132 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
|
||||
class PolicyGraph(object):
    """An agent policy and loss, i.e., a TFPolicyGraph or other subclass.

    This object defines how to act in the environment, and also losses used to
    improve the policy based on its experiences. Note that both policy and
    loss are defined together for convenience, though the policy itself is
    logically separate.

    All policies can directly extend PolicyGraph, however TensorFlow users may
    find TFPolicyGraph simpler to implement. TFPolicyGraph also enables RLlib
    to apply TensorFlow-specific optimizations such as fusing multiple policy
    graphs and multi-GPU support.
    """

    # NOTE(review): the policy evaluator introduced alongside this class
    # constructs policy graphs positionally as
    # ``policy_graph(observation_space, action_space, registry, config)``,
    # which does not match the parameter order below -- confirm which order
    # subclasses are expected to follow.
    def __init__(self, registry, observation_space, action_space, config):
        """Initialize the graph.

        The base implementation does nothing with its arguments.

        Args:
            registry (obj): Object registry for user-defined envs, models, etc.
            observation_space (gym.Space): Observation space of the env.
            action_space (gym.Space): Action space of the env.
            config (dict): Policy-specific configuration data.
        """

    def compute_actions(self, obs_batch, state_batches, is_training=False):
        """Compute actions for the current policy.

        Arguments:
            obs_batch (np.ndarray): batch of observations
            state_batches (list): list of RNN state input batches, if any
            is_training (bool): whether we are training the policy

        Returns:
            actions (np.ndarray): batch of output actions, with shape like
                [BATCH_SIZE, ACTION_SHAPE].
            state_outs (list): list of RNN state output batches, if any. Each
                batch is indexed by batch element first (shape like
                [BATCH_SIZE, STATE_SIZE]); compute_single_action() relies on
                this when it takes element 0 of every state output.
            info (dict): dictionary of extra feature batches, if any, with
                shape like {"f1": [BATCH_SIZE, ...], "f2": [BATCH_SIZE, ...]}.

        Raises:
            NotImplementedError: always; subclasses must override.
        """
        raise NotImplementedError

    def compute_single_action(self, obs, state, is_training=False):
        """Unbatched version of compute_actions.

        Arguments:
            obs (obj): single observation
            state (list): list of RNN state inputs, if any
            is_training (bool): whether we are training the policy

        Returns:
            actions (obj): single action
            state_outs (list): list of RNN state outputs, if any
            info (dict): dictionary of extra features, if any
        """
        # Wrap the observation and each state into a batch of size one, then
        # strip that batch dimension off every output again.
        [action], state_out, info = self.compute_actions(
            [obs], [[s] for s in state], is_training)
        return action, [s[0] for s in state_out], \
            {k: v[0] for k, v in info.items()}

    def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
        """Implements algorithm-specific trajectory postprocessing.

        The base implementation returns ``sample_batch`` unchanged.

        Arguments:
            sample_batch (SampleBatch): batch of experiences for the policy
            other_agent_batches (dict): In a multi-agent env, this contains the
                experience batches seen by other agents.

        Returns:
            SampleBatch: postprocessed sample batch.
        """
        return sample_batch

    def compute_gradients(self, postprocessed_batch):
        """Computes gradients against a batch of experiences.

        Arguments:
            postprocessed_batch (SampleBatch): batch of experiences, as
                returned by postprocess_trajectory().

        Returns:
            grads (list): List of gradient output values
            info (dict): Extra policy-specific values

        Raises:
            NotImplementedError: always; subclasses must override.
        """
        raise NotImplementedError

    def apply_gradients(self, gradients):
        """Applies previously computed gradients.

        Arguments:
            gradients (list): gradient values, as returned by
                compute_gradients().

        Returns:
            info (dict): Extra policy-specific values

        Raises:
            NotImplementedError: always; subclasses must override.
        """
        raise NotImplementedError

    def get_weights(self):
        """Returns model weights.

        Returns:
            weights (obj): Serializable copy or view of model weights

        Raises:
            NotImplementedError: always; subclasses must override.
        """
        raise NotImplementedError

    def set_weights(self, weights):
        """Sets model weights.

        Arguments:
            weights (obj): Serializable copy or view of model weights

        Raises:
            NotImplementedError: always; subclasses must override.
        """
        raise NotImplementedError

    def get_initial_state(self):
        """Returns initial RNN state for the current policy.

        The base implementation returns an empty list.
        """
        return []

    def get_state(self):
        """Saves all local state.

        The base implementation returns the result of get_weights().

        Returns:
            state (obj): Serialized local state.
        """
        return self.get_weights()

    def set_state(self, state):
        """Restores all local state.

        The base implementation passes ``state`` to set_weights().

        Arguments:
            state (obj): Serialized local state.
        """
        self.set_weights(state)
|
||||
@@ -11,12 +11,12 @@ def discount(x, gamma):
|
||||
return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
|
||||
|
||||
|
||||
def process_rollout(rollout, reward_filter, gamma, lambda_=1.0, use_gae=True):
|
||||
def compute_advantages(rollout, last_r, gamma, lambda_=1.0, use_gae=True):
|
||||
"""Given a rollout, compute its value targets and the advantage.
|
||||
|
||||
Args:
|
||||
rollout (PartialRollout): Partial Rollout Object
|
||||
reward_filter (Filter): Filter for processing advantanges
|
||||
last_r (float): Value estimation for last observation
|
||||
gamma (float): Parameter for GAE
|
||||
lambda_ (float): Parameter for GAE
|
||||
use_gae (bool): Using Generalized Advantage Estimation
|
||||
@@ -32,21 +32,17 @@ def process_rollout(rollout, reward_filter, gamma, lambda_=1.0, use_gae=True):
|
||||
|
||||
if use_gae:
|
||||
assert "vf_preds" in rollout, "Values not found!"
|
||||
vpred_t = np.stack(rollout["vf_preds"] +
|
||||
[np.array(rollout.last_r)]).squeeze()
|
||||
vpred_t = np.concatenate([rollout["vf_preds"], np.array([last_r])])
|
||||
delta_t = traj["rewards"] + gamma * vpred_t[1:] - vpred_t[:-1]
|
||||
# This formula for the advantage comes from
|
||||
# "Generalized Advantage Estimation": https://arxiv.org/abs/1506.02438
|
||||
traj["advantages"] = discount(delta_t, gamma * lambda_)
|
||||
traj["value_targets"] = traj["advantages"] + traj["vf_preds"]
|
||||
else:
|
||||
rewards_plus_v = np.stack(rollout["rewards"] +
|
||||
[np.array(rollout.last_r)]).squeeze()
|
||||
rewards_plus_v = np.concatenate(
|
||||
[rollout["rewards"], np.array([last_r])])
|
||||
traj["advantages"] = discount(rewards_plus_v, gamma)[:-1]
|
||||
|
||||
for i in range(traj["advantages"].shape[0]):
|
||||
traj["advantages"][i] = reward_filter(traj["advantages"][i])
|
||||
|
||||
traj["advantages"] = traj["advantages"].copy()
|
||||
|
||||
assert all(val.shape[0] == trajsize for val in traj.values()), \
|
||||
|
||||
@@ -2,80 +2,12 @@ from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import six.moves.queue as queue
|
||||
import threading
|
||||
from collections import namedtuple
|
||||
import numpy as np
|
||||
import six.moves.queue as queue
|
||||
import threading
|
||||
|
||||
|
||||
class PartialRollout(object):
|
||||
"""A piece of a complete rollout.
|
||||
|
||||
We run our agent, and process its experience once it has processed enough
|
||||
steps.
|
||||
|
||||
Attributes:
|
||||
data (dict): Stores rollout data. All numpy arrays other than
|
||||
`observations` and `features` will be squeezed.
|
||||
last_r (float): Value of next state. Used for bootstrapping.
|
||||
"""
|
||||
|
||||
fields = ["obs", "actions", "rewards", "new_obs", "dones", "features"]
|
||||
|
||||
def __init__(self, extra_fields=None):
|
||||
"""Initializers internals. Maintains a `last_r` field
|
||||
in support of partial rollouts, used in bootstrapping advantage
|
||||
estimation.
|
||||
|
||||
Args:
|
||||
extra_fields: Optional field for object to keep track.
|
||||
"""
|
||||
if extra_fields:
|
||||
self.fields.extend(extra_fields)
|
||||
self.data = {k: [] for k in self.fields}
|
||||
self.last_r = 0.0
|
||||
|
||||
def add(self, **kwargs):
|
||||
for k, v in kwargs.items():
|
||||
self.data[k] += [v]
|
||||
|
||||
def extend(self, other_rollout):
|
||||
"""Extends internal data structure. Assumes other_rollout contains
|
||||
data that occured afterwards."""
|
||||
|
||||
assert not self.is_terminal()
|
||||
assert all(k in other_rollout.fields for k in self.fields)
|
||||
for k, v in other_rollout.data.items():
|
||||
self.data[k].extend(v)
|
||||
self.last_r = other_rollout.last_r
|
||||
|
||||
def is_terminal(self):
|
||||
"""Check if terminal.
|
||||
|
||||
Returns:
|
||||
terminal (bool): if rollout has terminated."""
|
||||
return self.data["dones"][-1]
|
||||
|
||||
def __getitem__(self, key):
|
||||
return self.data[key]
|
||||
|
||||
def __setitem__(self, key, item):
|
||||
self.data[key] = item
|
||||
|
||||
def keys(self):
|
||||
return self.data.keys()
|
||||
|
||||
def items(self):
|
||||
return self.data.items()
|
||||
|
||||
def __iter__(self):
|
||||
return self.data.__iter__()
|
||||
|
||||
def __next__(self):
|
||||
return self.data.__next__()
|
||||
|
||||
def __contains__(self, x):
|
||||
return x in self.data
|
||||
from ray.rllib.optimizers.sample_batch import SampleBatchBuilder
|
||||
|
||||
|
||||
CompletedRollout = namedtuple("CompletedRollout",
|
||||
@@ -92,7 +24,9 @@ class SyncSampler(object):
|
||||
thread."""
|
||||
_async = False
|
||||
|
||||
def __init__(self, env, policy, obs_filter, num_local_steps, horizon=None):
|
||||
def __init__(
|
||||
self, env, policy, obs_filter, num_local_steps, horizon=None,
|
||||
pack=False):
|
||||
self.num_local_steps = num_local_steps
|
||||
self.horizon = horizon
|
||||
self.env = env
|
||||
@@ -100,7 +34,7 @@ class SyncSampler(object):
|
||||
self._obs_filter = obs_filter
|
||||
self.rollout_provider = _env_runner(self.env, self.policy,
|
||||
self.num_local_steps, self.horizon,
|
||||
self._obs_filter)
|
||||
self._obs_filter, pack)
|
||||
self.metrics_queue = queue.Queue()
|
||||
|
||||
def get_data(self):
|
||||
@@ -128,7 +62,9 @@ class AsyncSampler(threading.Thread):
|
||||
accumulate and the gradient can be calculated on up to 5 batches."""
|
||||
_async = True
|
||||
|
||||
def __init__(self, env, policy, obs_filter, num_local_steps, horizon=None):
|
||||
def __init__(
|
||||
self, env, policy, obs_filter, num_local_steps, horizon=None,
|
||||
pack=False):
|
||||
assert getattr(
|
||||
obs_filter, "is_concurrent",
|
||||
False), ("Observation Filter must support concurrent updates.")
|
||||
@@ -142,6 +78,7 @@ class AsyncSampler(threading.Thread):
|
||||
self._obs_filter = obs_filter
|
||||
self.started = False
|
||||
self.daemon = True
|
||||
self.pack = pack
|
||||
|
||||
def run(self):
|
||||
self.started = True
|
||||
@@ -154,7 +91,7 @@ class AsyncSampler(threading.Thread):
|
||||
def _run(self):
|
||||
rollout_provider = _env_runner(self.env, self.policy,
|
||||
self.num_local_steps, self.horizon,
|
||||
self._obs_filter)
|
||||
self._obs_filter, self.pack)
|
||||
while True:
|
||||
# The timeout variable exists because apparently, if one worker
|
||||
# dies, the other workers won't die with it, unless the timeout is
|
||||
@@ -169,18 +106,18 @@ class AsyncSampler(threading.Thread):
|
||||
"""Gets currently accumulated data.
|
||||
|
||||
Returns:
|
||||
rollout (PartialRollout): trajectory data (unprocessed)
|
||||
rollout (SampleBatch): trajectory data (unprocessed)
|
||||
"""
|
||||
assert self.started, "Sampler never started running!"
|
||||
rollout = self.queue.get(timeout=600.0)
|
||||
if isinstance(rollout, BaseException):
|
||||
raise rollout
|
||||
while not rollout.is_terminal():
|
||||
while not rollout["dones"][-1]:
|
||||
try:
|
||||
part = self.queue.get_nowait()
|
||||
if isinstance(part, BaseException):
|
||||
raise rollout
|
||||
rollout.extend(part)
|
||||
rollout = rollout.concat(part)
|
||||
except queue.Empty:
|
||||
break
|
||||
return rollout
|
||||
@@ -195,7 +132,7 @@ class AsyncSampler(threading.Thread):
|
||||
return completed
|
||||
|
||||
|
||||
def _env_runner(env, policy, num_local_steps, horizon, obs_filter):
|
||||
def _env_runner(env, policy, num_local_steps, horizon, obs_filter, pack):
|
||||
"""This implements the logic of the thread runner.
|
||||
|
||||
It continually runs the policy, and as long as the rollout exceeds a
|
||||
@@ -206,12 +143,16 @@ def _env_runner(env, policy, num_local_steps, horizon, obs_filter):
|
||||
Args:
|
||||
env: Environment generated by env_creator
|
||||
policy: Policy used to interact with environment. Also sets fields
|
||||
to be included in `PartialRollout`
|
||||
num_local_steps: Number of steps before `PartialRollout` is yielded.
|
||||
to be included in `SampleBatch`
|
||||
num_local_steps: Number of steps before `SampleBatch` is yielded. Set
|
||||
to infinity to yield complete episodes.
|
||||
horizon: Horizon of the episode.
|
||||
obs_filter: Filter used to process observations.
|
||||
pack: Whether to pack multiple episodes into each batch. This
|
||||
guarantees batches will be exactly `num_local_steps` in size.
|
||||
|
||||
Yields:
|
||||
rollout (PartialRollout): Object containing state, action, reward,
|
||||
rollout (SampleBatch): Object containing state, action, reward,
|
||||
terminal condition, and other fields as dictated by `policy`.
|
||||
"""
|
||||
last_observation = obs_filter(env.reset())
|
||||
@@ -221,24 +162,23 @@ def _env_runner(env, policy, num_local_steps, horizon, obs_filter):
|
||||
print("Warning, no horizon specified, assuming infinite")
|
||||
if not horizon:
|
||||
horizon = 999999
|
||||
if hasattr(policy, "get_initial_features"):
|
||||
last_features = policy.get_initial_features()
|
||||
else:
|
||||
last_features = []
|
||||
last_features = policy.get_initial_state()
|
||||
features = last_features
|
||||
length = 0
|
||||
rewards = 0
|
||||
rollout_number = 0
|
||||
|
||||
while True:
|
||||
terminal_end = False
|
||||
rollout = PartialRollout(extra_fields=policy.other_output)
|
||||
batch_builder = SampleBatchBuilder()
|
||||
|
||||
for _ in range(num_local_steps):
|
||||
action, pi_info = policy.compute(last_observation, *last_features)
|
||||
if policy.is_recurrent:
|
||||
features = pi_info["features"]
|
||||
del pi_info["features"]
|
||||
# Assume batch size one for now
|
||||
action, features, pi_info = policy.compute_single_action(
|
||||
last_observation, last_features, is_training=True)
|
||||
for i, state_value in enumerate(last_features):
|
||||
pi_info["state_in_{}".format(i)] = state_value
|
||||
for i, state_value in enumerate(features):
|
||||
pi_info["state_out_{}".format(i)] = state_value
|
||||
observation, reward, terminal, info = env.step(action)
|
||||
observation = obs_filter(observation)
|
||||
|
||||
@@ -252,12 +192,11 @@ def _env_runner(env, policy, num_local_steps, horizon, obs_filter):
|
||||
action = np.concatenate(action, axis=0).flatten()
|
||||
|
||||
# Collect the experience.
|
||||
rollout.add(
|
||||
batch_builder.add_values(
|
||||
obs=last_observation,
|
||||
actions=action,
|
||||
rewards=reward,
|
||||
dones=terminal,
|
||||
features=last_features,
|
||||
new_obs=observation,
|
||||
**pi_info)
|
||||
|
||||
@@ -265,24 +204,18 @@ def _env_runner(env, policy, num_local_steps, horizon, obs_filter):
|
||||
last_features = features
|
||||
|
||||
if terminal:
|
||||
terminal_end = True
|
||||
yield CompletedRollout(length, rewards)
|
||||
|
||||
if (length >= horizon
|
||||
or not env.metadata.get("semantics.autoreset")):
|
||||
if (length >= horizon or
|
||||
not env.metadata.get("semantics.autoreset")):
|
||||
last_observation = obs_filter(env.reset())
|
||||
if hasattr(policy, "get_initial_features"):
|
||||
last_features = policy.get_initial_features()
|
||||
else:
|
||||
last_features = []
|
||||
last_features = policy.get_initial_state()
|
||||
rollout_number += 1
|
||||
length = 0
|
||||
rewards = 0
|
||||
break
|
||||
|
||||
if not terminal_end:
|
||||
rollout.last_r = policy.value(last_observation, *last_features)
|
||||
if not pack:
|
||||
break
|
||||
|
||||
# Once we have enough experience, yield it, and have the ThreadRunner
|
||||
# place it on a queue.
|
||||
yield rollout
|
||||
yield batch_builder.build()
|
||||
|
||||
@@ -0,0 +1,152 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
import ray
|
||||
from ray.rllib.utils.policy_graph import PolicyGraph
|
||||
|
||||
|
||||
class TFPolicyGraph(PolicyGraph):
    """An agent policy and loss implemented in TensorFlow.

    Extending this class enables RLlib to perform TensorFlow specific
    optimizations on the policy graph, e.g., parallelization across gpus or
    fusing multiple graphs together in the multi-agent setting.

    All input and output tensors are of shape [BATCH_DIM, ...].

    Examples:
        >>> policy = TFPolicyGraphSubclass(
            sess, obs_input, action_sampler, loss, loss_inputs, is_training)

        >>> print(policy.compute_actions([1, 0, 2]))
        (array([0, 1, 1]), [], {})

        >>> print(policy.postprocess_trajectory(SampleBatch({...})))
        SampleBatch({"action": ..., "advantages": ..., ...})
    """

    def __init__(
            self, sess, obs_input, action_sampler, loss, loss_inputs,
            is_training, state_inputs=None, state_outputs=None):
        """Initialize the policy.

        Builds the optimizer, the gradient tensors and the apply op from the
        overridable optimizer() and gradients() hooks.

        Arguments:
            sess (Session): TensorFlow session used to run all ops.
            obs_input (Tensor): input placeholder for observations.
            action_sampler (Tensor): Tensor for sampling an action.
            loss (Tensor): scalar policy loss output tensor.
            loss_inputs (list): a (name, placeholder) tuple for each loss
                input argument. Each placeholder name must correspond to a
                SampleBatch column key returned by postprocess_trajectory().
            is_training (Tensor): input placeholder for whether we are
                currently training the policy.
            state_inputs (list): list of RNN state input placeholders; they
                are fed from `state_batches` in compute_actions().
            state_outputs (list): list of RNN state output Tensors; they are
                fetched alongside the sampled action in compute_actions().
        """

        self._sess = sess
        self._obs_input = obs_input
        self._sampler = action_sampler
        self._loss = loss
        self._loss_inputs = loss_inputs
        self._is_training = is_training
        self._state_inputs = state_inputs or []
        self._state_outputs = state_outputs or []
        self._optimizer = self.optimizer()
        self._grads_and_vars = self.gradients(self._optimizer)
        self._grads = [g for (g, v) in self._grads_and_vars]
        self._apply_op = self._optimizer.apply_gradients(self._grads_and_vars)
        self._variables = ray.experimental.TensorFlowVariables(
            self._loss, self._sess)

        # There must be one state input and one state output per entry of
        # the initial RNN state.
        assert len(self._state_inputs) == len(self._state_outputs) == \
            len(self.get_initial_state())

    def compute_actions(
            self, obs_batch, state_batches=None, is_training=False):
        """Sample actions for a batch of observations.

        Arguments:
            obs_batch: batch of observations fed to the observation input.
            state_batches (list): one value per state input placeholder;
                None is treated as an empty list.
            is_training (bool): value fed to the is_training placeholder.

        Returns:
            Tuple of (sampled actions, list of state outputs, extra fetches
            from extra_compute_action_fetches()).
        """
        state_batches = state_batches or []
        assert len(self._state_inputs) == len(state_batches), \
            (self._state_inputs, state_batches)
        feed_dict = self.extra_compute_action_feed_dict()
        feed_dict[self._obs_input] = obs_batch
        feed_dict[self._is_training] = is_training
        for ph, value in zip(self._state_inputs, state_batches):
            feed_dict[ph] = value
        # Fetch layout: [actions, *state_outputs, extra_fetches].
        fetches = self._sess.run(
            ([self._sampler] + self._state_outputs +
             [self.extra_compute_action_fetches()]), feed_dict=feed_dict)
        return fetches[0], fetches[1:-1], fetches[-1]

    def _get_loss_inputs_dict(self, postprocessed_batch):
        """Build the feed dict mapping loss placeholders to batch columns."""
        feed_dict = {}
        for key, ph in self._loss_inputs:
            # TODO(ekl) fix up handling of RNN inputs so that we can batch
            # across multiple rollouts
            if key.startswith("state_in_"):
                feed_dict[ph] = postprocessed_batch[key][:1]  # in state only
            else:
                feed_dict[ph] = postprocessed_batch[key]
        return feed_dict

    def compute_gradients(self, postprocessed_batch):
        """Compute gradients of the loss for a postprocessed batch.

        Returns:
            Tuple of (gradient values, extra fetches from
            extra_compute_grad_fetches()).
        """
        feed_dict = self.extra_compute_grad_feed_dict()
        feed_dict[self._is_training] = True
        feed_dict.update(self._get_loss_inputs_dict(postprocessed_batch))
        fetches = self._sess.run(
            [self._grads, self.extra_compute_grad_fetches()],
            feed_dict=feed_dict)
        return fetches[0], fetches[1]

    def apply_gradients(self, gradients):
        """Apply externally supplied gradient values.

        Arguments:
            gradients (list): one value per gradient tensor, in the order
                returned by compute_gradients().

        Returns:
            Extra fetches from extra_apply_grad_fetches().
        """
        assert len(gradients) == len(self._grads), (gradients, self._grads)
        feed_dict = self.extra_apply_grad_feed_dict()
        feed_dict[self._is_training] = True
        # Feed the given values in place of the gradient tensors.
        for ph, value in zip(self._grads, gradients):
            feed_dict[ph] = value
        # The session is stored as `_sess`; no `sess` attribute is ever set.
        fetches = self._sess.run(
            [self._apply_op, self.extra_apply_grad_fetches()],
            feed_dict=feed_dict)
        return fetches[1]

    def compute_apply(self, postprocessed_batch):
        """Compute and apply gradients for a batch in one session run.

        Returns:
            Tuple of (extra_compute_grad_fetches() values,
            extra_apply_grad_fetches() values).
        """
        feed_dict = self.extra_compute_grad_feed_dict()
        feed_dict.update(self.extra_apply_grad_feed_dict())
        feed_dict.update(self._get_loss_inputs_dict(postprocessed_batch))
        feed_dict[self._is_training] = True
        fetches = self._sess.run(
            [self._apply_op, self.extra_compute_grad_fetches(),
             self.extra_apply_grad_fetches()],
            feed_dict=feed_dict)
        return fetches[1], fetches[2]

    def get_weights(self):
        """Return the flattened variables tracked for the loss."""
        return self._variables.get_flat()

    def set_weights(self, weights):
        """Set the tracked variables from flattened weights."""
        return self._variables.set_flat(weights)

    def extra_compute_action_feed_dict(self):
        """Extra feed dict entries for compute_actions(); override to add."""
        return {}

    def extra_compute_action_fetches(self):
        """Extra values to fetch in compute_actions(); override to add."""
        return {}  # e.g, value function

    def extra_compute_grad_feed_dict(self):
        """Extra feed dict entries for gradient computation."""
        return {}  # e.g, kl_coeff

    def extra_compute_grad_fetches(self):
        """Extra values to fetch during gradient computation."""
        return {}  # e.g, td error

    def extra_apply_grad_feed_dict(self):
        """Extra feed dict entries for gradient application."""
        return {}

    def extra_apply_grad_fetches(self):
        """Extra values to fetch during gradient application."""
        return {}  # e.g., batch norm updates

    def optimizer(self):
        """Return the TF optimizer to use; defaults to Adam."""
        return tf.train.AdamOptimizer()

    def gradients(self, optimizer):
        """Return (gradient, variable) pairs of the loss for `optimizer`."""
        return optimizer.compute_gradients(self._loss)
|
||||
@@ -31,6 +31,12 @@ TrainingResult = namedtuple(
|
||||
# (Optional) The mean episode reward if applicable.
|
||||
"episode_reward_mean",
|
||||
|
||||
# (Optional) The min episode reward if applicable.
|
||||
"episode_reward_min",
|
||||
|
||||
# (Optional) The max episode reward if applicable.
|
||||
"episode_reward_max",
|
||||
|
||||
# (Optional) The mean episode length if applicable.
|
||||
"episode_len_mean",
|
||||
|
||||
|
||||
Reference in New Issue
Block a user