[rllib] Try moving RLlib to top level dir (#5324)

This commit is contained in:
Eric Liang
2019-08-05 23:25:49 -07:00
committed by GitHub
parent 384cbfb211
commit 5d7afe8092
309 changed files with 240 additions and 234 deletions
+1
View File
@@ -0,0 +1 @@
../../rllib
-25
View File
@@ -1,25 +0,0 @@
RLlib: Scalable Reinforcement Learning
======================================
RLlib is an open-source library for reinforcement learning that offers both high scalability and a unified API for a variety of applications.
For an overview of RLlib, see the [documentation](http://ray.readthedocs.io/en/latest/rllib.html).
If you've found RLlib useful for your research, you can cite the [paper](https://arxiv.org/abs/1712.09381) as follows:
```
@inproceedings{liang2018rllib,
Author = {Eric Liang and
Richard Liaw and
Robert Nishihara and
Philipp Moritz and
Roy Fox and
Ken Goldberg and
Joseph E. Gonzalez and
Michael I. Jordan and
Ion Stoica},
Title = {{RLlib}: Abstractions for Distributed Reinforcement Learning},
Booktitle = {International Conference on Machine Learning ({ICML})},
Year = {2018}
}
```
-65
View File
@@ -1,65 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import logging
import sys
# Note: do not introduce unnecessary library dependencies here, e.g. gym.
# This file is imported from the tune module in order to register RLlib agents.
from ray.tune.registry import register_trainable
from ray.rllib.evaluation.policy_graph import PolicyGraph
from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph
from ray.rllib.evaluation.rollout_worker import RolloutWorker
from ray.rllib.env.base_env import BaseEnv
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from ray.rllib.env.vector_env import VectorEnv
from ray.rllib.env.external_env import ExternalEnv
from ray.rllib.policy.policy import Policy
from ray.rllib.policy.tf_policy import TFPolicy
from ray.rllib.policy.sample_batch import SampleBatch
def _setup_logger():
logger = logging.getLogger("ray.rllib")
handler = logging.StreamHandler()
handler.setFormatter(
logging.Formatter(
"%(asctime)s\t%(levelname)s %(filename)s:%(lineno)s -- %(message)s"
))
logger.addHandler(handler)
logger.propagate = False
if sys.version_info[0] < 3:
logger.warn(
"RLlib Python 2 support is deprecated, and will be removed "
"in a future release.")
def _register_all():
from ray.rllib.agents.registry import ALGORITHMS
from ray.rllib.contrib.registry import CONTRIBUTED_ALGORITHMS
for key in list(ALGORITHMS.keys()) + list(CONTRIBUTED_ALGORITHMS.keys(
)) + ["__fake", "__sigmoid_fake_data", "__parameter_tuning"]:
from ray.rllib.agents.registry import get_agent_class
register_trainable(key, get_agent_class(key))
_setup_logger()
_register_all()
__all__ = [
"Policy",
"PolicyGraph",
"TFPolicy",
"TFPolicyGraph",
"RolloutWorker",
"PolicyEvaluator",
"SampleBatch",
"BaseEnv",
"MultiAgentEnv",
"VectorEnv",
"ExternalEnv",
]
-4
View File
@@ -1,4 +0,0 @@
from ray.rllib.agents.trainer import Trainer, with_common_config
from ray.rllib.agents.agent import Agent
__all__ = ["Agent", "Trainer", "with_common_config"]
-10
View File
@@ -1,10 +0,0 @@
from ray.rllib.agents.a3c.a3c import A3CTrainer, DEFAULT_CONFIG
from ray.rllib.agents.a3c.a2c import A2CTrainer
from ray.rllib.utils import renamed_agent
A2CAgent = renamed_agent(A2CTrainer)
A3CAgent = renamed_agent(A3CTrainer)
__all__ = [
"A2CAgent", "A3CAgent", "A2CTrainer", "A3CTrainer", "DEFAULT_CONFIG"
]
-25
View File
@@ -1,25 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ray.rllib.agents.a3c.a3c import DEFAULT_CONFIG as A3C_CONFIG, \
validate_config, get_policy_class
from ray.rllib.agents.a3c.a3c_tf_policy import A3CTFPolicy
from ray.rllib.agents.trainer_template import build_trainer
from ray.rllib.utils import merge_dicts
A2C_DEFAULT_CONFIG = merge_dicts(
A3C_CONFIG,
{
"sample_batch_size": 20,
"min_iter_time_s": 10,
"sample_async": False,
},
)
A2CTrainer = build_trainer(
name="A2C",
default_config=A2C_DEFAULT_CONFIG,
default_policy=A3CTFPolicy,
get_policy_class=get_policy_class,
validate_config=validate_config)
-67
View File
@@ -1,67 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ray.rllib.agents.a3c.a3c_tf_policy import A3CTFPolicy
from ray.rllib.agents.trainer import with_common_config
from ray.rllib.agents.trainer_template import build_trainer
from ray.rllib.optimizers import AsyncGradientsOptimizer
# yapf: disable
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_common_config({
# Size of rollout batch
"sample_batch_size": 10,
# Use PyTorch as backend - no LSTM support
"use_pytorch": False,
# GAE(gamma) parameter
"lambda": 1.0,
# Max global norm for each gradient calculated by worker
"grad_clip": 40.0,
# Learning rate
"lr": 0.0001,
# Learning rate schedule
"lr_schedule": None,
# Value Function Loss coefficient
"vf_loss_coeff": 0.5,
# Entropy coefficient
"entropy_coeff": 0.01,
# Min time per iteration
"min_iter_time_s": 5,
# Workers sample async. Note that this increases the effective
# sample_batch_size by up to 5x due to async buffering of batches.
"sample_async": True,
})
# __sphinx_doc_end__
# yapf: enable
def get_policy_class(config):
if config["use_pytorch"]:
from ray.rllib.agents.a3c.a3c_torch_policy import \
A3CTorchPolicy
return A3CTorchPolicy
else:
return A3CTFPolicy
def validate_config(config):
if config["entropy_coeff"] < 0:
raise DeprecationWarning("entropy_coeff must be >= 0")
if config["sample_async"] and config["use_pytorch"]:
raise ValueError(
"The sample_async option is not supported with use_pytorch: "
"Multithreading can be lead to crashes if used with pytorch.")
def make_async_optimizer(workers, config):
return AsyncGradientsOptimizer(workers, **config["optimizer"])
A3CTrainer = build_trainer(
name="A3C",
default_config=DEFAULT_CONFIG,
default_policy=A3CTFPolicy,
get_policy_class=get_policy_class,
validate_config=validate_config,
make_policy_optimizer=make_async_optimizer)
@@ -1,133 +0,0 @@
"""Note: Keep in sync with changes to VTraceTFPolicy."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import ray
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.utils.explained_variance import explained_variance
from ray.rllib.evaluation.postprocessing import compute_advantages, \
Postprocessing
from ray.rllib.policy.tf_policy_template import build_tf_policy
from ray.rllib.policy.tf_policy import LearningRateSchedule
from ray.rllib.utils import try_import_tf
tf = try_import_tf()
class A3CLoss(object):
def __init__(self,
action_dist,
actions,
advantages,
v_target,
vf,
vf_loss_coeff=0.5,
entropy_coeff=0.01):
log_prob = action_dist.logp(actions)
# The "policy gradients" loss
self.pi_loss = -tf.reduce_sum(log_prob * advantages)
delta = vf - v_target
self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta))
self.entropy = tf.reduce_sum(action_dist.entropy())
self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff -
self.entropy * entropy_coeff)
def actor_critic_loss(policy, batch_tensors):
policy.loss = A3CLoss(
policy.action_dist, batch_tensors[SampleBatch.ACTIONS],
batch_tensors[Postprocessing.ADVANTAGES],
batch_tensors[Postprocessing.VALUE_TARGETS], policy.vf,
policy.config["vf_loss_coeff"], policy.config["entropy_coeff"])
return policy.loss.total_loss
def postprocess_advantages(policy,
sample_batch,
other_agent_batches=None,
episode=None):
completed = sample_batch[SampleBatch.DONES][-1]
if completed:
last_r = 0.0
else:
next_state = []
for i in range(len(policy.state_in)):
next_state.append([sample_batch["state_out_{}".format(i)][-1]])
last_r = policy._value(sample_batch[SampleBatch.NEXT_OBS][-1],
sample_batch[SampleBatch.ACTIONS][-1],
sample_batch[SampleBatch.REWARDS][-1],
*next_state)
return compute_advantages(sample_batch, last_r, policy.config["gamma"],
policy.config["lambda"])
def add_value_function_fetch(policy):
return {SampleBatch.VF_PREDS: policy.vf}
class ValueNetworkMixin(object):
def __init__(self):
self.vf = self.model.value_function()
def _value(self, ob, prev_action, prev_reward, *args):
feed_dict = {
self.get_placeholder(SampleBatch.CUR_OBS): [ob],
self.get_placeholder(SampleBatch.PREV_ACTIONS): [prev_action],
self.get_placeholder(SampleBatch.PREV_REWARDS): [prev_reward],
self.seq_lens: [1]
}
assert len(args) == len(self.state_in), \
(args, self.state_in)
for k, v in zip(self.state_in, args):
feed_dict[k] = v
vf = self.get_session().run(self.vf, feed_dict)
return vf[0]
def stats(policy, batch_tensors):
return {
"cur_lr": tf.cast(policy.cur_lr, tf.float64),
"policy_loss": policy.loss.pi_loss,
"policy_entropy": policy.loss.entropy,
"var_gnorm": tf.global_norm([x for x in policy.var_list]),
"vf_loss": policy.loss.vf_loss,
}
def grad_stats(policy, grads):
return {
"grad_gnorm": tf.global_norm(grads),
"vf_explained_var": explained_variance(
policy.get_placeholder(Postprocessing.VALUE_TARGETS), policy.vf),
}
def clip_gradients(policy, optimizer, loss):
grads = tf.gradients(loss, policy.var_list)
grads, _ = tf.clip_by_global_norm(grads, policy.config["grad_clip"])
clipped_grads = list(zip(grads, policy.var_list))
return clipped_grads
def setup_mixins(policy, obs_space, action_space, config):
ValueNetworkMixin.__init__(policy)
LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])
policy.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
tf.get_variable_scope().name)
A3CTFPolicy = build_tf_policy(
name="A3CTFPolicy",
get_default_config=lambda: ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG,
loss_fn=actor_critic_loss,
stats_fn=stats,
grad_stats_fn=grad_stats,
gradients_fn=clip_gradients,
postprocess_fn=postprocess_advantages,
extra_action_fetches_fn=add_value_function_fetch,
before_loss_init=setup_mixins,
mixins=[ValueNetworkMixin, LearningRateSchedule])
@@ -1,91 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import torch
import torch.nn.functional as F
from torch import nn
import ray
from ray.rllib.evaluation.postprocessing import compute_advantages, \
Postprocessing
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.policy.torch_policy_template import build_torch_policy
def actor_critic_loss(policy, batch_tensors):
logits, _ = policy.model({
SampleBatch.CUR_OBS: batch_tensors[SampleBatch.CUR_OBS]
}) # TODO(ekl) seq lens shouldn't be None
values = policy.model.value_function()
dist = policy.dist_class(logits)
log_probs = dist.logp(batch_tensors[SampleBatch.ACTIONS])
policy.entropy = dist.entropy().mean()
policy.pi_err = -batch_tensors[Postprocessing.ADVANTAGES].dot(
log_probs.reshape(-1))
policy.value_err = F.mse_loss(
values.reshape(-1), batch_tensors[Postprocessing.VALUE_TARGETS])
overall_err = sum([
policy.pi_err,
policy.config["vf_loss_coeff"] * policy.value_err,
-policy.config["entropy_coeff"] * policy.entropy,
])
return overall_err
def loss_and_entropy_stats(policy, batch_tensors):
return {
"policy_entropy": policy.entropy.item(),
"policy_loss": policy.pi_err.item(),
"vf_loss": policy.value_err.item(),
}
def add_advantages(policy,
sample_batch,
other_agent_batches=None,
episode=None):
completed = sample_batch[SampleBatch.DONES][-1]
if completed:
last_r = 0.0
else:
last_r = policy._value(sample_batch[SampleBatch.NEXT_OBS][-1])
return compute_advantages(sample_batch, last_r, policy.config["gamma"],
policy.config["lambda"])
def model_value_predictions(policy, input_dict, state_batches, model):
return {SampleBatch.VF_PREDS: model.value_function().cpu().numpy()}
def apply_grad_clipping(policy):
info = {}
if policy.config["grad_clip"]:
total_norm = nn.utils.clip_grad_norm_(policy.model.parameters(),
policy.config["grad_clip"])
info["grad_gnorm"] = total_norm
return info
def torch_optimizer(policy, config):
return torch.optim.Adam(policy.model.parameters(), lr=config["lr"])
class ValueNetworkMixin(object):
def _value(self, obs):
with self.lock:
obs = torch.from_numpy(obs).float().unsqueeze(0).to(self.device)
_ = self.model({"obs": obs}, [], [1])
return self.model.value_function().detach().cpu().numpy().squeeze()
A3CTorchPolicy = build_torch_policy(
name="A3CTorchPolicy",
get_default_config=lambda: ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG,
loss_fn=actor_critic_loss,
stats_fn=loss_and_entropy_stats,
postprocess_fn=add_advantages,
extra_action_out_fn=model_value_predictions,
extra_grad_process_fn=apply_grad_clipping,
optimizer_fn=torch_optimizer,
mixins=[ValueNetworkMixin])
-8
View File
@@ -1,8 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ray.rllib.agents.trainer import Trainer
from ray.rllib.utils import renamed_agent
Agent = renamed_agent(Trainer)
-6
View File
@@ -1,6 +0,0 @@
from ray.rllib.agents.ars.ars import (ARSTrainer, DEFAULT_CONFIG)
from ray.rllib.utils import renamed_agent
ARSAgent = renamed_agent(ARSTrainer)
__all__ = ["ARSAgent", "ARSTrainer", "DEFAULT_CONFIG"]
-340
View File
@@ -1,340 +0,0 @@
# Code in this file is copied and adapted from
# https://github.com/openai/evolution-strategies-starter and from
# https://github.com/modestyachts/ARS
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import namedtuple
import logging
import numpy as np
import time
import ray
from ray.rllib.agents import Trainer, with_common_config
from ray.rllib.agents.ars import optimizers
from ray.rllib.agents.ars import policies
from ray.rllib.agents.ars import utils
from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID
from ray.rllib.utils.annotations import override
from ray.rllib.utils.memory import ray_get_and_free
from ray.rllib.utils import FilterManager
logger = logging.getLogger(__name__)
Result = namedtuple("Result", [
"noise_indices", "noisy_returns", "sign_noisy_returns", "noisy_lengths",
"eval_returns", "eval_lengths"
])
# yapf: disable
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_common_config({
"noise_stdev": 0.02, # std deviation of parameter noise
"num_rollouts": 32, # number of perturbs to try
"rollouts_used": 32, # number of perturbs to keep in gradient estimate
"num_workers": 2,
"sgd_stepsize": 0.01, # sgd step-size
"observation_filter": "MeanStdFilter",
"noise_size": 250000000,
"eval_prob": 0.03, # probability of evaluating the parameter rewards
"report_length": 10, # how many of the last rewards we average over
"offset": 0,
})
# __sphinx_doc_end__
# yapf: enable
@ray.remote
def create_shared_noise(count):
"""Create a large array of noise to be shared by all workers."""
seed = 123
noise = np.random.RandomState(seed).randn(count).astype(np.float32)
return noise
class SharedNoiseTable(object):
def __init__(self, noise):
self.noise = noise
assert self.noise.dtype == np.float32
def get(self, i, dim):
return self.noise[i:i + dim]
def sample_index(self, dim):
return np.random.randint(0, len(self.noise) - dim + 1)
def get_delta(self, dim):
idx = self.sample_index(dim)
return idx, self.get(idx, dim)
@ray.remote
class Worker(object):
def __init__(self, config, env_creator, noise, min_task_runtime=0.2):
self.min_task_runtime = min_task_runtime
self.config = config
self.noise = SharedNoiseTable(noise)
self.env = env_creator(config["env_config"])
from ray.rllib import models
self.preprocessor = models.ModelCatalog.get_preprocessor(self.env)
self.sess = utils.make_session(single_threaded=True)
self.policy = policies.GenericPolicy(
self.sess, self.env.action_space, self.env.observation_space,
self.preprocessor, config["observation_filter"], config["model"])
@property
def filters(self):
return {DEFAULT_POLICY_ID: self.policy.get_filter()}
def sync_filters(self, new_filters):
for k in self.filters:
self.filters[k].sync(new_filters[k])
def get_filters(self, flush_after=False):
return_filters = {}
for k, f in self.filters.items():
return_filters[k] = f.as_serializable()
if flush_after:
f.clear_buffer()
return return_filters
def rollout(self, timestep_limit, add_noise=False):
rollout_rewards, rollout_length = policies.rollout(
self.policy,
self.env,
timestep_limit=timestep_limit,
add_noise=add_noise,
offset=self.config["offset"])
return rollout_rewards, rollout_length
def do_rollouts(self, params, timestep_limit=None):
# Set the network weights.
self.policy.set_weights(params)
noise_indices, returns, sign_returns, lengths = [], [], [], []
eval_returns, eval_lengths = [], []
# Perform some rollouts with noise.
while (len(noise_indices) == 0):
if np.random.uniform() < self.config["eval_prob"]:
# Do an evaluation run with no perturbation.
self.policy.set_weights(params)
rewards, length = self.rollout(timestep_limit, add_noise=False)
eval_returns.append(rewards.sum())
eval_lengths.append(length)
else:
# Do a regular run with parameter perturbations.
noise_index = self.noise.sample_index(self.policy.num_params)
perturbation = self.config["noise_stdev"] * self.noise.get(
noise_index, self.policy.num_params)
# These two sampling steps could be done in parallel on
# different actors letting us update twice as frequently.
self.policy.set_weights(params + perturbation)
rewards_pos, lengths_pos = self.rollout(timestep_limit)
self.policy.set_weights(params - perturbation)
rewards_neg, lengths_neg = self.rollout(timestep_limit)
noise_indices.append(noise_index)
returns.append([rewards_pos.sum(), rewards_neg.sum()])
sign_returns.append(
[np.sign(rewards_pos).sum(),
np.sign(rewards_neg).sum()])
lengths.append([lengths_pos, lengths_neg])
return Result(
noise_indices=noise_indices,
noisy_returns=returns,
sign_noisy_returns=sign_returns,
noisy_lengths=lengths,
eval_returns=eval_returns,
eval_lengths=eval_lengths)
class ARSTrainer(Trainer):
"""Large-scale implementation of Augmented Random Search in Ray."""
_name = "ARS"
_default_config = DEFAULT_CONFIG
@override(Trainer)
def _init(self, config, env_creator):
env = env_creator(config["env_config"])
from ray.rllib import models
preprocessor = models.ModelCatalog.get_preprocessor(env)
self.sess = utils.make_session(single_threaded=False)
self.policy = policies.GenericPolicy(
self.sess, env.action_space, env.observation_space, preprocessor,
config["observation_filter"], config["model"])
self.optimizer = optimizers.SGD(self.policy, config["sgd_stepsize"])
self.rollouts_used = config["rollouts_used"]
self.num_rollouts = config["num_rollouts"]
self.report_length = config["report_length"]
# Create the shared noise table.
logger.info("Creating shared noise table.")
noise_id = create_shared_noise.remote(config["noise_size"])
self.noise = SharedNoiseTable(ray.get(noise_id))
# Create the actors.
logger.info("Creating actors.")
self.workers = [
Worker.remote(config, env_creator, noise_id)
for _ in range(config["num_workers"])
]
self.episodes_so_far = 0
self.reward_list = []
self.tstart = time.time()
@override(Trainer)
def _train(self):
config = self.config
theta = self.policy.get_weights()
assert theta.dtype == np.float32
# Put the current policy weights in the object store.
theta_id = ray.put(theta)
# Use the actors to do rollouts, note that we pass in the ID of the
# policy weights.
results, num_episodes, num_timesteps = self._collect_results(
theta_id, config["num_rollouts"])
all_noise_indices = []
all_training_returns = []
all_training_lengths = []
all_eval_returns = []
all_eval_lengths = []
# Loop over the results.
for result in results:
all_eval_returns += result.eval_returns
all_eval_lengths += result.eval_lengths
all_noise_indices += result.noise_indices
all_training_returns += result.noisy_returns
all_training_lengths += result.noisy_lengths
assert len(all_eval_returns) == len(all_eval_lengths)
assert (len(all_noise_indices) == len(all_training_returns) ==
len(all_training_lengths))
self.episodes_so_far += num_episodes
# Assemble the results.
eval_returns = np.array(all_eval_returns)
eval_lengths = np.array(all_eval_lengths)
noise_indices = np.array(all_noise_indices)
noisy_returns = np.array(all_training_returns)
noisy_lengths = np.array(all_training_lengths)
# keep only the best returns
# select top performing directions if rollouts_used < num_rollouts
max_rewards = np.max(noisy_returns, axis=1)
if self.rollouts_used > self.num_rollouts:
self.rollouts_used = self.num_rollouts
percentile = 100 * (1 - (self.rollouts_used / self.num_rollouts))
idx = np.arange(max_rewards.size)[
max_rewards >= np.percentile(max_rewards, percentile)]
noise_idx = noise_indices[idx]
noisy_returns = noisy_returns[idx, :]
# Compute and take a step.
g, count = utils.batched_weighted_sum(
noisy_returns[:, 0] - noisy_returns[:, 1],
(self.noise.get(index, self.policy.num_params)
for index in noise_idx),
batch_size=min(500, noisy_returns[:, 0].size))
g /= noise_idx.size
# scale the returns by their standard deviation
if not np.isclose(np.std(noisy_returns), 0.0):
g /= np.std(noisy_returns)
assert (g.shape == (self.policy.num_params, )
and g.dtype == np.float32)
# Compute the new weights theta.
theta, update_ratio = self.optimizer.update(-g)
# Set the new weights in the local copy of the policy.
self.policy.set_weights(theta)
# update the reward list
if len(all_eval_returns) > 0:
self.reward_list.append(eval_returns.mean())
# Now sync the filters
FilterManager.synchronize({
DEFAULT_POLICY_ID: self.policy.get_filter()
}, self.workers)
info = {
"weights_norm": np.square(theta).sum(),
"weights_std": np.std(theta),
"grad_norm": np.square(g).sum(),
"update_ratio": update_ratio,
"episodes_this_iter": noisy_lengths.size,
"episodes_so_far": self.episodes_so_far,
}
result = dict(
episode_reward_mean=np.mean(
self.reward_list[-self.report_length:]),
episode_len_mean=eval_lengths.mean(),
timesteps_this_iter=noisy_lengths.sum(),
info=info)
return result
@override(Trainer)
def _stop(self):
# workaround for https://github.com/ray-project/ray/issues/1516
for w in self.workers:
w.__ray_terminate__.remote()
@override(Trainer)
def compute_action(self, observation):
return self.policy.compute(observation, update=True)[0]
def _collect_results(self, theta_id, min_episodes):
num_episodes, num_timesteps = 0, 0
results = []
while num_episodes < min_episodes:
logger.debug(
"Collected {} episodes {} timesteps so far this iter".format(
num_episodes, num_timesteps))
rollout_ids = [
worker.do_rollouts.remote(theta_id) for worker in self.workers
]
# Get the results of the rollouts.
for result in ray_get_and_free(rollout_ids):
results.append(result)
# Update the number of episodes and the number of timesteps
# keeping in mind that result.noisy_lengths is a list of lists,
# where the inner lists have length 2.
num_episodes += sum(len(pair) for pair in result.noisy_lengths)
num_timesteps += sum(
sum(pair) for pair in result.noisy_lengths)
return results, num_episodes, num_timesteps
def __getstate__(self):
return {
"weights": self.policy.get_weights(),
"filter": self.policy.get_filter(),
"episodes_so_far": self.episodes_so_far,
}
def __setstate__(self, state):
self.episodes_so_far = state["episodes_so_far"]
self.policy.set_weights(state["weights"])
self.policy.set_filter(state["filter"])
FilterManager.synchronize({
DEFAULT_POLICY_ID: self.policy.get_filter()
}, self.workers)
-57
View File
@@ -1,57 +0,0 @@
# Code in this file is copied and adapted from
# https://github.com/openai/evolution-strategies-starter.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
class Optimizer(object):
def __init__(self, policy):
self.policy = policy
self.dim = policy.num_params
self.t = 0
def update(self, globalg):
self.t += 1
step = self._compute_step(globalg)
theta = self.policy.get_weights()
ratio = np.linalg.norm(step) / np.linalg.norm(theta)
return theta + step, ratio
def _compute_step(self, globalg):
raise NotImplementedError
class SGD(Optimizer):
def __init__(self, policy, stepsize, momentum=0.0):
Optimizer.__init__(self, policy)
self.v = np.zeros(self.dim, dtype=np.float32)
self.stepsize, self.momentum = stepsize, momentum
def _compute_step(self, globalg):
self.v = self.momentum * self.v + (1. - self.momentum) * globalg
step = -self.stepsize * self.v
return step
class Adam(Optimizer):
def __init__(self, policy, stepsize, beta1=0.9, beta2=0.999,
epsilon=1e-08):
Optimizer.__init__(self, policy)
self.stepsize = stepsize
self.beta1 = beta1
self.beta2 = beta2
self.epsilon = epsilon
self.m = np.zeros(self.dim, dtype=np.float32)
self.v = np.zeros(self.dim, dtype=np.float32)
def _compute_step(self, globalg):
a = self.stepsize * (np.sqrt(1 - self.beta2**self.t) /
(1 - self.beta1**self.t))
self.m = self.beta1 * self.m + (1 - self.beta1) * globalg
self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg)
step = -a * self.m / (np.sqrt(self.v) + self.epsilon)
return step
-115
View File
@@ -1,115 +0,0 @@
# Code in this file is copied and adapted from
# https://github.com/openai/evolution-strategies-starter.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import gym
import numpy as np
import ray
import ray.experimental.tf_utils
from ray.rllib.evaluation.sampler import _unbatch_tuple_actions
from ray.rllib.utils.filter import get_filter
from ray.rllib.models import ModelCatalog
from ray.rllib.utils import try_import_tf
tf = try_import_tf()
def rollout(policy, env, timestep_limit=None, add_noise=False, offset=0):
"""Do a rollout.
If add_noise is True, the rollout will take noisy actions with
noise drawn from that stream. Otherwise, no action noise will be added.
Parameters
----------
policy: tf object
policy from which to draw actions
env: GymEnv
environment from which to draw rewards, done, and next state
timestep_limit: int, optional
steps after which to end the rollout
add_noise: bool, optional
indicates whether exploratory action noise should be added
offset: int, optional
value to subtract from the reward. For example, survival bonus
from humanoid
"""
env_timestep_limit = env.spec.max_episode_steps
timestep_limit = (env_timestep_limit if timestep_limit is None else min(
timestep_limit, env_timestep_limit))
rews = []
t = 0
observation = env.reset()
for _ in range(timestep_limit or 999999):
ac = policy.compute(observation, add_noise=add_noise, update=True)[0]
observation, rew, done, _ = env.step(ac)
rew -= np.abs(offset)
rews.append(rew)
t += 1
if done:
break
rews = np.array(rews, dtype=np.float32)
return rews, t
class GenericPolicy(object):
def __init__(self,
sess,
action_space,
obs_space,
preprocessor,
observation_filter,
model_config,
action_noise_std=0.0):
self.sess = sess
self.action_space = action_space
self.action_noise_std = action_noise_std
self.preprocessor = preprocessor
self.observation_filter = get_filter(observation_filter,
self.preprocessor.shape)
self.inputs = tf.placeholder(tf.float32,
[None] + list(self.preprocessor.shape))
# Policy network.
dist_class, dist_dim = ModelCatalog.get_action_dist(
action_space, model_config, dist_type="deterministic")
model = ModelCatalog.get_model({
"obs": self.inputs
}, obs_space, action_space, dist_dim, model_config)
dist = dist_class(model.outputs)
self.sampler = dist.sample()
self.variables = ray.experimental.tf_utils.TensorFlowVariables(
model.outputs, self.sess)
self.num_params = sum(
np.prod(variable.shape.as_list())
for _, variable in self.variables.variables.items())
self.sess.run(tf.global_variables_initializer())
def compute(self, observation, add_noise=False, update=True):
observation = self.preprocessor.transform(observation)
observation = self.observation_filter(observation[None], update=update)
action = self.sess.run(
self.sampler, feed_dict={self.inputs: observation})
action = _unbatch_tuple_actions(action)
if add_noise and isinstance(self.action_space, gym.spaces.Box):
action += np.random.randn(*action.shape) * self.action_noise_std
return action
def set_weights(self, x):
self.variables.set_flat(x)
def set_filter(self, obs_filter):
self.observation_filter = obs_filter
def get_filter(self):
return self.observation_filter
def get_weights(self):
return self.variables.get_flat()
-63
View File
@@ -1,63 +0,0 @@
# Code in this file is copied and adapted from
# https://github.com/openai/evolution-strategies-starter.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from ray.rllib.utils import try_import_tf
tf = try_import_tf()
def compute_ranks(x):
"""Returns ranks in [0, len(x))
Note: This is different from scipy.stats.rankdata, which returns ranks in
[1, len(x)].
"""
assert x.ndim == 1
ranks = np.empty(len(x), dtype=int)
ranks[x.argsort()] = np.arange(len(x))
return ranks
def compute_centered_ranks(x):
y = compute_ranks(x.ravel()).reshape(x.shape).astype(np.float32)
y /= (x.size - 1)
y -= 0.5
return y
def make_session(single_threaded):
if not single_threaded:
return tf.Session()
return tf.Session(
config=tf.ConfigProto(
inter_op_parallelism_threads=1, intra_op_parallelism_threads=1))
def itergroups(items, group_size):
assert group_size >= 1
group = []
for x in items:
group.append(x)
if len(group) == group_size:
yield tuple(group)
del group[:]
if group:
yield tuple(group)
def batched_weighted_sum(weights, vecs, batch_size):
total = 0
num_items_summed = 0
for batch_weights, batch_vecs in zip(
itergroups(weights, batch_size), itergroups(vecs, batch_size)):
assert len(batch_weights) == len(batch_vecs) <= batch_size
total += np.dot(
np.asarray(batch_weights, dtype=np.float32),
np.asarray(batch_vecs, dtype=np.float32))
num_items_summed += len(batch_weights)
return total, num_items_summed
-1
View File
@@ -1 +0,0 @@
Implementation of deep deterministic policy gradients (https://arxiv.org/abs/1509.02971), including an Ape-X variant.
-16
View File
@@ -1,16 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ray.rllib.agents.ddpg.apex import ApexDDPGTrainer
from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, DEFAULT_CONFIG
from ray.rllib.agents.ddpg.td3 import TD3Trainer
from ray.rllib.utils import renamed_agent
ApexDDPGAgent = renamed_agent(ApexDDPGTrainer)
DDPGAgent = renamed_agent(DDPGTrainer)
__all__ = [
"DDPGAgent", "ApexDDPGAgent", "DDPGTrainer", "ApexDDPGTrainer",
"TD3Trainer", "DEFAULT_CONFIG"
]
-37
View File
@@ -1,37 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ray.rllib.agents.dqn.apex import APEX_TRAINER_PROPERTIES
from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, \
DEFAULT_CONFIG as DDPG_CONFIG
from ray.rllib.utils import merge_dicts
APEX_DDPG_DEFAULT_CONFIG = merge_dicts(
DDPG_CONFIG, # see also the options in ddpg.py, which are also supported
{
"optimizer": merge_dicts(
DDPG_CONFIG["optimizer"], {
"max_weight_sync_delay": 400,
"num_replay_buffer_shards": 4,
"debug": False
}),
"n_step": 3,
"num_gpus": 0,
"num_workers": 32,
"buffer_size": 2000000,
"learning_starts": 50000,
"train_batch_size": 512,
"sample_batch_size": 50,
"target_network_update_freq": 500000,
"timesteps_per_iteration": 25000,
"per_worker_exploration": True,
"worker_side_prioritization": True,
"min_iter_time_s": 30,
},
)
ApexDDPGTrainer = DDPGTrainer.with_updates(
name="APEX_DDPG",
default_config=APEX_DDPG_DEFAULT_CONFIG,
**APEX_TRAINER_PROPERTIES)
-222
View File
@@ -1,222 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ray.rllib.agents.trainer import with_common_config
from ray.rllib.agents.dqn.dqn import GenericOffPolicyTrainer, \
update_worker_explorations
from ray.rllib.agents.ddpg.ddpg_policy import DDPGTFPolicy
from ray.rllib.utils.schedules import ConstantSchedule, LinearSchedule
# yapf: disable
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_common_config({
# === Twin Delayed DDPG (TD3) and Soft Actor-Critic (SAC) tricks ===
# TD3: https://spinningup.openai.com/en/latest/algorithms/td3.html
# In addition to settings below, you can use "exploration_noise_type" and
# "exploration_gauss_act_noise" to get IID Gaussian exploration noise
# instead of OU exploration noise.
# twin Q-net
"twin_q": False,
# delayed policy update
"policy_delay": 1,
# target policy smoothing
# (this also replaces OU exploration noise with IID Gaussian exploration
# noise, for now)
"smooth_target_policy": False,
# gaussian stddev of target action noise for smoothing
"target_noise": 0.2,
# target noise limit (bound)
"target_noise_clip": 0.5,
# === Evaluation ===
# Evaluate with epsilon=0 every `evaluation_interval` training iterations.
# The evaluation stats will be reported under the "evaluation" metric key.
# Note that evaluation is currently not parallelized, and that for Ape-X
# metrics are already only reported for the lowest epsilon workers.
"evaluation_interval": None,
# Number of episodes to run per evaluation period.
"evaluation_num_episodes": 10,
# === Model ===
# Apply a state preprocessor with spec given by the "model" config option
# (like other RL algorithms). This is mostly useful if you have a weird
# observation shape, like an image. Auto-enabled if a custom model is set.
"use_state_preprocessor": False,
# Postprocess the policy network model output with these hidden layers. If
# use_state_preprocessor is False, then these will be the *only* hidden
# layers in the network.
"actor_hiddens": [400, 300],
# Hidden layers activation of the postprocessing stage of the policy
# network
"actor_hidden_activation": "relu",
# Postprocess the critic network model output with these hidden layers;
# again, if use_state_preprocessor is True, then the state will be
# preprocessed by the model specified with the "model" config option first.
"critic_hiddens": [400, 300],
# Hidden layers activation of the postprocessing state of the critic.
"critic_hidden_activation": "relu",
# N-step Q learning
"n_step": 1,
# === Exploration ===
# Turns on annealing schedule for exploration noise. Exploration is
# annealed from 1.0 to exploration_final_eps over schedule_max_timesteps
# scaled by exploration_fraction. Original DDPG and TD3 papers do not
# anneal noise, so this is False by default.
"exploration_should_anneal": False,
# Max num timesteps for annealing schedules.
"schedule_max_timesteps": 100000,
# Number of env steps to optimize for before returning
"timesteps_per_iteration": 1000,
# Fraction of entire training period over which the exploration rate is
# annealed
"exploration_fraction": 0.1,
# Final scaling multiplier for action noise (initial is 1.0)
"exploration_final_scale": 0.02,
# valid values: "ou" (time-correlated, like original DDPG paper),
# "gaussian" (IID, like TD3 paper)
"exploration_noise_type": "ou",
# OU-noise scale; this can be used to scale down magnitude of OU noise
# before adding to actions (requires "exploration_noise_type" to be "ou")
"exploration_ou_noise_scale": 0.1,
# theta for OU
"exploration_ou_theta": 0.15,
# sigma for OU
"exploration_ou_sigma": 0.2,
# gaussian stddev of act noise for exploration (requires
# "exploration_noise_type" to be "gaussian")
"exploration_gaussian_sigma": 0.1,
# If True parameter space noise will be used for exploration
# See https://blog.openai.com/better-exploration-with-parameter-noise/
"parameter_noise": False,
# Until this many timesteps have elapsed, the agent's policy will be
# ignored & it will instead take uniform random actions. Can be used in
# conjunction with learning_starts (which controls when the first
# optimization step happens) to decrease dependence of exploration &
# optimization on initial policy parameters. Note that this will be
# disabled when the action noise scale is set to 0 (e.g during evaluation).
"pure_exploration_steps": 1000,
# Extra configuration that disables exploration.
"evaluation_config": {
"exploration_fraction": 0,
"exploration_final_eps": 0,
},
# === Replay buffer ===
# Size of the replay buffer. Note that if async_updates is set, then
# each worker will have a replay buffer of this size.
"buffer_size": 50000,
# If True prioritized replay buffer will be used.
"prioritized_replay": True,
# Alpha parameter for prioritized replay buffer.
"prioritized_replay_alpha": 0.6,
# Beta parameter for sampling from prioritized replay buffer.
"prioritized_replay_beta": 0.4,
# Fraction of entire training period over which the beta parameter is
# annealed
"beta_annealing_fraction": 0.2,
# Final value of beta
"final_prioritized_replay_beta": 0.4,
# Epsilon to add to the TD errors when updating priorities.
"prioritized_replay_eps": 1e-6,
# Whether to LZ4 compress observations
"compress_observations": False,
# === Optimization ===
# Learning rate for the critic (Q-function) optimizer.
"critic_lr": 1e-3,
# Learning rate for the actor (policy) optimizer.
"actor_lr": 1e-3,
# Update the target network every `target_network_update_freq` steps.
"target_network_update_freq": 0,
# Update the target by \tau * policy + (1-\tau) * target_policy
"tau": 0.002,
# If True, use huber loss instead of squared loss for critic network
# Conventionally, no need to clip gradients if using a huber loss
"use_huber": False,
# Threshold of a huber loss
"huber_threshold": 1.0,
# Weights for L2 regularization
"l2_reg": 1e-6,
# If not None, clip gradients during optimization at this value
"grad_norm_clipping": None,
# How many steps of the model to sample before learning starts.
"learning_starts": 1500,
# Update the replay buffer with this many samples at once. Note that this
# setting applies per-worker if num_workers > 1.
"sample_batch_size": 1,
# Size of a batched sampled from replay buffer for training. Note that
# if async_updates is set, then each worker returns gradients for a
# batch of this size.
"train_batch_size": 256,
# === Parallelism ===
# Number of workers for collecting samples with. This only makes sense
# to increase if your environment is particularly slow to sample, or if
# you're using the Async or Ape-X optimizers.
"num_workers": 0,
# Whether to use a distribution of epsilons across workers for exploration.
"per_worker_exploration": False,
# Whether to compute priorities on workers.
"worker_side_prioritization": False,
# Prevent iterations from going lower than this time span
"min_iter_time_s": 1,
})
# __sphinx_doc_end__
# yapf: enable
def make_exploration_schedule(config, worker_index):
# Modification of DQN's schedule to take into account
# `exploration_ou_noise_scale`
if config["per_worker_exploration"]:
assert config["num_workers"] > 1, "This requires multiple workers"
if worker_index >= 0:
# Exploration constants from the Ape-X paper
max_index = float(config["num_workers"] - 1)
exponent = 1 + worker_index / max_index * 7
return ConstantSchedule(0.4**exponent)
else:
# local ev should have zero exploration so that eval rollouts
# run properly
return ConstantSchedule(0.0)
elif config["exploration_should_anneal"]:
return LinearSchedule(
schedule_timesteps=int(config["exploration_fraction"] *
config["schedule_max_timesteps"]),
initial_p=1.0,
final_p=config["exploration_final_scale"])
else:
# *always* add exploration noise
return ConstantSchedule(1.0)
def setup_ddpg_exploration(trainer):
trainer.exploration0 = make_exploration_schedule(trainer.config, -1)
trainer.explorations = [
make_exploration_schedule(trainer.config, i)
for i in range(trainer.config["num_workers"])
]
def add_pure_exploration_phase(trainer):
global_timestep = trainer.optimizer.num_steps_sampled
pure_expl_steps = trainer.config["pure_exploration_steps"]
if pure_expl_steps:
# tell workers whether they should do pure exploration
only_explore = global_timestep < pure_expl_steps
trainer.workers.local_worker().foreach_trainable_policy(
lambda p, _: p.set_pure_exploration_phase(only_explore))
for e in trainer.workers.remote_workers():
e.foreach_trainable_policy.remote(
lambda p, _: p.set_pure_exploration_phase(only_explore))
update_worker_explorations(trainer)
DDPGTrainer = GenericOffPolicyTrainer.with_updates(
name="DDPG",
default_config=DEFAULT_CONFIG,
default_policy=DDPGTFPolicy,
before_init=setup_ddpg_exploration,
before_train_step=add_pure_exploration_phase)
-246
View File
@@ -1,246 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.utils import try_import_tf
tf = try_import_tf()
class DDPGModel(TFModelV2):
"""Extension of standard TFModel for DDPG.
Data flow:
obs -> forward() -> model_out
model_out -> get_policy_output() -> pi(s)
model_out, actions -> get_q_values() -> Q(s, a)
model_out, actions -> get_twin_q_values() -> Q_twin(s, a)
Note that this class by itself is not a valid model unless you
implement forward() in a subclass."""
def __init__(self,
obs_space,
action_space,
num_outputs,
model_config,
name,
actor_hidden_activation="relu",
actor_hiddens=(400, 300),
critic_hidden_activation="relu",
critic_hiddens=(400, 300),
parameter_noise=False,
twin_q=False,
exploration_ou_sigma=0.2):
"""Initialize variables of this model.
Extra model kwargs:
actor_hidden_activation (str): activation for actor network
actor_hiddens (list): hidden layers sizes for actor network
critic_hidden_activation (str): activation for critic network
critic_hiddens (list): hidden layers sizes for critic network
parameter_noise (bool): use param noise exploration
twin_q (bool): build twin Q networks
exploration_ou_sigma (float): ou noise sigma for exploration
Note that the core layers for forward() are not defined here, this
only defines the layers for the output heads. Those layers for
forward() should be defined in subclasses of DDPGModel.
"""
super(DDPGModel, self).__init__(obs_space, action_space, num_outputs,
model_config, name)
self.exploration_ou_sigma = exploration_ou_sigma
self.action_dim = np.product(action_space.shape)
self.model_out = tf.keras.layers.Input(
shape=(num_outputs, ), name="model_out")
self.actions = tf.keras.layers.Input(
shape=(self.action_dim, ), name="actions")
def build_action_net(action_out):
activation = getattr(tf.nn, actor_hidden_activation)
i = 0
for hidden in actor_hiddens:
if parameter_noise:
import tensorflow.contrib.layers as layers
action_out = layers.fully_connected(
action_out,
num_outputs=hidden,
activation_fn=activation,
normalizer_fn=layers.layer_norm)
else:
action_out = tf.layers.dense(
action_out,
units=hidden,
activation=activation,
name="action_hidden_{}".format(i))
i += 1
return tf.layers.dense(
action_out,
units=self.action_dim,
activation=None,
name="action_out")
action_scope = name + "/action_net"
# TODO(ekl) use keras layers instead of variable scopes
def build_action_net_scope(model_out):
with tf.variable_scope(action_scope, reuse=tf.AUTO_REUSE):
return build_action_net(model_out)
pi_out = tf.keras.layers.Lambda(build_action_net_scope)(self.model_out)
self.action_net = tf.keras.Model(self.model_out, pi_out)
self.register_variables(self.action_net.variables)
# Noise vars for P network except for layer normalization vars
if parameter_noise:
with tf.variable_scope(action_scope, reuse=tf.AUTO_REUSE):
self._build_parameter_noise([
var for var in self.action_net.variables
if "LayerNorm" not in var.name
])
def build_q_net(name, model_out, actions):
q_out = tf.keras.layers.Concatenate(axis=1)([model_out, actions])
activation = getattr(tf.nn, critic_hidden_activation)
for i, n in enumerate(critic_hiddens):
q_out = tf.keras.layers.Dense(
n,
name="{}_hidden_{}".format(name, i),
activation=activation)(q_out)
q_out = tf.keras.layers.Dense(
1, activation=None, name="{}_out".format(name))(q_out)
return tf.keras.Model([model_out, actions], q_out)
self.q_net = build_q_net("q", self.model_out, self.actions)
self.register_variables(self.q_net.variables)
if twin_q:
self.twin_q_net = build_q_net("twin_q", self.model_out,
self.actions)
self.register_variables(self.twin_q_net.variables)
else:
self.twin_q_net = None
def forward(self, input_dict, state, seq_lens):
"""This generates the model_out tensor input.
You must implement this as documented in modelv2.py."""
raise NotImplementedError
def get_policy_output(self, model_out):
"""Return the (unscaled) output of the policy network.
This returns the unscaled outputs of pi(s).
Arguments:
model_out (Tensor): obs embeddings from the model layers, of shape
[BATCH_SIZE, num_outputs].
Returns:
tensor of shape [BATCH_SIZE, action_dim] with range [-inf, inf].
"""
return self.action_net(model_out)
def get_q_values(self, model_out, actions):
"""Return the Q estimates for the most recent forward pass.
This implements Q(s, a).
Arguments:
model_out (Tensor): obs embeddings from the model layers, of shape
[BATCH_SIZE, num_outputs].
actions (Tensor): action values that correspond with the most
recent batch of observations passed through forward(), of shape
[BATCH_SIZE, action_dim].
Returns:
tensor of shape [BATCH_SIZE].
"""
return self.q_net([model_out, actions])
def get_twin_q_values(self, model_out, actions):
"""Same as get_q_values but using the twin Q net.
This implements the twin Q(s, a).
Arguments:
model_out (Tensor): obs embeddings from the model layers, of shape
[BATCH_SIZE, num_outputs].
actions (Tensor): action values that correspond with the most
recent batch of observations passed through forward(), of shape
[BATCH_SIZE, action_dim].
Returns:
tensor of shape [BATCH_SIZE].
"""
return self.twin_q_net([model_out, actions])
def policy_variables(self):
"""Return the list of variables for the policy net."""
return list(self.action_net.variables)
def q_variables(self):
"""Return the list of variables for Q / twin Q nets."""
return self.q_net.variables + (self.twin_q_net.variables
if self.twin_q_net else [])
def update_action_noise(self, session, distance_in_action_space,
exploration_ou_sigma, cur_noise_scale):
"""Update the model action noise settings.
This is called internally by the DDPG policy."""
self.pi_distance = distance_in_action_space
if (distance_in_action_space < exploration_ou_sigma * cur_noise_scale):
# multiplying the sampled OU noise by noise scale is
# equivalent to multiplying the sigma of OU by noise scale
self.parameter_noise_sigma_val *= 1.01
else:
self.parameter_noise_sigma_val /= 1.01
self.parameter_noise_sigma.load(
self.parameter_noise_sigma_val, session=session)
def _build_parameter_noise(self, pnet_params):
assert pnet_params
self.parameter_noise_sigma_val = self.exploration_ou_sigma
self.parameter_noise_sigma = tf.get_variable(
initializer=tf.constant_initializer(
self.parameter_noise_sigma_val),
name="parameter_noise_sigma",
shape=(),
trainable=False,
dtype=tf.float32)
self.parameter_noise = []
# No need to add any noise on LayerNorm parameters
for var in pnet_params:
noise_var = tf.get_variable(
name=var.name.split(":")[0] + "_noise",
shape=var.shape,
initializer=tf.constant_initializer(.0),
trainable=False)
self.parameter_noise.append(noise_var)
remove_noise_ops = list()
for var, var_noise in zip(pnet_params, self.parameter_noise):
remove_noise_ops.append(tf.assign_add(var, -var_noise))
self.remove_noise_op = tf.group(*tuple(remove_noise_ops))
generate_noise_ops = list()
for var_noise in self.parameter_noise:
generate_noise_ops.append(
tf.assign(
var_noise,
tf.random_normal(
shape=var_noise.shape,
stddev=self.parameter_noise_sigma)))
with tf.control_dependencies(generate_noise_ops):
add_noise_ops = list()
for var, var_noise in zip(pnet_params, self.parameter_noise):
add_noise_ops.append(tf.assign_add(var, var_noise))
self.add_noise_op = tf.group(*tuple(add_noise_ops))
self.pi_distance = None
-507
View File
@@ -1,507 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from gym.spaces import Box
import numpy as np
import logging
import ray
import ray.experimental.tf_utils
from ray.rllib.agents.ddpg.ddpg_model import DDPGModel
from ray.rllib.agents.ddpg.noop_model import NoopModel
from ray.rllib.agents.dqn.dqn_policy import _postprocess_dqn, PRIO_WEIGHTS
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.policy.tf_policy_template import build_tf_policy
from ray.rllib.models import ModelCatalog
from ray.rllib.utils.annotations import override
from ray.rllib.utils.error import UnsupportedSpaceException
from ray.rllib.policy.policy import Policy
from ray.rllib.policy.tf_policy import TFPolicy
from ray.rllib.utils import try_import_tf
from ray.rllib.utils.tf_ops import huber_loss, minimize_and_clip
tf = try_import_tf()
logger = logging.getLogger(__name__)
def build_ddpg_model(policy, obs_space, action_space, config):
if config["model"]["custom_model"]:
logger.warning(
"Setting use_state_preprocessor=True since a custom model "
"was specified.")
config["use_state_preprocessor"] = True
if not isinstance(action_space, Box):
raise UnsupportedSpaceException(
"Action space {} is not supported for DDPG.".format(action_space))
if len(action_space.shape) > 1:
raise UnsupportedSpaceException(
"Action space has multiple dimensions "
"{}. ".format(action_space.shape) +
"Consider reshaping this into a single dimension, "
"using a Tuple action space, or the multi-agent API.")
if config["use_state_preprocessor"]:
default_model = None # catalog decides
num_outputs = 256 # arbitrary
config["model"]["no_final_linear"] = True
else:
default_model = NoopModel
num_outputs = int(np.product(obs_space.shape))
policy.model = ModelCatalog.get_model_v2(
obs_space,
action_space,
num_outputs,
config["model"],
framework="tf",
model_interface=DDPGModel,
default_model=default_model,
name="ddpg_model",
actor_hidden_activation=config["actor_hidden_activation"],
actor_hiddens=config["actor_hiddens"],
critic_hidden_activation=config["critic_hidden_activation"],
critic_hiddens=config["critic_hiddens"],
parameter_noise=config["parameter_noise"],
twin_q=config["twin_q"])
policy.target_model = ModelCatalog.get_model_v2(
obs_space,
action_space,
num_outputs,
config["model"],
framework="tf",
model_interface=DDPGModel,
default_model=default_model,
name="target_ddpg_model",
actor_hidden_activation=config["actor_hidden_activation"],
actor_hiddens=config["actor_hiddens"],
critic_hidden_activation=config["critic_hidden_activation"],
critic_hiddens=config["critic_hiddens"],
parameter_noise=config["parameter_noise"],
twin_q=config["twin_q"])
return policy.model
def postprocess_trajectory(policy,
sample_batch,
other_agent_batches=None,
episode=None):
if policy.config["parameter_noise"]:
policy.adjust_param_noise_sigma(sample_batch)
return _postprocess_dqn(policy, sample_batch)
def exploration_setting_inputs(policy):
return {
policy.stochastic: True,
policy.noise_scale: policy.cur_noise_scale,
policy.pure_exploration_phase: policy.cur_pure_exploration_phase,
}
def build_action_output(policy, model, input_dict, obs_space, action_space,
config):
model_out, _ = model({
"obs": input_dict[SampleBatch.CUR_OBS],
"is_training": policy._get_is_training_placeholder(),
}, [], None)
action_out = model.get_policy_output(model_out)
# Use sigmoid to scale to [0,1], but also double magnitude of input to
# emulate behaviour of tanh activation used in DDPG and TD3 papers.
sigmoid_out = tf.nn.sigmoid(2 * action_out)
# Rescale to actual env policy scale
# (shape of sigmoid_out is [batch_size, dim_actions], so we reshape to
# get same dims)
action_range = (action_space.high - action_space.low)[None]
low_action = action_space.low[None]
deterministic_actions = action_range * sigmoid_out + low_action
noise_type = config["exploration_noise_type"]
action_low = action_space.low
action_high = action_space.high
action_range = action_space.high - action_low
def compute_stochastic_actions():
def make_noisy_actions():
# shape of deterministic_actions is [None, dim_action]
if noise_type == "gaussian":
# add IID Gaussian noise for exploration, TD3-style
normal_sample = policy.noise_scale * tf.random_normal(
tf.shape(deterministic_actions),
stddev=config["exploration_gaussian_sigma"])
stochastic_actions = tf.clip_by_value(
deterministic_actions + normal_sample,
action_low * tf.ones_like(deterministic_actions),
action_high * tf.ones_like(deterministic_actions))
elif noise_type == "ou":
# add OU noise for exploration, DDPG-style
zero_acts = action_low.size * [.0]
exploration_sample = tf.get_variable(
name="ornstein_uhlenbeck",
dtype=tf.float32,
initializer=zero_acts,
trainable=False)
normal_sample = tf.random_normal(
shape=[action_low.size], mean=0.0, stddev=1.0)
ou_new = config["exploration_ou_theta"] \
* -exploration_sample \
+ config["exploration_ou_sigma"] * normal_sample
exploration_value = tf.assign_add(exploration_sample, ou_new)
base_scale = config["exploration_ou_noise_scale"]
noise = policy.noise_scale * base_scale \
* exploration_value * action_range
stochastic_actions = tf.clip_by_value(
deterministic_actions + noise,
action_low * tf.ones_like(deterministic_actions),
action_high * tf.ones_like(deterministic_actions))
else:
raise ValueError(
"Unknown noise type '%s' (try 'ou' or 'gaussian')" %
noise_type)
return stochastic_actions
def make_uniform_random_actions():
# pure random exploration option
uniform_random_actions = tf.random_uniform(
tf.shape(deterministic_actions))
# rescale uniform random actions according to action range
tf_range = tf.constant(action_range[None], dtype="float32")
tf_low = tf.constant(action_low[None], dtype="float32")
uniform_random_actions = uniform_random_actions * tf_range \
+ tf_low
return uniform_random_actions
stochastic_actions = tf.cond(
# need to condition on noise_scale > 0 because zeroing
# noise_scale is how a worker signals no noise should be used
# (this is ugly and should be fixed by adding an "eval_mode"
# config flag or something)
tf.logical_and(policy.pure_exploration_phase,
policy.noise_scale > 0),
true_fn=make_uniform_random_actions,
false_fn=make_noisy_actions)
return stochastic_actions
enable_stochastic = tf.logical_and(policy.stochastic,
not config["parameter_noise"])
actions = tf.cond(enable_stochastic, compute_stochastic_actions,
lambda: deterministic_actions)
policy.output_actions = actions
return actions, None
def actor_critic_loss(policy, batch_tensors):
model_out_t, _ = policy.model({
"obs": batch_tensors[SampleBatch.CUR_OBS],
"is_training": policy._get_is_training_placeholder(),
}, [], None)
model_out_tp1, _ = policy.model({
"obs": batch_tensors[SampleBatch.NEXT_OBS],
"is_training": policy._get_is_training_placeholder(),
}, [], None)
target_model_out_tp1, _ = policy.target_model({
"obs": batch_tensors[SampleBatch.NEXT_OBS],
"is_training": policy._get_is_training_placeholder(),
}, [], None)
policy_t = policy.model.get_policy_output(model_out_t)
policy_tp1 = policy.model.get_policy_output(model_out_tp1)
if policy.config["smooth_target_policy"]:
target_noise_clip = policy.config["target_noise_clip"]
clipped_normal_sample = tf.clip_by_value(
tf.random_normal(
tf.shape(policy_tp1), stddev=policy.config["target_noise"]),
-target_noise_clip, target_noise_clip)
policy_tp1_smoothed = tf.clip_by_value(
policy_tp1 + clipped_normal_sample,
policy.action_space.low * tf.ones_like(policy_tp1),
policy.action_space.high * tf.ones_like(policy_tp1))
else:
policy_tp1_smoothed = policy_tp1
# q network evaluation
q_t = policy.model.get_q_values(model_out_t,
batch_tensors[SampleBatch.ACTIONS])
if policy.config["twin_q"]:
twin_q_t = policy.model.get_twin_q_values(
model_out_t, batch_tensors[SampleBatch.ACTIONS])
# Q-values for current policy (no noise) in given current state
q_t_det_policy = policy.model.get_q_values(model_out_t, policy_t)
# target q network evaluation
q_tp1 = policy.target_model.get_q_values(target_model_out_tp1,
policy_tp1_smoothed)
if policy.config["twin_q"]:
twin_q_tp1 = policy.target_model.get_twin_q_values(
target_model_out_tp1, policy_tp1_smoothed)
q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1)
if policy.config["twin_q"]:
twin_q_t_selected = tf.squeeze(twin_q_t, axis=len(q_t.shape) - 1)
q_tp1 = tf.minimum(q_tp1, twin_q_tp1)
q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1)
q_tp1_best_masked = (1.0 - tf.cast(batch_tensors[SampleBatch.DONES],
tf.float32)) * q_tp1_best
# compute RHS of bellman equation
q_t_selected_target = tf.stop_gradient(
batch_tensors[SampleBatch.REWARDS] +
policy.config["gamma"]**policy.config["n_step"] * q_tp1_best_masked)
# compute the error (potentially clipped)
if policy.config["twin_q"]:
td_error = q_t_selected - q_t_selected_target
twin_td_error = twin_q_t_selected - q_t_selected_target
td_error = td_error + twin_td_error
if policy.config["use_huber"]:
errors = huber_loss(td_error, policy.config["huber_threshold"]) \
+ huber_loss(twin_td_error, policy.config["huber_threshold"])
else:
errors = 0.5 * tf.square(td_error) + 0.5 * tf.square(twin_td_error)
else:
td_error = q_t_selected - q_t_selected_target
if policy.config["use_huber"]:
errors = huber_loss(td_error, policy.config["huber_threshold"])
else:
errors = 0.5 * tf.square(td_error)
critic_loss = policy.model.custom_loss(
tf.reduce_mean(batch_tensors[PRIO_WEIGHTS] * errors), batch_tensors)
actor_loss = -tf.reduce_mean(q_t_det_policy)
if policy.config["l2_reg"] is not None:
for var in policy.model.policy_variables():
if "bias" not in var.name:
actor_loss += policy.config["l2_reg"] * tf.nn.l2_loss(var)
for var in policy.model.q_variables():
if "bias" not in var.name:
critic_loss += policy.config["l2_reg"] * tf.nn.l2_loss(var)
# save for stats function
policy.q_t = q_t
policy.td_error = td_error
policy.actor_loss = actor_loss
policy.critic_loss = critic_loss
# in a custom apply op we handle the losses separately, but return them
# combined in one loss for now
return actor_loss + critic_loss
def gradients(policy, optimizer, loss):
if policy.config["grad_norm_clipping"] is not None:
actor_grads_and_vars = minimize_and_clip(
policy._actor_optimizer,
policy.actor_loss,
var_list=policy.model.policy_variables(),
clip_val=policy.config["grad_norm_clipping"])
critic_grads_and_vars = minimize_and_clip(
policy._critic_optimizer,
policy.critic_loss,
var_list=policy.model.q_variables(),
clip_val=policy.config["grad_norm_clipping"])
else:
actor_grads_and_vars = policy._actor_optimizer.compute_gradients(
policy.actor_loss, var_list=policy.model.policy_variables())
critic_grads_and_vars = policy._critic_optimizer.compute_gradients(
policy.critic_loss, var_list=policy.model.q_variables())
# save these for later use in build_apply_op
policy._actor_grads_and_vars = [(g, v) for (g, v) in actor_grads_and_vars
if g is not None]
policy._critic_grads_and_vars = [(g, v) for (g, v) in critic_grads_and_vars
if g is not None]
grads_and_vars = (
policy._actor_grads_and_vars + policy._critic_grads_and_vars)
return grads_and_vars
def apply_gradients(policy, optimizer, grads_and_vars):
# for policy gradient, update policy net one time v.s.
# update critic net `policy_delay` time(s)
should_apply_actor_opt = tf.equal(
tf.mod(policy.global_step, policy.config["policy_delay"]), 0)
def make_apply_op():
return policy._actor_optimizer.apply_gradients(
policy._actor_grads_and_vars)
actor_op = tf.cond(
should_apply_actor_opt,
true_fn=make_apply_op,
false_fn=lambda: tf.no_op())
critic_op = policy._critic_optimizer.apply_gradients(
policy._critic_grads_and_vars)
# increment global step & apply ops
with tf.control_dependencies([tf.assign_add(policy.global_step, 1)]):
return tf.group(actor_op, critic_op)
def stats(policy, batch_tensors):
return {
"td_error": tf.reduce_mean(policy.td_error),
"actor_loss": tf.reduce_mean(policy.actor_loss),
"critic_loss": tf.reduce_mean(policy.critic_loss),
"mean_q": tf.reduce_mean(policy.q_t),
"max_q": tf.reduce_max(policy.q_t),
"min_q": tf.reduce_min(policy.q_t),
}
class ExplorationStateMixin(object):
def __init__(self, obs_space, action_space, config):
self.cur_noise_scale = 1.0
self.cur_pure_exploration_phase = False
self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
self.noise_scale = tf.placeholder(tf.float32, (), name="noise_scale")
self.pure_exploration_phase = tf.placeholder(
tf.bool, (), name="pure_exploration_phase")
def add_parameter_noise(self):
if self.config["parameter_noise"]:
self.get_session().run(self.model.add_noise_op)
def adjust_param_noise_sigma(self, sample_batch):
# adjust the sigma of parameter space noise
states, noisy_actions = [
list(x) for x in sample_batch.columns(
[SampleBatch.CUR_OBS, SampleBatch.ACTIONS])
]
self.get_session().run(self.model.remove_noise_op)
clean_actions = self.get_session().run(
self.output_actions,
feed_dict={
self.get_placeholder(SampleBatch.CUR_OBS): states,
self.stochastic: False,
self.noise_scale: .0,
self.pure_exploration_phase: False,
})
distance_in_action_space = np.sqrt(
np.mean(np.square(clean_actions - noisy_actions)))
self.model.update_action_noise(
self.get_session(), distance_in_action_space,
self.config["exploration_ou_sigma"], self.cur_noise_scale)
def set_epsilon(self, epsilon):
# set_epsilon is called by optimizer to anneal exploration as
# necessary, and to turn it off during evaluation. The "epsilon" part
# is a carry-over from DQN, which uses epsilon-greedy exploration
# rather than adding action noise to the output of a policy network.
self.cur_noise_scale = epsilon
def set_pure_exploration_phase(self, pure_exploration_phase):
self.cur_pure_exploration_phase = pure_exploration_phase
@override(Policy)
def get_state(self):
return [
TFPolicy.get_state(self), self.cur_noise_scale,
self.cur_pure_exploration_phase
]
@override(Policy)
def set_state(self, state):
TFPolicy.set_state(self, state[0])
self.set_epsilon(state[1])
self.set_pure_exploration_phase(state[2])
class TargetNetworkMixin(object):
def __init__(self, config):
# update_target_fn will be called periodically to copy Q network to
# target Q network
self.tau_value = config.get("tau")
self.tau = tf.placeholder(tf.float32, (), name="tau")
update_target_expr = []
model_vars = self.model.trainable_variables()
target_model_vars = self.target_model.trainable_variables()
assert len(model_vars) == len(target_model_vars), \
(model_vars, target_model_vars)
for var, var_target in zip(model_vars, target_model_vars):
update_target_expr.append(
var_target.assign(self.tau * var +
(1.0 - self.tau) * var_target))
logger.debug("Update target op {}".format(var_target))
self.update_target_expr = tf.group(*update_target_expr)
# Hard initial update
self.update_target(tau=1.0)
# support both hard and soft sync
def update_target(self, tau=None):
tau = tau or self.tau_value
return self.get_session().run(
self.update_target_expr, feed_dict={self.tau: tau})
class ActorCriticOptimizerMixin(object):
def __init__(self, config):
# create global step for counting the number of update operations
self.global_step = tf.train.get_or_create_global_step()
# use separate optimizers for actor & critic
self._actor_optimizer = tf.train.AdamOptimizer(
learning_rate=config["actor_lr"])
self._critic_optimizer = tf.train.AdamOptimizer(
learning_rate=config["critic_lr"])
class ComputeTDErrorMixin(object):
def compute_td_error(self, obs_t, act_t, rew_t, obs_tp1, done_mask,
importance_weights):
if not self.loss_initialized():
return np.zeros_like(rew_t)
td_err = self.get_session().run(
self.td_error,
feed_dict={
self.get_placeholder(SampleBatch.CUR_OBS): [
np.array(ob) for ob in obs_t
],
self.get_placeholder(SampleBatch.ACTIONS): act_t,
self.get_placeholder(SampleBatch.REWARDS): rew_t,
self.get_placeholder(SampleBatch.NEXT_OBS): [
np.array(ob) for ob in obs_tp1
],
self.get_placeholder(SampleBatch.DONES): done_mask,
self.get_placeholder(PRIO_WEIGHTS): importance_weights
})
return td_err
def setup_early_mixins(policy, obs_space, action_space, config):
ExplorationStateMixin.__init__(policy, obs_space, action_space, config)
ActorCriticOptimizerMixin.__init__(policy, config)
def setup_late_mixins(policy, obs_space, action_space, config):
TargetNetworkMixin.__init__(policy, config)
DDPGTFPolicy = build_tf_policy(
name="DDPGTFPolicy",
get_default_config=lambda: ray.rllib.agents.ddpg.ddpg.DEFAULT_CONFIG,
make_model=build_ddpg_model,
postprocess_fn=postprocess_trajectory,
extra_action_feed_fn=exploration_setting_inputs,
action_sampler_fn=build_action_output,
loss_fn=actor_critic_loss,
stats_fn=stats,
gradients_fn=gradients,
apply_gradients_fn=apply_gradients,
extra_learn_fetches_fn=lambda policy: {"td_error": policy.td_error},
mixins=[
TargetNetworkMixin, ExplorationStateMixin, ActorCriticOptimizerMixin,
ComputeTDErrorMixin
],
before_init=setup_early_mixins,
after_init=setup_late_mixins,
obs_include_prev_action_reward=False)
@@ -1,20 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ray.rllib.models import Model
from ray.rllib.utils.annotations import override
from ray.rllib.utils import try_import_tf
tf = try_import_tf()
class NoopModel(Model):
"""Trivial model that just returns the obs flattened.
This is the model used if use_state_preprocessor=False."""
@override(Model)
def _build_layers_v2(self, input_dict, num_outputs, options):
out = tf.reshape(input_dict["obs"], [-1, num_outputs])
return out, out
-57
View File
@@ -1,57 +0,0 @@
"""A more stable successor to TD3.
By default, this uses a near-identical configuration to that reported in the
TD3 paper.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, \
DEFAULT_CONFIG as DDPG_CONFIG
from ray.rllib.utils import merge_dicts
TD3_DEFAULT_CONFIG = merge_dicts(
DDPG_CONFIG,
{
# largest changes: twin Q functions, delayed policy updates, and target
# smoothing
"twin_q": True,
"policy_delay": 2,
"smooth_target_policy": True,
"target_noise": 0.2,
"target_noise_clip": 0.5,
# other changes & things we want to keep fixed: IID Gaussian
# exploration noise, larger actor learning rate, no l2 regularisation,
# no Huber loss, etc.
"exploration_should_anneal": False,
"exploration_noise_type": "gaussian",
"exploration_gaussian_sigma": 0.1,
"learning_starts": 10000,
"pure_exploration_steps": 10000,
"actor_hiddens": [400, 300],
"critic_hiddens": [400, 300],
"n_step": 1,
"gamma": 0.99,
"actor_lr": 1e-3,
"critic_lr": 1e-3,
"l2_reg": 0.0,
"tau": 5e-3,
"train_batch_size": 100,
"use_huber": False,
"target_network_update_freq": 0,
"num_workers": 0,
"num_gpus_per_worker": 0,
"per_worker_exploration": False,
"worker_side_prioritization": False,
"buffer_size": 1000000,
"prioritized_replay": False,
"clip_rewards": False,
"use_state_preprocessor": False,
},
)
TD3Trainer = DDPGTrainer.with_updates(
name="TD3", default_config=TD3_DEFAULT_CONFIG)
-1
View File
@@ -1 +0,0 @@
Code in this package is adapted from https://github.com/openai/baselines/tree/master/baselines/deepq.
-15
View File
@@ -1,15 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ray.rllib.agents.dqn.apex import ApexTrainer
from ray.rllib.agents.dqn.dqn import DQNTrainer, SimpleQTrainer, DEFAULT_CONFIG
from ray.rllib.utils import renamed_agent
DQNAgent = renamed_agent(DQNTrainer)
ApexAgent = renamed_agent(ApexTrainer)
__all__ = [
"DQNAgent", "ApexAgent", "ApexTrainer", "DQNTrainer", "DEFAULT_CONFIG",
"SimpleQTrainer"
]
-84
View File
@@ -1,84 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ray.rllib.agents.dqn.dqn import DQNTrainer, DEFAULT_CONFIG as DQN_CONFIG
from ray.rllib.optimizers import AsyncReplayOptimizer
from ray.rllib.utils import merge_dicts
# yapf: disable
# __sphinx_doc_begin__
APEX_DEFAULT_CONFIG = merge_dicts(
DQN_CONFIG, # see also the options in dqn.py, which are also supported
{
"optimizer": merge_dicts(
DQN_CONFIG["optimizer"], {
"max_weight_sync_delay": 400,
"num_replay_buffer_shards": 4,
"debug": False
}),
"n_step": 3,
"num_gpus": 1,
"num_workers": 32,
"buffer_size": 2000000,
"learning_starts": 50000,
"train_batch_size": 512,
"sample_batch_size": 50,
"target_network_update_freq": 500000,
"timesteps_per_iteration": 25000,
"per_worker_exploration": True,
"worker_side_prioritization": True,
"min_iter_time_s": 30,
},
)
# __sphinx_doc_end__
# yapf: enable
def defer_make_workers(trainer, env_creator, policy, config):
# Hack to workaround https://github.com/ray-project/ray/issues/2541
# The workers will be creatd later, after the optimizer is created
return trainer._make_workers(env_creator, policy, config, 0)
def make_async_optimizer(workers, config):
assert len(workers.remote_workers()) == 0
extra_config = config["optimizer"].copy()
for key in [
"prioritized_replay", "prioritized_replay_alpha",
"prioritized_replay_beta", "prioritized_replay_eps"
]:
if key in config:
extra_config[key] = config[key]
opt = AsyncReplayOptimizer(
workers,
learning_starts=config["learning_starts"],
buffer_size=config["buffer_size"],
train_batch_size=config["train_batch_size"],
sample_batch_size=config["sample_batch_size"],
**extra_config)
workers.add_workers(config["num_workers"])
opt._set_workers(workers.remote_workers())
return opt
def update_target_based_on_num_steps_trained(trainer, fetches):
# Ape-X updates based on num steps trained, not sampled
if (trainer.optimizer.num_steps_trained -
trainer.state["last_target_update_ts"] >
trainer.config["target_network_update_freq"]):
trainer.workers.local_worker().foreach_trainable_policy(
lambda p, _: p.update_target())
trainer.state["last_target_update_ts"] = (
trainer.optimizer.num_steps_trained)
trainer.state["num_target_updates"] += 1
APEX_TRAINER_PROPERTIES = {
"make_workers": defer_make_workers,
"make_policy_optimizer": make_async_optimizer,
"after_optimizer_step": update_target_based_on_num_steps_trained,
}
ApexTrainer = DQNTrainer.with_updates(
name="APEX", default_config=APEX_DEFAULT_CONFIG, **APEX_TRAINER_PROPERTIES)
@@ -1,261 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.utils import try_import_tf
tf = try_import_tf()
class DistributionalQModel(TFModelV2):
"""Extension of standard TFModel to provide distributional Q values.
It also supports options for noisy nets and parameter space noise.
Data flow:
obs -> forward() -> model_out
model_out -> get_q_value_distributions() -> Q(s, a) atoms
model_out -> get_state_value() -> V(s)
Note that this class by itself is not a valid model unless you
implement forward() in a subclass."""
def __init__(self,
obs_space,
action_space,
num_outputs,
model_config,
name,
q_hiddens=(256, ),
dueling=False,
num_atoms=1,
use_noisy=False,
v_min=-10.0,
v_max=10.0,
sigma0=0.5,
parameter_noise=False):
"""Initialize variables of this model.
Extra model kwargs:
q_hiddens (list): defines size of hidden layers for the q head.
These will be used to postprocess the model output for the
purposes of computing Q values.
dueling (bool): whether to build the state value head for DDQN
num_atoms (int): if >1, enables distributional DQN
use_noisy (bool): use noisy nets
v_min (float): min value support for distributional DQN
v_max (float): max value support for distributional DQN
sigma0 (float): initial value of noisy nets
parameter_noise (bool): enable layer norm for param noise
Note that the core layers for forward() are not defined here, this
only defines the layers for the Q head. Those layers for forward()
should be defined in subclasses of DistributionalQModel.
"""
super(DistributionalQModel, self).__init__(
obs_space, action_space, num_outputs, model_config, name)
# setup the Q head output (i.e., model for get_q_values)
self.model_out = tf.keras.layers.Input(
shape=(num_outputs, ), name="model_out")
def build_action_value(model_out):
if q_hiddens:
action_out = model_out
for i in range(len(q_hiddens)):
if use_noisy:
action_out = self._noisy_layer(
"hidden_%d" % i, action_out, q_hiddens[i], sigma0)
elif parameter_noise:
import tensorflow.contrib.layers as layers
action_out = layers.fully_connected(
action_out,
num_outputs=q_hiddens[i],
activation_fn=tf.nn.relu,
normalizer_fn=layers.layer_norm)
else:
action_out = tf.layers.dense(
action_out,
units=q_hiddens[i],
activation=tf.nn.relu,
name="hidden_%d" % i)
else:
# Avoid postprocessing the outputs. This enables custom models
# to be used for parametric action DQN.
action_out = model_out
if use_noisy:
action_scores = self._noisy_layer(
"output",
action_out,
self.action_space.n * num_atoms,
sigma0,
non_linear=False)
elif q_hiddens:
action_scores = tf.layers.dense(
action_out,
units=self.action_space.n * num_atoms,
activation=None)
else:
action_scores = model_out
if num_atoms > 1:
# Distributional Q-learning uses a discrete support z
# to represent the action value distribution
z = tf.range(num_atoms, dtype=tf.float32)
z = v_min + z * (v_max - v_min) / float(num_atoms - 1)
support_logits_per_action = tf.reshape(
tensor=action_scores,
shape=(-1, self.action_space.n, num_atoms))
support_prob_per_action = tf.nn.softmax(
logits=support_logits_per_action)
action_scores = tf.reduce_sum(
input_tensor=z * support_prob_per_action, axis=-1)
logits = support_logits_per_action
dist = support_prob_per_action
return [
action_scores, z, support_logits_per_action, logits, dist
]
else:
logits = tf.expand_dims(tf.ones_like(action_scores), -1)
dist = tf.expand_dims(tf.ones_like(action_scores), -1)
return [action_scores, logits, dist]
def build_state_score(model_out):
state_out = model_out
for i in range(len(q_hiddens)):
if use_noisy:
state_out = self._noisy_layer("dueling_hidden_%d" % i,
state_out, q_hiddens[i],
sigma0)
elif parameter_noise:
state_out = tf.contrib.layers.fully_connected(
state_out,
num_outputs=q_hiddens[i],
activation_fn=tf.nn.relu,
normalizer_fn=tf.contrib.layers.layer_norm)
else:
state_out = tf.layers.dense(
state_out, units=q_hiddens[i], activation=tf.nn.relu)
if use_noisy:
state_score = self._noisy_layer(
"dueling_output",
state_out,
num_atoms,
sigma0,
non_linear=False)
else:
state_score = tf.layers.dense(
state_out, units=num_atoms, activation=None)
return state_score
def build_action_value_in_scope(model_out):
with tf.variable_scope(
name + "/action_value", reuse=tf.AUTO_REUSE):
return build_action_value(model_out)
def build_state_score_in_scope(model_out):
with tf.variable_scope(name + "/state_value", reuse=tf.AUTO_REUSE):
return build_state_score(model_out)
q_out = tf.keras.layers.Lambda(build_action_value_in_scope)(
self.model_out)
self.q_value_head = tf.keras.Model(self.model_out, q_out)
self.register_variables(self.q_value_head.variables)
if dueling:
state_out = tf.keras.layers.Lambda(build_state_score_in_scope)(
self.model_out)
self.state_value_head = tf.keras.Model(self.model_out, state_out)
self.register_variables(self.state_value_head.variables)
def forward(self, input_dict, state, seq_lens):
"""This generates the model_out tensor input.
You must implement this as documented in modelv2.py."""
raise NotImplementedError
def get_q_value_distributions(self, model_out):
"""Returns distributional values for Q(s, a) given a state embedding.
Override this in your custom model to customize the Q output head.
Arguments:
model_out (Tensor): embedding from the model layers
Returns:
(action_scores, logits, dist) if num_atoms == 1, otherwise
(action_scores, z, support_logits_per_action, logits, dist)
"""
return self.q_value_head(model_out)
def get_state_value(self, model_out):
"""Returns the state value prediction for the given state embedding."""
return self.state_value_head(model_out)
def _noisy_layer(self,
prefix,
action_in,
out_size,
sigma0,
non_linear=True):
"""
a common dense layer: y = w^{T}x + b
a noisy layer: y = (w + \epsilon_w*\sigma_w)^{T}x +
(b+\epsilon_b*\sigma_b)
where \epsilon are random variables sampled from factorized normal
distributions and \sigma are trainable variables which are expected to
vanish along the training procedure
"""
import tensorflow.contrib.layers as layers
in_size = int(action_in.shape[1])
epsilon_in = tf.random_normal(shape=[in_size])
epsilon_out = tf.random_normal(shape=[out_size])
epsilon_in = self._f_epsilon(epsilon_in)
epsilon_out = self._f_epsilon(epsilon_out)
epsilon_w = tf.matmul(
a=tf.expand_dims(epsilon_in, -1), b=tf.expand_dims(epsilon_out, 0))
epsilon_b = epsilon_out
sigma_w = tf.get_variable(
name=prefix + "_sigma_w",
shape=[in_size, out_size],
dtype=tf.float32,
initializer=tf.random_uniform_initializer(
minval=-1.0 / np.sqrt(float(in_size)),
maxval=1.0 / np.sqrt(float(in_size))))
# TF noise generation can be unreliable on GPU
# If generating the noise on the CPU,
# lowering sigma0 to 0.1 may be helpful
sigma_b = tf.get_variable(
name=prefix + "_sigma_b",
shape=[out_size],
dtype=tf.float32, # 0.5~GPU, 0.1~CPU
initializer=tf.constant_initializer(
sigma0 / np.sqrt(float(in_size))))
w = tf.get_variable(
name=prefix + "_fc_w",
shape=[in_size, out_size],
dtype=tf.float32,
initializer=layers.xavier_initializer())
b = tf.get_variable(
name=prefix + "_fc_b",
shape=[out_size],
dtype=tf.float32,
initializer=tf.zeros_initializer())
action_activation = tf.nn.xw_plus_b(action_in, w + sigma_w * epsilon_w,
b + sigma_b * epsilon_b)
if not non_linear:
return action_activation
return tf.nn.relu(action_activation)
def _f_epsilon(self, x):
return tf.sign(x) * tf.sqrt(tf.abs(x))
-300
View File
@@ -1,300 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import logging
from ray import tune
from ray.rllib.agents.trainer import with_common_config
from ray.rllib.agents.trainer_template import build_trainer
from ray.rllib.agents.dqn.dqn_policy import DQNTFPolicy
from ray.rllib.agents.dqn.simple_q_policy import SimpleQPolicy
from ray.rllib.optimizers import SyncReplayOptimizer
from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID
from ray.rllib.utils.schedules import ConstantSchedule, LinearSchedule
logger = logging.getLogger(__name__)
# yapf: disable
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_common_config({
# === Model ===
# Number of atoms for representing the distribution of return. When
# this is greater than 1, distributional Q-learning is used.
# the discrete supports are bounded by v_min and v_max
"num_atoms": 1,
"v_min": -10.0,
"v_max": 10.0,
# Whether to use noisy network
"noisy": False,
# control the initial value of noisy nets
"sigma0": 0.5,
# Whether to use dueling dqn
"dueling": True,
# Whether to use double dqn
"double_q": True,
# Postprocess model outputs with these hidden layers to compute the
# state and action values. See also the model config in catalog.py.
"hiddens": [256],
# N-step Q learning
"n_step": 1,
# === Exploration ===
# Max num timesteps for annealing schedules. Exploration is annealed from
# 1.0 to exploration_fraction over this number of timesteps scaled by
# exploration_fraction
"schedule_max_timesteps": 100000,
# Minimum env steps to optimize for per train call. This value does
# not affect learning, only the length of iterations.
"timesteps_per_iteration": 1000,
# Fraction of entire training period over which the exploration rate is
# annealed
"exploration_fraction": 0.1,
# Final value of random action probability
"exploration_final_eps": 0.02,
# Update the target network every `target_network_update_freq` steps.
"target_network_update_freq": 500,
# Use softmax for sampling actions. Required for off policy estimation.
"soft_q": False,
# Softmax temperature. Q values are divided by this value prior to softmax.
# Softmax approaches argmax as the temperature drops to zero.
"softmax_temp": 1.0,
# If True parameter space noise will be used for exploration
# See https://blog.openai.com/better-exploration-with-parameter-noise/
"parameter_noise": False,
# Extra configuration that disables exploration.
"evaluation_config": {
"exploration_fraction": 0,
"exploration_final_eps": 0,
},
# === Replay buffer ===
# Size of the replay buffer. Note that if async_updates is set, then
# each worker will have a replay buffer of this size.
"buffer_size": 50000,
# If True prioritized replay buffer will be used.
"prioritized_replay": True,
# Alpha parameter for prioritized replay buffer.
"prioritized_replay_alpha": 0.6,
# Beta parameter for sampling from prioritized replay buffer.
"prioritized_replay_beta": 0.4,
# Fraction of entire training period over which the beta parameter is
# annealed
"beta_annealing_fraction": 0.2,
# Final value of beta
"final_prioritized_replay_beta": 0.4,
# Epsilon to add to the TD errors when updating priorities.
"prioritized_replay_eps": 1e-6,
# Whether to LZ4 compress observations
"compress_observations": True,
# === Optimization ===
# Learning rate for adam optimizer
"lr": 5e-4,
# Learning rate schedule
"lr_schedule": None,
# Adam epsilon hyper parameter
"adam_epsilon": 1e-8,
# If not None, clip gradients during optimization at this value
"grad_norm_clipping": 40,
# How many steps of the model to sample before learning starts.
"learning_starts": 1000,
# Update the replay buffer with this many samples at once. Note that
# this setting applies per-worker if num_workers > 1.
"sample_batch_size": 4,
# Size of a batched sampled from replay buffer for training. Note that
# if async_updates is set, then each worker returns gradients for a
# batch of this size.
"train_batch_size": 32,
# === Parallelism ===
# Number of workers for collecting samples with. This only makes sense
# to increase if your environment is particularly slow to sample, or if
# you"re using the Async or Ape-X optimizers.
"num_workers": 0,
# Whether to use a distribution of epsilons across workers for exploration.
"per_worker_exploration": False,
# Whether to compute priorities on workers.
"worker_side_prioritization": False,
# Prevent iterations from going lower than this time span
"min_iter_time_s": 1,
})
# __sphinx_doc_end__
# yapf: enable
def make_optimizer(workers, config):
return SyncReplayOptimizer(
workers,
learning_starts=config["learning_starts"],
buffer_size=config["buffer_size"],
prioritized_replay=config["prioritized_replay"],
prioritized_replay_alpha=config["prioritized_replay_alpha"],
prioritized_replay_beta=config["prioritized_replay_beta"],
schedule_max_timesteps=config["schedule_max_timesteps"],
beta_annealing_fraction=config["beta_annealing_fraction"],
final_prioritized_replay_beta=config["final_prioritized_replay_beta"],
prioritized_replay_eps=config["prioritized_replay_eps"],
train_batch_size=config["train_batch_size"],
sample_batch_size=config["sample_batch_size"],
**config["optimizer"])
def check_config_and_setup_param_noise(config):
"""Update the config based on settings.
Rewrites sample_batch_size to take into account n_step truncation, and also
adds the necessary callbacks to support parameter space noise exploration.
"""
# Update effective batch size to include n-step
adjusted_batch_size = max(config["sample_batch_size"],
config.get("n_step", 1))
config["sample_batch_size"] = adjusted_batch_size
if config.get("parameter_noise", False):
if config["batch_mode"] != "complete_episodes":
raise ValueError("Exploration with parameter space noise requires "
"batch_mode to be complete_episodes.")
if config.get("noisy", False):
raise ValueError(
"Exploration with parameter space noise and noisy network "
"cannot be used at the same time.")
if config["callbacks"]["on_episode_start"]:
start_callback = config["callbacks"]["on_episode_start"]
else:
start_callback = None
def on_episode_start(info):
# as a callback function to sample and pose parameter space
# noise on the parameters of network
policies = info["policy"]
for pi in policies.values():
pi.add_parameter_noise()
if start_callback:
start_callback(info)
config["callbacks"]["on_episode_start"] = tune.function(
on_episode_start)
if config["callbacks"]["on_episode_end"]:
end_callback = config["callbacks"]["on_episode_end"]
else:
end_callback = None
def on_episode_end(info):
# as a callback function to monitor the distance
# between noisy policy and original policy
policies = info["policy"]
episode = info["episode"]
episode.custom_metrics["policy_distance"] = policies[
DEFAULT_POLICY_ID].model.pi_distance
if end_callback:
end_callback(info)
config["callbacks"]["on_episode_end"] = tune.function(on_episode_end)
def get_initial_state(config):
return {
"last_target_update_ts": 0,
"num_target_updates": 0,
}
def make_exploration_schedule(config, worker_index):
# Use either a different `eps` per worker, or a linear schedule.
if config["per_worker_exploration"]:
assert config["num_workers"] > 1, \
"This requires multiple workers"
if worker_index >= 0:
# Exploration constants from the Ape-X paper
exponent = (
1 + worker_index / float(config["num_workers"] - 1) * 7)
return ConstantSchedule(0.4**exponent)
else:
# local ev should have zero exploration so that eval rollouts
# run properly
return ConstantSchedule(0.0)
return LinearSchedule(
schedule_timesteps=int(
config["exploration_fraction"] * config["schedule_max_timesteps"]),
initial_p=1.0,
final_p=config["exploration_final_eps"])
def setup_exploration(trainer):
trainer.exploration0 = make_exploration_schedule(trainer.config, -1)
trainer.explorations = [
make_exploration_schedule(trainer.config, i)
for i in range(trainer.config["num_workers"])
]
def update_worker_explorations(trainer):
global_timestep = trainer.optimizer.num_steps_sampled
exp_vals = [trainer.exploration0.value(global_timestep)]
trainer.workers.local_worker().foreach_trainable_policy(
lambda p, _: p.set_epsilon(exp_vals[0]))
for i, e in enumerate(trainer.workers.remote_workers()):
exp_val = trainer.explorations[i].value(global_timestep)
e.foreach_trainable_policy.remote(lambda p, _: p.set_epsilon(exp_val))
exp_vals.append(exp_val)
trainer.train_start_timestep = global_timestep
trainer.cur_exp_vals = exp_vals
def add_trainer_metrics(trainer, result):
global_timestep = trainer.optimizer.num_steps_sampled
result.update(
timesteps_this_iter=global_timestep - trainer.train_start_timestep,
info=dict({
"min_exploration": min(trainer.cur_exp_vals),
"max_exploration": max(trainer.cur_exp_vals),
"num_target_updates": trainer.state["num_target_updates"],
}, **trainer.optimizer.stats()))
def update_target_if_needed(trainer, fetches):
global_timestep = trainer.optimizer.num_steps_sampled
if global_timestep - trainer.state["last_target_update_ts"] > \
trainer.config["target_network_update_freq"]:
trainer.workers.local_worker().foreach_trainable_policy(
lambda p, _: p.update_target())
trainer.state["last_target_update_ts"] = global_timestep
trainer.state["num_target_updates"] += 1
def collect_metrics(trainer):
if trainer.config["per_worker_exploration"]:
# Only collect metrics from the third of workers with lowest eps
result = trainer.collect_metrics(
selected_workers=trainer.workers.remote_workers()[
-len(trainer.workers.remote_workers()) // 3:])
else:
result = trainer.collect_metrics()
return result
def disable_exploration(trainer):
trainer.evaluation_workers.local_worker().foreach_policy(
lambda p, _: p.set_epsilon(0))
GenericOffPolicyTrainer = build_trainer(
name="GenericOffPolicyAlgorithm",
default_policy=None,
default_config=DEFAULT_CONFIG,
validate_config=check_config_and_setup_param_noise,
get_initial_state=get_initial_state,
make_policy_optimizer=make_optimizer,
before_init=setup_exploration,
before_train_step=update_worker_explorations,
after_optimizer_step=update_target_if_needed,
after_train_result=add_trainer_metrics,
collect_metrics_fn=collect_metrics,
before_evaluate_fn=disable_exploration)
DQNTrainer = GenericOffPolicyTrainer.with_updates(
name="DQN", default_policy=DQNTFPolicy, default_config=DEFAULT_CONFIG)
SimpleQTrainer = DQNTrainer.with_updates(default_policy=SimpleQPolicy)
-504
View File
@@ -1,504 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from gym.spaces import Discrete
import numpy as np
from scipy.stats import entropy
import ray
from ray.rllib.agents.dqn.distributional_q_model import DistributionalQModel
from ray.rllib.agents.dqn.simple_q_policy import ExplorationStateMixin, \
TargetNetworkMixin
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.models import ModelCatalog
from ray.rllib.models.tf.tf_action_dist import Categorical
from ray.rllib.utils.error import UnsupportedSpaceException
from ray.rllib.policy.tf_policy import LearningRateSchedule
from ray.rllib.policy.tf_policy_template import build_tf_policy
from ray.rllib.utils.tf_ops import huber_loss, reduce_mean_ignore_inf, \
minimize_and_clip
from ray.rllib.utils import try_import_tf
tf = try_import_tf()
Q_SCOPE = "q_func"
Q_TARGET_SCOPE = "target_q_func"
# Importance sampling weights for prioritized replay
PRIO_WEIGHTS = "weights"
class QLoss(object):
def __init__(self,
q_t_selected,
q_logits_t_selected,
q_tp1_best,
q_dist_tp1_best,
importance_weights,
rewards,
done_mask,
gamma=0.99,
n_step=1,
num_atoms=1,
v_min=-10.0,
v_max=10.0):
if num_atoms > 1:
# Distributional Q-learning which corresponds to an entropy loss
z = tf.range(num_atoms, dtype=tf.float32)
z = v_min + z * (v_max - v_min) / float(num_atoms - 1)
# (batch_size, 1) * (1, num_atoms) = (batch_size, num_atoms)
r_tau = tf.expand_dims(
rewards, -1) + gamma**n_step * tf.expand_dims(
1.0 - done_mask, -1) * tf.expand_dims(z, 0)
r_tau = tf.clip_by_value(r_tau, v_min, v_max)
b = (r_tau - v_min) / ((v_max - v_min) / float(num_atoms - 1))
lb = tf.floor(b)
ub = tf.ceil(b)
# indispensable judgement which is missed in most implementations
# when b happens to be an integer, lb == ub, so pr_j(s', a*) will
# be discarded because (ub-b) == (b-lb) == 0
floor_equal_ceil = tf.to_float(tf.less(ub - lb, 0.5))
l_project = tf.one_hot(
tf.cast(lb, dtype=tf.int32),
num_atoms) # (batch_size, num_atoms, num_atoms)
u_project = tf.one_hot(
tf.cast(ub, dtype=tf.int32),
num_atoms) # (batch_size, num_atoms, num_atoms)
ml_delta = q_dist_tp1_best * (ub - b + floor_equal_ceil)
mu_delta = q_dist_tp1_best * (b - lb)
ml_delta = tf.reduce_sum(
l_project * tf.expand_dims(ml_delta, -1), axis=1)
mu_delta = tf.reduce_sum(
u_project * tf.expand_dims(mu_delta, -1), axis=1)
m = ml_delta + mu_delta
# Rainbow paper claims that using this cross entropy loss for
# priority is robust and insensitive to `prioritized_replay_alpha`
self.td_error = tf.nn.softmax_cross_entropy_with_logits(
labels=m, logits=q_logits_t_selected)
self.loss = tf.reduce_mean(self.td_error * importance_weights)
self.stats = {
# TODO: better Q stats for dist dqn
"mean_td_error": tf.reduce_mean(self.td_error),
}
else:
q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best
# compute RHS of bellman equation
q_t_selected_target = rewards + gamma**n_step * q_tp1_best_masked
# compute the error (potentially clipped)
self.td_error = (
q_t_selected - tf.stop_gradient(q_t_selected_target))
self.loss = tf.reduce_mean(
importance_weights * huber_loss(self.td_error))
self.stats = {
"mean_q": tf.reduce_mean(q_t_selected),
"min_q": tf.reduce_min(q_t_selected),
"max_q": tf.reduce_max(q_t_selected),
"mean_td_error": tf.reduce_mean(self.td_error),
}
class QValuePolicy(object):
def __init__(self, q_values, observations, num_actions, stochastic, eps,
softmax, softmax_temp):
if softmax:
action_dist = Categorical(q_values / softmax_temp)
self.action = action_dist.sample()
self.action_prob = action_dist.sampled_action_prob()
return
deterministic_actions = tf.argmax(q_values, axis=1)
batch_size = tf.shape(observations)[0]
# Special case masked out actions (q_value ~= -inf) so that we don't
# even consider them for exploration.
random_valid_action_logits = tf.where(
tf.equal(q_values, tf.float32.min),
tf.ones_like(q_values) * tf.float32.min, tf.ones_like(q_values))
random_actions = tf.squeeze(
tf.multinomial(random_valid_action_logits, 1), axis=1)
chose_random = tf.random_uniform(
tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
stochastic_actions = tf.where(chose_random, random_actions,
deterministic_actions)
self.action = tf.cond(stochastic, lambda: stochastic_actions,
lambda: deterministic_actions)
self.action_prob = None
class ComputeTDErrorMixin(object):
def compute_td_error(self, obs_t, act_t, rew_t, obs_tp1, done_mask,
importance_weights):
if not self.loss_initialized():
return np.zeros_like(rew_t)
td_err = self.get_session().run(
self.q_loss.td_error,
feed_dict={
self.get_placeholder(SampleBatch.CUR_OBS): [
np.array(ob) for ob in obs_t
],
self.get_placeholder(SampleBatch.ACTIONS): act_t,
self.get_placeholder(SampleBatch.REWARDS): rew_t,
self.get_placeholder(SampleBatch.NEXT_OBS): [
np.array(ob) for ob in obs_tp1
],
self.get_placeholder(SampleBatch.DONES): done_mask,
self.get_placeholder(PRIO_WEIGHTS): importance_weights,
})
return td_err
def postprocess_trajectory(policy,
sample_batch,
other_agent_batches=None,
episode=None):
if policy.config["parameter_noise"]:
# adjust the sigma of parameter space noise
states = [list(x) for x in sample_batch.columns(["obs"])][0]
noisy_action_distribution = policy.get_session().run(
policy.action_probs, feed_dict={policy.cur_observations: states})
policy.get_session().run(policy.remove_noise_op)
clean_action_distribution = policy.get_session().run(
policy.action_probs, feed_dict={policy.cur_observations: states})
distance_in_action_space = np.mean(
entropy(clean_action_distribution.T, noisy_action_distribution.T))
policy.pi_distance = distance_in_action_space
if (distance_in_action_space <
-np.log(1 - policy.cur_epsilon +
policy.cur_epsilon / policy.num_actions)):
policy.parameter_noise_sigma_val *= 1.01
else:
policy.parameter_noise_sigma_val /= 1.01
policy.parameter_noise_sigma.load(
policy.parameter_noise_sigma_val, session=policy.get_session())
return _postprocess_dqn(policy, sample_batch)
def build_q_model(policy, obs_space, action_space, config):
if not isinstance(action_space, Discrete):
raise UnsupportedSpaceException(
"Action space {} is not supported for DQN.".format(action_space))
if config["hiddens"]:
# try to infer the last layer size, otherwise fall back to 256
num_outputs = ([256] + config["model"]["fcnet_hiddens"])[-1]
config["model"]["no_final_linear"] = True
else:
num_outputs = action_space.n
policy.q_model = ModelCatalog.get_model_v2(
obs_space,
action_space,
num_outputs,
config["model"],
framework="tf",
model_interface=DistributionalQModel,
name=Q_SCOPE,
num_atoms=config["num_atoms"],
q_hiddens=config["hiddens"],
dueling=config["dueling"],
use_noisy=config["noisy"],
v_min=config["v_min"],
v_max=config["v_max"],
sigma0=config["sigma0"],
parameter_noise=config["parameter_noise"])
policy.target_q_model = ModelCatalog.get_model_v2(
obs_space,
action_space,
num_outputs,
config["model"],
framework="tf",
model_interface=DistributionalQModel,
name=Q_TARGET_SCOPE,
num_atoms=config["num_atoms"],
q_hiddens=config["hiddens"],
dueling=config["dueling"],
use_noisy=config["noisy"],
v_min=config["v_min"],
v_max=config["v_max"],
sigma0=config["sigma0"],
parameter_noise=config["parameter_noise"])
return policy.q_model
def build_q_networks(policy, q_model, input_dict, obs_space, action_space,
config):
# Action Q network
q_values, q_logits, q_dist = _compute_q_values(
policy, q_model, input_dict[SampleBatch.CUR_OBS], obs_space,
action_space)
policy.q_values = q_values
policy.q_func_vars = q_model.variables()
# Noise vars for Q network except for layer normalization vars
if config["parameter_noise"]:
_build_parameter_noise(
policy,
[var for var in policy.q_func_vars if "LayerNorm" not in var.name])
policy.action_probs = tf.nn.softmax(policy.q_values)
# Action outputs
qvp = QValuePolicy(q_values, input_dict[SampleBatch.CUR_OBS],
action_space.n, policy.stochastic, policy.eps,
config["soft_q"], config["softmax_temp"])
policy.output_actions, policy.action_prob = qvp.action, qvp.action_prob
return policy.output_actions, policy.action_prob
def _build_parameter_noise(policy, pnet_params):
policy.parameter_noise_sigma_val = 1.0
policy.parameter_noise_sigma = tf.get_variable(
initializer=tf.constant_initializer(policy.parameter_noise_sigma_val),
name="parameter_noise_sigma",
shape=(),
trainable=False,
dtype=tf.float32)
policy.parameter_noise = list()
# No need to add any noise on LayerNorm parameters
for var in pnet_params:
noise_var = tf.get_variable(
name=var.name.split(":")[0] + "_noise",
shape=var.shape,
initializer=tf.constant_initializer(.0),
trainable=False)
policy.parameter_noise.append(noise_var)
remove_noise_ops = list()
for var, var_noise in zip(pnet_params, policy.parameter_noise):
remove_noise_ops.append(tf.assign_add(var, -var_noise))
policy.remove_noise_op = tf.group(*tuple(remove_noise_ops))
generate_noise_ops = list()
for var_noise in policy.parameter_noise:
generate_noise_ops.append(
tf.assign(
var_noise,
tf.random_normal(
shape=var_noise.shape,
stddev=policy.parameter_noise_sigma)))
with tf.control_dependencies(generate_noise_ops):
add_noise_ops = list()
for var, var_noise in zip(pnet_params, policy.parameter_noise):
add_noise_ops.append(tf.assign_add(var, var_noise))
policy.add_noise_op = tf.group(*tuple(add_noise_ops))
policy.pi_distance = None
def build_q_losses(policy, batch_tensors):
config = policy.config
# q network evaluation
q_t, q_logits_t, q_dist_t = _compute_q_values(
policy, policy.q_model, batch_tensors[SampleBatch.CUR_OBS],
policy.observation_space, policy.action_space)
# target q network evalution
q_tp1, q_logits_tp1, q_dist_tp1 = _compute_q_values(
policy, policy.target_q_model, batch_tensors[SampleBatch.NEXT_OBS],
policy.observation_space, policy.action_space)
policy.target_q_func_vars = policy.target_q_model.variables()
# q scores for actions which we know were selected in the given state.
one_hot_selection = tf.one_hot(
tf.cast(batch_tensors[SampleBatch.ACTIONS], tf.int32),
policy.action_space.n)
q_t_selected = tf.reduce_sum(q_t * one_hot_selection, 1)
q_logits_t_selected = tf.reduce_sum(
q_logits_t * tf.expand_dims(one_hot_selection, -1), 1)
# compute estimate of best possible value starting from state at t + 1
if config["double_q"]:
q_tp1_using_online_net, q_logits_tp1_using_online_net, \
q_dist_tp1_using_online_net = _compute_q_values(
policy, policy.q_model,
batch_tensors[SampleBatch.NEXT_OBS],
policy.observation_space, policy.action_space)
q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
q_tp1_best_one_hot_selection = tf.one_hot(q_tp1_best_using_online_net,
policy.action_space.n)
q_tp1_best = tf.reduce_sum(q_tp1 * q_tp1_best_one_hot_selection, 1)
q_dist_tp1_best = tf.reduce_sum(
q_dist_tp1 * tf.expand_dims(q_tp1_best_one_hot_selection, -1), 1)
else:
q_tp1_best_one_hot_selection = tf.one_hot(
tf.argmax(q_tp1, 1), policy.action_space.n)
q_tp1_best = tf.reduce_sum(q_tp1 * q_tp1_best_one_hot_selection, 1)
q_dist_tp1_best = tf.reduce_sum(
q_dist_tp1 * tf.expand_dims(q_tp1_best_one_hot_selection, -1), 1)
policy.q_loss = QLoss(
q_t_selected, q_logits_t_selected, q_tp1_best, q_dist_tp1_best,
batch_tensors[PRIO_WEIGHTS], batch_tensors[SampleBatch.REWARDS],
tf.cast(batch_tensors[SampleBatch.DONES],
tf.float32), config["gamma"], config["n_step"],
config["num_atoms"], config["v_min"], config["v_max"])
return policy.q_loss.loss
def adam_optimizer(policy, config):
return tf.train.AdamOptimizer(
learning_rate=policy.cur_lr, epsilon=config["adam_epsilon"])
def clip_gradients(policy, optimizer, loss):
if policy.config["grad_norm_clipping"] is not None:
grads_and_vars = minimize_and_clip(
optimizer,
loss,
var_list=policy.q_func_vars,
clip_val=policy.config["grad_norm_clipping"])
else:
grads_and_vars = optimizer.compute_gradients(
loss, var_list=policy.q_func_vars)
grads_and_vars = [(g, v) for (g, v) in grads_and_vars if g is not None]
return grads_and_vars
def exploration_setting_inputs(policy):
return {
policy.stochastic: True,
policy.eps: policy.cur_epsilon,
}
def build_q_stats(policy, batch_tensors):
return dict({
"cur_lr": tf.cast(policy.cur_lr, tf.float64),
}, **policy.q_loss.stats)
def setup_early_mixins(policy, obs_space, action_space, config):
LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])
ExplorationStateMixin.__init__(policy, obs_space, action_space, config)
def setup_late_mixins(policy, obs_space, action_space, config):
TargetNetworkMixin.__init__(policy, obs_space, action_space, config)
def _compute_q_values(policy, model, obs, obs_space, action_space):
config = policy.config
model_out, state = model({
"obs": obs,
"is_training": policy._get_is_training_placeholder(),
}, [], None)
if config["num_atoms"] > 1:
(action_scores, z, support_logits_per_action, logits,
dist) = model.get_q_value_distributions(model_out)
else:
(action_scores, logits,
dist) = model.get_q_value_distributions(model_out)
if config["dueling"]:
state_score = model.get_state_value(model_out)
if config["num_atoms"] > 1:
support_logits_per_action_mean = tf.reduce_mean(
support_logits_per_action, 1)
support_logits_per_action_centered = (
support_logits_per_action - tf.expand_dims(
support_logits_per_action_mean, 1))
support_logits_per_action = tf.expand_dims(
state_score, 1) + support_logits_per_action_centered
support_prob_per_action = tf.nn.softmax(
logits=support_logits_per_action)
value = tf.reduce_sum(
input_tensor=z * support_prob_per_action, axis=-1)
logits = support_logits_per_action
dist = support_prob_per_action
else:
action_scores_mean = reduce_mean_ignore_inf(action_scores, 1)
action_scores_centered = action_scores - tf.expand_dims(
action_scores_mean, 1)
value = state_score + action_scores_centered
else:
value = action_scores
return value, logits, dist
def _adjust_nstep(n_step, gamma, obs, actions, rewards, new_obs, dones):
"""Rewrites the given trajectory fragments to encode n-step rewards.
reward[i] = (
reward[i] * gamma**0 +
reward[i+1] * gamma**1 +
... +
reward[i+n_step-1] * gamma**(n_step-1))
The ith new_obs is also adjusted to point to the (i+n_step-1)'th new obs.
At the end of the trajectory, n is truncated to fit in the traj length.
"""
assert not any(dones[:-1]), "Unexpected done in middle of trajectory"
traj_length = len(rewards)
for i in range(traj_length):
for j in range(1, n_step):
if i + j < traj_length:
new_obs[i] = new_obs[i + j]
dones[i] = dones[i + j]
rewards[i] += gamma**j * rewards[i + j]
def _postprocess_dqn(policy, batch):
# N-step Q adjustments
if policy.config["n_step"] > 1:
_adjust_nstep(policy.config["n_step"], policy.config["gamma"],
batch[SampleBatch.CUR_OBS], batch[SampleBatch.ACTIONS],
batch[SampleBatch.REWARDS], batch[SampleBatch.NEXT_OBS],
batch[SampleBatch.DONES])
if PRIO_WEIGHTS not in batch:
batch[PRIO_WEIGHTS] = np.ones_like(batch[SampleBatch.REWARDS])
# Prioritize on the worker side
if batch.count > 0 and policy.config["worker_side_prioritization"]:
td_errors = policy.compute_td_error(
batch[SampleBatch.CUR_OBS], batch[SampleBatch.ACTIONS],
batch[SampleBatch.REWARDS], batch[SampleBatch.NEXT_OBS],
batch[SampleBatch.DONES], batch[PRIO_WEIGHTS])
new_priorities = (
np.abs(td_errors) + policy.config["prioritized_replay_eps"])
batch.data[PRIO_WEIGHTS] = new_priorities
return batch
DQNTFPolicy = build_tf_policy(
name="DQNTFPolicy",
get_default_config=lambda: ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG,
make_model=build_q_model,
action_sampler_fn=build_q_networks,
loss_fn=build_q_losses,
stats_fn=build_q_stats,
postprocess_fn=postprocess_trajectory,
optimizer_fn=adam_optimizer,
gradients_fn=clip_gradients,
extra_action_feed_fn=exploration_setting_inputs,
extra_action_fetches_fn=lambda policy: {"q_values": policy.q_values},
extra_learn_fetches_fn=lambda policy: {"td_error": policy.q_loss.td_error},
before_init=setup_early_mixins,
after_init=setup_late_mixins,
obs_include_prev_action_reward=False,
mixins=[
ExplorationStateMixin,
TargetNetworkMixin,
ComputeTDErrorMixin,
LearningRateSchedule,
])
@@ -1,82 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ray.rllib.models.modelv2 import ModelV2
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.utils.annotations import override
from ray.rllib.utils import try_import_tf
tf = try_import_tf()
class SimpleQModel(TFModelV2):
"""Extension of standard TFModel to provide Q values.
Data flow:
obs -> forward() -> model_out
model_out -> get_q_values() -> Q(s, a)
Note that this class by itself is not a valid model unless you
implement forward() in a subclass."""
def __init__(self,
obs_space,
action_space,
num_outputs,
model_config,
name,
q_hiddens=(256, )):
"""Initialize variables of this model.
Extra model kwargs:
q_hiddens (list): defines size of hidden layers for the q head.
These will be used to postprocess the model output for the
purposes of computing Q values.
Note that the core layers for forward() are not defined here, this
only defines the layers for the Q head. Those layers for forward()
should be defined in subclasses of SimpleQModel.
"""
super(SimpleQModel, self).__init__(obs_space, action_space,
num_outputs, model_config, name)
# setup the Q head output (i.e., model for get_q_values)
self.model_out = tf.keras.layers.Input(
shape=(num_outputs, ), name="model_out")
if q_hiddens:
last_layer = self.model_out
for i, n in enumerate(q_hiddens):
last_layer = tf.keras.layers.Dense(
n, name="q_hidden_{}".format(i),
activation=tf.nn.relu)(last_layer)
q_out = tf.keras.layers.Dense(
action_space.n, activation=None, name="q_out")(last_layer)
else:
q_out = self.model_out
self.q_value_head = tf.keras.Model(self.model_out, q_out)
self.register_variables(self.q_value_head.variables)
@override(ModelV2)
def forward(self, input_dict, state, seq_lens):
"""This generates the model_out tensor input.
You must implement this as documented in modelv2.py."""
raise NotImplementedError
def get_q_values(self, model_out):
"""Returns Q(s, a) given a feature tensor for the state.
Override this in your custom model to customize the Q output head.
Arguments:
model_out (Tensor): embedding from the model layers
Returns:
action scores Q(s, a) for each action, shape [None, action_space.n]
"""
return self.q_value_head(model_out)
@@ -1,214 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Basic example of a DQN policy without any optimizations."""
from gym.spaces import Discrete
import logging
import ray
from ray.rllib.agents.dqn.simple_q_model import SimpleQModel
from ray.rllib.policy.policy import Policy
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.models import ModelCatalog
from ray.rllib.utils.annotations import override
from ray.rllib.utils.error import UnsupportedSpaceException
from ray.rllib.policy.tf_policy import TFPolicy
from ray.rllib.policy.tf_policy_template import build_tf_policy
from ray.rllib.utils import try_import_tf
from ray.rllib.utils.tf_ops import huber_loss
tf = try_import_tf()
logger = logging.getLogger(__name__)
Q_SCOPE = "q_func"
Q_TARGET_SCOPE = "target_q_func"
class ExplorationStateMixin(object):
def __init__(self, obs_space, action_space, config):
self.cur_epsilon = 1.0
self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
self.eps = tf.placeholder(tf.float32, (), name="eps")
def add_parameter_noise(self):
if self.config["parameter_noise"]:
self.sess.run(self.add_noise_op)
def set_epsilon(self, epsilon):
self.cur_epsilon = epsilon
@override(Policy)
def get_state(self):
return [TFPolicy.get_state(self), self.cur_epsilon]
@override(Policy)
def set_state(self, state):
TFPolicy.set_state(self, state[0])
self.set_epsilon(state[1])
class TargetNetworkMixin(object):
def __init__(self, obs_space, action_space, config):
# update_target_fn will be called periodically to copy Q network to
# target Q network
update_target_expr = []
assert len(self.q_func_vars) == len(self.target_q_func_vars), \
(self.q_func_vars, self.target_q_func_vars)
for var, var_target in zip(self.q_func_vars, self.target_q_func_vars):
update_target_expr.append(var_target.assign(var))
logger.debug("Update target op {}".format(var_target))
self.update_target_expr = tf.group(*update_target_expr)
def update_target(self):
return self.get_session().run(self.update_target_expr)
def build_q_models(policy, obs_space, action_space, config):
if not isinstance(action_space, Discrete):
raise UnsupportedSpaceException(
"Action space {} is not supported for DQN.".format(action_space))
if config["hiddens"]:
num_outputs = 256
config["model"]["no_final_linear"] = True
else:
num_outputs = action_space.n
policy.q_model = ModelCatalog.get_model_v2(
obs_space,
action_space,
num_outputs,
config["model"],
framework="tf",
name=Q_SCOPE,
model_interface=SimpleQModel,
q_hiddens=config["hiddens"])
policy.target_q_model = ModelCatalog.get_model_v2(
obs_space,
action_space,
num_outputs,
config["model"],
framework="tf",
name=Q_TARGET_SCOPE,
model_interface=SimpleQModel,
q_hiddens=config["hiddens"])
return policy.q_model
def build_action_sampler(policy, q_model, input_dict, obs_space, action_space,
config):
# Action Q network
q_values = _compute_q_values(policy, q_model,
input_dict[SampleBatch.CUR_OBS], obs_space,
action_space)
policy.q_values = q_values
policy.q_func_vars = q_model.variables()
# Action outputs
deterministic_actions = tf.argmax(q_values, axis=1)
batch_size = tf.shape(input_dict[SampleBatch.CUR_OBS])[0]
# Special case masked out actions (q_value ~= -inf) so that we don't
# even consider them for exploration.
random_valid_action_logits = tf.where(
tf.equal(q_values, tf.float32.min),
tf.ones_like(q_values) * tf.float32.min, tf.ones_like(q_values))
random_actions = tf.squeeze(
tf.multinomial(random_valid_action_logits, 1), axis=1)
chose_random = tf.random_uniform(
tf.stack([batch_size]), minval=0, maxval=1,
dtype=tf.float32) < policy.eps
stochastic_actions = tf.where(chose_random, random_actions,
deterministic_actions)
action = tf.cond(policy.stochastic, lambda: stochastic_actions,
lambda: deterministic_actions)
action_prob = None
return action, action_prob
def build_q_losses(policy, batch_tensors):
# q network evaluation
q_t = _compute_q_values(policy, policy.q_model,
batch_tensors[SampleBatch.CUR_OBS],
policy.observation_space, policy.action_space)
# target q network evalution
q_tp1 = _compute_q_values(policy, policy.target_q_model,
batch_tensors[SampleBatch.NEXT_OBS],
policy.observation_space, policy.action_space)
policy.target_q_func_vars = policy.target_q_model.variables()
# q scores for actions which we know were selected in the given state.
one_hot_selection = tf.one_hot(
tf.cast(batch_tensors[SampleBatch.ACTIONS], tf.int32),
policy.action_space.n)
q_t_selected = tf.reduce_sum(q_t * one_hot_selection, 1)
# compute estimate of best possible value starting from state at t + 1
dones = tf.cast(batch_tensors[SampleBatch.DONES], tf.float32)
q_tp1_best_one_hot_selection = tf.one_hot(
tf.argmax(q_tp1, 1), policy.action_space.n)
q_tp1_best = tf.reduce_sum(q_tp1 * q_tp1_best_one_hot_selection, 1)
q_tp1_best_masked = (1.0 - dones) * q_tp1_best
# compute RHS of bellman equation
q_t_selected_target = (batch_tensors[SampleBatch.REWARDS] +
policy.config["gamma"] * q_tp1_best_masked)
# compute the error (potentially clipped)
td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
loss = tf.reduce_mean(huber_loss(td_error))
# save TD error as an attribute for outside access
policy.td_error = td_error
return loss
def _compute_q_values(policy, model, obs, obs_space, action_space):
input_dict = {
"obs": obs,
"is_training": policy._get_is_training_placeholder(),
}
model_out, _ = model(input_dict, [], None)
return model.get_q_values(model_out)
def exploration_setting_inputs(policy):
return {
policy.stochastic: True,
policy.eps: policy.cur_epsilon,
}
def setup_early_mixins(policy, obs_space, action_space, config):
ExplorationStateMixin.__init__(policy, obs_space, action_space, config)
def setup_late_mixins(policy, obs_space, action_space, config):
TargetNetworkMixin.__init__(policy, obs_space, action_space, config)
SimpleQPolicy = build_tf_policy(
name="SimpleQPolicy",
get_default_config=lambda: ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG,
make_model=build_q_models,
action_sampler_fn=build_action_sampler,
loss_fn=build_q_losses,
extra_action_feed_fn=exploration_setting_inputs,
extra_action_fetches_fn=lambda policy: {"q_values": policy.q_values},
extra_learn_fetches_fn=lambda policy: {"td_error": policy.td_error},
before_init=setup_early_mixins,
after_init=setup_late_mixins,
obs_include_prev_action_reward=False,
mixins=[
ExplorationStateMixin,
TargetNetworkMixin,
])
-6
View File
@@ -1,6 +0,0 @@
from ray.rllib.agents.es.es import (ESTrainer, DEFAULT_CONFIG)
from ray.rllib.utils import renamed_agent
ESAgent = renamed_agent(ESTrainer)
__all__ = ["ESAgent", "ESTrainer", "DEFAULT_CONFIG"]
-337
View File
@@ -1,337 +0,0 @@
# Code in this file is copied and adapted from
# https://github.com/openai/evolution-strategies-starter.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import namedtuple
import logging
import numpy as np
import time
import ray
from ray.rllib.agents import Trainer, with_common_config
from ray.rllib.agents.es import optimizers
from ray.rllib.agents.es import policies
from ray.rllib.agents.es import utils
from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID
from ray.rllib.utils.annotations import override
from ray.rllib.utils.memory import ray_get_and_free
from ray.rllib.utils import FilterManager
logger = logging.getLogger(__name__)
Result = namedtuple("Result", [
"noise_indices", "noisy_returns", "sign_noisy_returns", "noisy_lengths",
"eval_returns", "eval_lengths"
])
# yapf: disable
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_common_config({
"l2_coeff": 0.005,
"noise_stdev": 0.02,
"episodes_per_batch": 1000,
"train_batch_size": 10000,
"eval_prob": 0.003,
"return_proc_mode": "centered_rank",
"num_workers": 10,
"stepsize": 0.01,
"observation_filter": "MeanStdFilter",
"noise_size": 250000000,
"report_length": 10,
})
# __sphinx_doc_end__
# yapf: enable
@ray.remote
def create_shared_noise(count):
"""Create a large array of noise to be shared by all workers."""
seed = 123
noise = np.random.RandomState(seed).randn(count).astype(np.float32)
return noise
class SharedNoiseTable(object):
def __init__(self, noise):
self.noise = noise
assert self.noise.dtype == np.float32
def get(self, i, dim):
return self.noise[i:i + dim]
def sample_index(self, dim):
return np.random.randint(0, len(self.noise) - dim + 1)
@ray.remote
class Worker(object):
def __init__(self,
config,
policy_params,
env_creator,
noise,
min_task_runtime=0.2):
self.min_task_runtime = min_task_runtime
self.config = config
self.policy_params = policy_params
self.noise = SharedNoiseTable(noise)
self.env = env_creator(config["env_config"])
from ray.rllib import models
self.preprocessor = models.ModelCatalog.get_preprocessor(
self.env, config["model"])
self.sess = utils.make_session(single_threaded=True)
self.policy = policies.GenericPolicy(
self.sess, self.env.action_space, self.env.observation_space,
self.preprocessor, config["observation_filter"], config["model"],
**policy_params)
@property
def filters(self):
return {DEFAULT_POLICY_ID: self.policy.get_filter()}
def sync_filters(self, new_filters):
for k in self.filters:
self.filters[k].sync(new_filters[k])
def get_filters(self, flush_after=False):
return_filters = {}
for k, f in self.filters.items():
return_filters[k] = f.as_serializable()
if flush_after:
f.clear_buffer()
return return_filters
def rollout(self, timestep_limit, add_noise=True):
rollout_rewards, rollout_length = policies.rollout(
self.policy,
self.env,
timestep_limit=timestep_limit,
add_noise=add_noise)
return rollout_rewards, rollout_length
def do_rollouts(self, params, timestep_limit=None):
# Set the network weights.
self.policy.set_weights(params)
noise_indices, returns, sign_returns, lengths = [], [], [], []
eval_returns, eval_lengths = [], []
# Perform some rollouts with noise.
task_tstart = time.time()
while (len(noise_indices) == 0
or time.time() - task_tstart < self.min_task_runtime):
if np.random.uniform() < self.config["eval_prob"]:
# Do an evaluation run with no perturbation.
self.policy.set_weights(params)
rewards, length = self.rollout(timestep_limit, add_noise=False)
eval_returns.append(rewards.sum())
eval_lengths.append(length)
else:
# Do a regular run with parameter perturbations.
noise_index = self.noise.sample_index(self.policy.num_params)
perturbation = self.config["noise_stdev"] * self.noise.get(
noise_index, self.policy.num_params)
# These two sampling steps could be done in parallel on
# different actors letting us update twice as frequently.
self.policy.set_weights(params + perturbation)
rewards_pos, lengths_pos = self.rollout(timestep_limit)
self.policy.set_weights(params - perturbation)
rewards_neg, lengths_neg = self.rollout(timestep_limit)
noise_indices.append(noise_index)
returns.append([rewards_pos.sum(), rewards_neg.sum()])
sign_returns.append(
[np.sign(rewards_pos).sum(),
np.sign(rewards_neg).sum()])
lengths.append([lengths_pos, lengths_neg])
return Result(
noise_indices=noise_indices,
noisy_returns=returns,
sign_noisy_returns=sign_returns,
noisy_lengths=lengths,
eval_returns=eval_returns,
eval_lengths=eval_lengths)
class ESTrainer(Trainer):
"""Large-scale implementation of Evolution Strategies in Ray."""
_name = "ES"
_default_config = DEFAULT_CONFIG
@override(Trainer)
def _init(self, config, env_creator):
policy_params = {"action_noise_std": 0.01}
env = env_creator(config["env_config"])
from ray.rllib import models
preprocessor = models.ModelCatalog.get_preprocessor(env)
self.sess = utils.make_session(single_threaded=False)
self.policy = policies.GenericPolicy(
self.sess, env.action_space, env.observation_space, preprocessor,
config["observation_filter"], config["model"], **policy_params)
self.optimizer = optimizers.Adam(self.policy, config["stepsize"])
self.report_length = config["report_length"]
# Create the shared noise table.
logger.info("Creating shared noise table.")
noise_id = create_shared_noise.remote(config["noise_size"])
self.noise = SharedNoiseTable(ray.get(noise_id))
# Create the actors.
logger.info("Creating actors.")
self._workers = [
Worker.remote(config, policy_params, env_creator, noise_id)
for _ in range(config["num_workers"])
]
self.episodes_so_far = 0
self.reward_list = []
self.tstart = time.time()
@override(Trainer)
def _train(self):
config = self.config
theta = self.policy.get_weights()
assert theta.dtype == np.float32
# Put the current policy weights in the object store.
theta_id = ray.put(theta)
# Use the actors to do rollouts, note that we pass in the ID of the
# policy weights.
results, num_episodes, num_timesteps = self._collect_results(
theta_id, config["episodes_per_batch"], config["train_batch_size"])
all_noise_indices = []
all_training_returns = []
all_training_lengths = []
all_eval_returns = []
all_eval_lengths = []
# Loop over the results.
for result in results:
all_eval_returns += result.eval_returns
all_eval_lengths += result.eval_lengths
all_noise_indices += result.noise_indices
all_training_returns += result.noisy_returns
all_training_lengths += result.noisy_lengths
assert len(all_eval_returns) == len(all_eval_lengths)
assert (len(all_noise_indices) == len(all_training_returns) ==
len(all_training_lengths))
self.episodes_so_far += num_episodes
# Assemble the results.
eval_returns = np.array(all_eval_returns)
eval_lengths = np.array(all_eval_lengths)
noise_indices = np.array(all_noise_indices)
noisy_returns = np.array(all_training_returns)
noisy_lengths = np.array(all_training_lengths)
# Process the returns.
if config["return_proc_mode"] == "centered_rank":
proc_noisy_returns = utils.compute_centered_ranks(noisy_returns)
else:
raise NotImplementedError(config["return_proc_mode"])
# Compute and take a step.
g, count = utils.batched_weighted_sum(
proc_noisy_returns[:, 0] - proc_noisy_returns[:, 1],
(self.noise.get(index, self.policy.num_params)
for index in noise_indices),
batch_size=500)
g /= noisy_returns.size
assert (g.shape == (self.policy.num_params, ) and g.dtype == np.float32
and count == len(noise_indices))
# Compute the new weights theta.
theta, update_ratio = self.optimizer.update(-g +
config["l2_coeff"] * theta)
# Set the new weights in the local copy of the policy.
self.policy.set_weights(theta)
# Store the rewards
if len(all_eval_returns) > 0:
self.reward_list.append(np.mean(eval_returns))
# Now sync the filters
FilterManager.synchronize({
DEFAULT_POLICY_ID: self.policy.get_filter()
}, self._workers)
info = {
"weights_norm": np.square(theta).sum(),
"grad_norm": np.square(g).sum(),
"update_ratio": update_ratio,
"episodes_this_iter": noisy_lengths.size,
"episodes_so_far": self.episodes_so_far,
}
reward_mean = np.mean(self.reward_list[-self.report_length:])
result = dict(
episode_reward_mean=reward_mean,
episode_len_mean=eval_lengths.mean(),
timesteps_this_iter=noisy_lengths.sum(),
info=info)
return result
@override(Trainer)
def compute_action(self, observation):
return self.policy.compute(observation, update=False)[0]
@override(Trainer)
def _stop(self):
# workaround for https://github.com/ray-project/ray/issues/1516
for w in self._workers:
w.__ray_terminate__.remote()
def _collect_results(self, theta_id, min_episodes, min_timesteps):
num_episodes, num_timesteps = 0, 0
results = []
while num_episodes < min_episodes or num_timesteps < min_timesteps:
logger.info(
"Collected {} episodes {} timesteps so far this iter".format(
num_episodes, num_timesteps))
rollout_ids = [
worker.do_rollouts.remote(theta_id) for worker in self._workers
]
# Get the results of the rollouts.
for result in ray_get_and_free(rollout_ids):
results.append(result)
# Update the number of episodes and the number of timesteps
# keeping in mind that result.noisy_lengths is a list of lists,
# where the inner lists have length 2.
num_episodes += sum(len(pair) for pair in result.noisy_lengths)
num_timesteps += sum(
sum(pair) for pair in result.noisy_lengths)
return results, num_episodes, num_timesteps
def __getstate__(self):
return {
"weights": self.policy.get_weights(),
"filter": self.policy.get_filter(),
"episodes_so_far": self.episodes_so_far,
}
def __setstate__(self, state):
self.episodes_so_far = state["episodes_so_far"]
self.policy.set_weights(state["weights"])
self.policy.set_filter(state["filter"])
FilterManager.synchronize({
DEFAULT_POLICY_ID: self.policy.get_filter()
}, self._workers)
-56
View File
@@ -1,56 +0,0 @@
# Code in this file is copied and adapted from
# https://github.com/openai/evolution-strategies-starter.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
class Optimizer(object):
def __init__(self, pi):
self.pi = pi
self.dim = pi.num_params
self.t = 0
def update(self, globalg):
self.t += 1
step = self._compute_step(globalg)
theta = self.pi.get_weights()
ratio = np.linalg.norm(step) / np.linalg.norm(theta)
return theta + step, ratio
def _compute_step(self, globalg):
raise NotImplementedError
class SGD(Optimizer):
def __init__(self, pi, stepsize, momentum=0.9):
Optimizer.__init__(self, pi)
self.v = np.zeros(self.dim, dtype=np.float32)
self.stepsize, self.momentum = stepsize, momentum
def _compute_step(self, globalg):
self.v = self.momentum * self.v + (1. - self.momentum) * globalg
step = -self.stepsize * self.v
return step
class Adam(Optimizer):
def __init__(self, pi, stepsize, beta1=0.9, beta2=0.999, epsilon=1e-08):
Optimizer.__init__(self, pi)
self.stepsize = stepsize
self.beta1 = beta1
self.beta2 = beta2
self.epsilon = epsilon
self.m = np.zeros(self.dim, dtype=np.float32)
self.v = np.zeros(self.dim, dtype=np.float32)
def _compute_step(self, globalg):
a = self.stepsize * (np.sqrt(1 - self.beta2**self.t) /
(1 - self.beta1**self.t))
self.m = self.beta1 * self.m + (1 - self.beta1) * globalg
self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg)
step = -a * self.m / (np.sqrt(self.v) + self.epsilon)
return step
-93
View File
@@ -1,93 +0,0 @@
# Code in this file is copied and adapted from
# https://github.com/openai/evolution-strategies-starter.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import gym
import numpy as np
import ray
import ray.experimental.tf_utils
from ray.rllib.evaluation.sampler import _unbatch_tuple_actions
from ray.rllib.models import ModelCatalog
from ray.rllib.utils.filter import get_filter
from ray.rllib.utils import try_import_tf
tf = try_import_tf()
def rollout(policy, env, timestep_limit=None, add_noise=False):
"""Do a rollout.
If add_noise is True, the rollout will take noisy actions with
noise drawn from that stream. Otherwise, no action noise will be added.
"""
env_timestep_limit = env.spec.max_episode_steps
timestep_limit = (env_timestep_limit if timestep_limit is None else min(
timestep_limit, env_timestep_limit))
rews = []
t = 0
observation = env.reset()
for _ in range(timestep_limit or 999999):
ac = policy.compute(observation, add_noise=add_noise)[0]
observation, rew, done, _ = env.step(ac)
rews.append(rew)
t += 1
if done:
break
rews = np.array(rews, dtype=np.float32)
return rews, t
class GenericPolicy(object):
def __init__(self, sess, action_space, obs_space, preprocessor,
observation_filter, model_options, action_noise_std):
self.sess = sess
self.action_space = action_space
self.action_noise_std = action_noise_std
self.preprocessor = preprocessor
self.observation_filter = get_filter(observation_filter,
self.preprocessor.shape)
self.inputs = tf.placeholder(tf.float32,
[None] + list(self.preprocessor.shape))
# Policy network.
dist_class, dist_dim = ModelCatalog.get_action_dist(
self.action_space, model_options, dist_type="deterministic")
model = ModelCatalog.get_model({
"obs": self.inputs
}, obs_space, action_space, dist_dim, model_options)
dist = dist_class(model.outputs)
self.sampler = dist.sample()
self.variables = ray.experimental.tf_utils.TensorFlowVariables(
model.outputs, self.sess)
self.num_params = sum(
np.prod(variable.shape.as_list())
for _, variable in self.variables.variables.items())
self.sess.run(tf.global_variables_initializer())
def compute(self, observation, add_noise=False, update=True):
observation = self.preprocessor.transform(observation)
observation = self.observation_filter(observation[None], update=update)
action = self.sess.run(
self.sampler, feed_dict={self.inputs: observation})
action = _unbatch_tuple_actions(action)
if add_noise and isinstance(self.action_space, gym.spaces.Box):
action += np.random.randn(*action.shape) * self.action_noise_std
return action
def set_weights(self, x):
self.variables.set_flat(x)
def get_weights(self):
return self.variables.get_flat()
def get_filter(self):
return self.observation_filter
def set_filter(self, observation_filter):
self.observation_filter = observation_filter
-63
View File
@@ -1,63 +0,0 @@
# Code in this file is copied and adapted from
# https://github.com/openai/evolution-strategies-starter.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from ray.rllib.utils import try_import_tf
tf = try_import_tf()
def compute_ranks(x):
"""Returns ranks in [0, len(x))
Note: This is different from scipy.stats.rankdata, which returns ranks in
[1, len(x)].
"""
assert x.ndim == 1
ranks = np.empty(len(x), dtype=int)
ranks[x.argsort()] = np.arange(len(x))
return ranks
def compute_centered_ranks(x):
y = compute_ranks(x.ravel()).reshape(x.shape).astype(np.float32)
y /= (x.size - 1)
y -= 0.5
return y
def make_session(single_threaded):
if not single_threaded:
return tf.Session()
return tf.Session(
config=tf.ConfigProto(
inter_op_parallelism_threads=1, intra_op_parallelism_threads=1))
def itergroups(items, group_size):
assert group_size >= 1
group = []
for x in items:
group.append(x)
if len(group) == group_size:
yield tuple(group)
del group[:]
if group:
yield tuple(group)
def batched_weighted_sum(weights, vecs, batch_size):
total = 0
num_items_summed = 0
for batch_weights, batch_vecs in zip(
itergroups(weights, batch_size), itergroups(vecs, batch_size)):
assert len(batch_weights) == len(batch_vecs) <= batch_size
total += np.dot(
np.asarray(batch_weights, dtype=np.float32),
np.asarray(batch_vecs, dtype=np.float32))
num_items_summed += len(batch_weights)
return total, num_items_summed
@@ -1,6 +0,0 @@
from ray.rllib.agents.impala.impala import ImpalaTrainer, DEFAULT_CONFIG
from ray.rllib.utils import renamed_agent
ImpalaAgent = renamed_agent(ImpalaTrainer)
__all__ = ["ImpalaAgent", "ImpalaTrainer", "DEFAULT_CONFIG"]
-164
View File
@@ -1,164 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ray.rllib.agents.a3c.a3c_tf_policy import A3CTFPolicy
from ray.rllib.agents.impala.vtrace_policy import VTraceTFPolicy
from ray.rllib.agents.trainer import Trainer, with_common_config
from ray.rllib.agents.trainer_template import build_trainer
from ray.rllib.optimizers import AsyncSamplesOptimizer
from ray.rllib.optimizers.aso_tree_aggregator import TreeAggregator
from ray.rllib.utils.annotations import override
from ray.tune.trainable import Trainable
from ray.tune.resources import Resources
# yapf: disable
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_common_config({
# V-trace params (see vtrace.py).
"vtrace": True,
"vtrace_clip_rho_threshold": 1.0,
"vtrace_clip_pg_rho_threshold": 1.0,
# System params.
#
# == Overview of data flow in IMPALA ==
# 1. Policy evaluation in parallel across `num_workers` actors produces
# batches of size `sample_batch_size * num_envs_per_worker`.
# 2. If enabled, the replay buffer stores and produces batches of size
# `sample_batch_size * num_envs_per_worker`.
# 3. If enabled, the minibatch ring buffer stores and replays batches of
# size `train_batch_size` up to `num_sgd_iter` times per batch.
# 4. The learner thread executes data parallel SGD across `num_gpus` GPUs
# on batches of size `train_batch_size`.
#
"sample_batch_size": 50,
"train_batch_size": 500,
"min_iter_time_s": 10,
"num_workers": 2,
# number of GPUs the learner should use.
"num_gpus": 1,
# set >1 to load data into GPUs in parallel. Increases GPU memory usage
# proportionally with the number of buffers.
"num_data_loader_buffers": 1,
# how many train batches should be retained for minibatching. This conf
# only has an effect if `num_sgd_iter > 1`.
"minibatch_buffer_size": 1,
# number of passes to make over each train batch
"num_sgd_iter": 1,
# set >0 to enable experience replay. Saved samples will be replayed with
# a p:1 proportion to new data samples.
"replay_proportion": 0.0,
# number of sample batches to store for replay. The number of transitions
# saved total will be (replay_buffer_num_slots * sample_batch_size).
"replay_buffer_num_slots": 0,
# max queue size for train batches feeding into the learner
"learner_queue_size": 16,
# wait for train batches to be available in minibatch buffer queue
# this many seconds. This may need to be increased e.g. when training
# with a slow environment
"learner_queue_timeout": 300,
# level of queuing for sampling.
"max_sample_requests_in_flight_per_worker": 2,
# max number of workers to broadcast one set of weights to
"broadcast_interval": 1,
# use intermediate actors for multi-level aggregation. This can make sense
# if ingesting >2GB/s of samples, or if the data requires decompression.
"num_aggregation_workers": 0,
# Learning params.
"grad_clip": 40.0,
# either "adam" or "rmsprop"
"opt_type": "adam",
"lr": 0.0005,
"lr_schedule": None,
# rmsprop considered
"decay": 0.99,
"momentum": 0.0,
"epsilon": 0.1,
# balancing the three losses
"vf_loss_coeff": 0.5,
"entropy_coeff": 0.01,
"entropy_coeff_schedule": None,
# use fake (infinite speed) sampler for testing
"_fake_sampler": False,
})
# __sphinx_doc_end__
# yapf: enable
def choose_policy(config):
if config["vtrace"]:
return VTraceTFPolicy
else:
return A3CTFPolicy
def validate_config(config):
if config["entropy_coeff"] < 0:
raise DeprecationWarning("entropy_coeff must be >= 0")
def defer_make_workers(trainer, env_creator, policy, config):
# Defer worker creation to after the optimizer has been created.
return trainer._make_workers(env_creator, policy, config, 0)
def make_aggregators_and_optimizer(workers, config):
if config["num_aggregation_workers"] > 0:
# Create co-located aggregator actors first for placement pref
aggregators = TreeAggregator.precreate_aggregators(
config["num_aggregation_workers"])
else:
aggregators = None
workers.add_workers(config["num_workers"])
optimizer = AsyncSamplesOptimizer(
workers,
lr=config["lr"],
num_gpus=config["num_gpus"],
sample_batch_size=config["sample_batch_size"],
train_batch_size=config["train_batch_size"],
replay_buffer_num_slots=config["replay_buffer_num_slots"],
replay_proportion=config["replay_proportion"],
num_data_loader_buffers=config["num_data_loader_buffers"],
max_sample_requests_in_flight_per_worker=config[
"max_sample_requests_in_flight_per_worker"],
broadcast_interval=config["broadcast_interval"],
num_sgd_iter=config["num_sgd_iter"],
minibatch_buffer_size=config["minibatch_buffer_size"],
num_aggregation_workers=config["num_aggregation_workers"],
learner_queue_size=config["learner_queue_size"],
learner_queue_timeout=config["learner_queue_timeout"],
**config["optimizer"])
if aggregators:
# Assign the pre-created aggregators to the optimizer
optimizer.aggregator.init(aggregators)
return optimizer
class OverrideDefaultResourceRequest(object):
@classmethod
@override(Trainable)
def default_resource_request(cls, config):
cf = dict(cls._default_config, **config)
Trainer._validate_config(cf)
return Resources(
cpu=cf["num_cpus_for_driver"],
gpu=cf["num_gpus"],
extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"] +
cf["num_aggregation_workers"],
extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])
ImpalaTrainer = build_trainer(
name="IMPALA",
default_config=DEFAULT_CONFIG,
default_policy=VTraceTFPolicy,
validate_config=validate_config,
get_policy_class=choose_policy,
make_workers=defer_make_workers,
make_policy_optimizer=make_aggregators_and_optimizer,
mixins=[OverrideDefaultResourceRequest])
-409
View File
@@ -1,409 +0,0 @@
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Functions to compute V-trace off-policy actor critic targets.
For details and theory see:
"IMPALA: Scalable Distributed Deep-RL with
Importance Weighted Actor-Learner Architectures"
by Espeholt, Soyer, Munos et al.
See https://arxiv.org/abs/1802.01561 for the full paper.
In addition to the original paper's code, changes have been made
to support MultiDiscrete action spaces. behaviour_policy_logits,
target_policy_logits and actions parameters in the entry point
multi_from_logits method accepts lists of tensors instead of just
tensors.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
from ray.rllib.models.tf.tf_action_dist import Categorical
from ray.rllib.utils import try_import_tf
tf = try_import_tf()
VTraceFromLogitsReturns = collections.namedtuple("VTraceFromLogitsReturns", [
"vs", "pg_advantages", "log_rhos", "behaviour_action_log_probs",
"target_action_log_probs"
])
VTraceReturns = collections.namedtuple("VTraceReturns", "vs pg_advantages")
def log_probs_from_logits_and_actions(policy_logits,
actions,
dist_class=Categorical):
return multi_log_probs_from_logits_and_actions([policy_logits], [actions],
dist_class)[0]
def multi_log_probs_from_logits_and_actions(policy_logits, actions,
dist_class):
"""Computes action log-probs from policy logits and actions.
In the notation used throughout documentation and comments, T refers to the
time dimension ranging from 0 to T-1. B refers to the batch size and
ACTION_SPACE refers to the list of numbers each representing a number of
actions.
Args:
policy_logits: A list with length of ACTION_SPACE of float32
tensors of shapes
[T, B, ACTION_SPACE[0]],
...,
[T, B, ACTION_SPACE[-1]]
with un-normalized log-probabilities parameterizing a softmax policy.
actions: A list with length of ACTION_SPACE of
tensors of shapes
[T, B, ...],
...,
[T, B, ...]
with actions.
Returns:
A list with length of ACTION_SPACE of float32
tensors of shapes
[T, B],
...,
[T, B]
corresponding to the sampling log probability
of the chosen action w.r.t. the policy.
"""
log_probs = []
for i in range(len(policy_logits)):
p_shape = tf.shape(policy_logits[i])
a_shape = tf.shape(actions[i])
policy_logits_flat = tf.reshape(policy_logits[i],
tf.concat([[-1], p_shape[2:]], axis=0))
actions_flat = tf.reshape(actions[i],
tf.concat([[-1], a_shape[2:]], axis=0))
log_probs.append(
tf.reshape(
dist_class(policy_logits_flat).logp(actions_flat),
a_shape[:2]))
return log_probs
def from_logits(behaviour_policy_logits,
target_policy_logits,
actions,
discounts,
rewards,
values,
bootstrap_value,
dist_class=Categorical,
clip_rho_threshold=1.0,
clip_pg_rho_threshold=1.0,
name="vtrace_from_logits"):
"""multi_from_logits wrapper used only for tests"""
res = multi_from_logits(
[behaviour_policy_logits], [target_policy_logits], [actions],
discounts,
rewards,
values,
bootstrap_value,
dist_class,
clip_rho_threshold=clip_rho_threshold,
clip_pg_rho_threshold=clip_pg_rho_threshold,
name=name)
return VTraceFromLogitsReturns(
vs=res.vs,
pg_advantages=res.pg_advantages,
log_rhos=res.log_rhos,
behaviour_action_log_probs=tf.squeeze(
res.behaviour_action_log_probs, axis=0),
target_action_log_probs=tf.squeeze(
res.target_action_log_probs, axis=0),
)
def multi_from_logits(behaviour_policy_logits,
target_policy_logits,
actions,
discounts,
rewards,
values,
bootstrap_value,
dist_class,
clip_rho_threshold=1.0,
clip_pg_rho_threshold=1.0,
name="vtrace_from_logits"):
r"""V-trace for softmax policies.
Calculates V-trace actor critic targets for softmax polices as described in
"IMPALA: Scalable Distributed Deep-RL with
Importance Weighted Actor-Learner Architectures"
by Espeholt, Soyer, Munos et al.
Target policy refers to the policy we are interested in improving and
behaviour policy refers to the policy that generated the given
rewards and actions.
In the notation used throughout documentation and comments, T refers to the
time dimension ranging from 0 to T-1. B refers to the batch size and
ACTION_SPACE refers to the list of numbers each representing a number of
actions.
Args:
behaviour_policy_logits: A list with length of ACTION_SPACE of float32
tensors of shapes
[T, B, ACTION_SPACE[0]],
...,
[T, B, ACTION_SPACE[-1]]
with un-normalized log-probabilities parameterizing the softmax behaviour
policy.
target_policy_logits: A list with length of ACTION_SPACE of float32
tensors of shapes
[T, B, ACTION_SPACE[0]],
...,
[T, B, ACTION_SPACE[-1]]
with un-normalized log-probabilities parameterizing the softmax target
policy.
actions: A list with length of ACTION_SPACE of
tensors of shapes
[T, B, ...],
...,
[T, B, ...]
with actions sampled from the behaviour policy.
discounts: A float32 tensor of shape [T, B] with the discount encountered
when following the behaviour policy.
rewards: A float32 tensor of shape [T, B] with the rewards generated by
following the behaviour policy.
values: A float32 tensor of shape [T, B] with the value function estimates
wrt. the target policy.
bootstrap_value: A float32 of shape [B] with the value function estimate at
time T.
dist_class: action distribution class for the logits.
clip_rho_threshold: A scalar float32 tensor with the clipping threshold for
importance weights (rho) when calculating the baseline targets (vs).
rho^bar in the paper.
clip_pg_rho_threshold: A scalar float32 tensor with the clipping threshold
on rho_s in \rho_s \delta log \pi(a|x) (r + \gamma v_{s+1} - V(x_s)).
name: The name scope that all V-trace operations will be created in.
Returns:
A `VTraceFromLogitsReturns` namedtuple with the following fields:
vs: A float32 tensor of shape [T, B]. Can be used as target to train a
baseline (V(x_t) - vs_t)^2.
pg_advantages: A float 32 tensor of shape [T, B]. Can be used as an
estimate of the advantage in the calculation of policy gradients.
log_rhos: A float32 tensor of shape [T, B] containing the log importance
sampling weights (log rhos).
behaviour_action_log_probs: A float32 tensor of shape [T, B] containing
behaviour policy action log probabilities (log \mu(a_t)).
target_action_log_probs: A float32 tensor of shape [T, B] containing
target policy action probabilities (log \pi(a_t)).
"""
for i in range(len(behaviour_policy_logits)):
behaviour_policy_logits[i] = tf.convert_to_tensor(
behaviour_policy_logits[i], dtype=tf.float32)
target_policy_logits[i] = tf.convert_to_tensor(
target_policy_logits[i], dtype=tf.float32)
# Make sure tensor ranks are as expected.
# The rest will be checked by from_action_log_probs.
behaviour_policy_logits[i].shape.assert_has_rank(3)
target_policy_logits[i].shape.assert_has_rank(3)
with tf.name_scope(
name,
values=[
behaviour_policy_logits, target_policy_logits, actions,
discounts, rewards, values, bootstrap_value
]):
target_action_log_probs = multi_log_probs_from_logits_and_actions(
target_policy_logits, actions, dist_class)
behaviour_action_log_probs = multi_log_probs_from_logits_and_actions(
behaviour_policy_logits, actions, dist_class)
log_rhos = get_log_rhos(target_action_log_probs,
behaviour_action_log_probs)
vtrace_returns = from_importance_weights(
log_rhos=log_rhos,
discounts=discounts,
rewards=rewards,
values=values,
bootstrap_value=bootstrap_value,
clip_rho_threshold=clip_rho_threshold,
clip_pg_rho_threshold=clip_pg_rho_threshold)
return VTraceFromLogitsReturns(
log_rhos=log_rhos,
behaviour_action_log_probs=behaviour_action_log_probs,
target_action_log_probs=target_action_log_probs,
**vtrace_returns._asdict())
def from_importance_weights(log_rhos,
discounts,
rewards,
values,
bootstrap_value,
clip_rho_threshold=1.0,
clip_pg_rho_threshold=1.0,
name="vtrace_from_importance_weights"):
r"""V-trace from log importance weights.
Calculates V-trace actor critic targets as described in
"IMPALA: Scalable Distributed Deep-RL with
Importance Weighted Actor-Learner Architectures"
by Espeholt, Soyer, Munos et al.
In the notation used throughout documentation and comments, T refers to the
time dimension ranging from 0 to T-1. B refers to the batch size. This code
also supports the case where all tensors have the same number of additional
dimensions, e.g., `rewards` is [T, B, C], `values` is [T, B, C],
`bootstrap_value` is [B, C].
Args:
log_rhos: A float32 tensor of shape [T, B] representing the
log importance sampling weights, i.e.
log(target_policy(a) / behaviour_policy(a)). V-trace performs operations
on rhos in log-space for numerical stability.
discounts: A float32 tensor of shape [T, B] with discounts encountered when
following the behaviour policy.
rewards: A float32 tensor of shape [T, B] containing rewards generated by
following the behaviour policy.
values: A float32 tensor of shape [T, B] with the value function estimates
wrt. the target policy.
bootstrap_value: A float32 of shape [B] with the value function estimate at
time T.
clip_rho_threshold: A scalar float32 tensor with the clipping threshold for
importance weights (rho) when calculating the baseline targets (vs).
rho^bar in the paper. If None, no clipping is applied.
clip_pg_rho_threshold: A scalar float32 tensor with the clipping threshold
on rho_s in \rho_s \delta log \pi(a|x) (r + \gamma v_{s+1} - V(x_s)). If
None, no clipping is applied.
name: The name scope that all V-trace operations will be created in.
Returns:
A VTraceReturns namedtuple (vs, pg_advantages) where:
vs: A float32 tensor of shape [T, B]. Can be used as target to
train a baseline (V(x_t) - vs_t)^2.
pg_advantages: A float32 tensor of shape [T, B]. Can be used as the
advantage in the calculation of policy gradients.
"""
log_rhos = tf.convert_to_tensor(log_rhos, dtype=tf.float32)
discounts = tf.convert_to_tensor(discounts, dtype=tf.float32)
rewards = tf.convert_to_tensor(rewards, dtype=tf.float32)
values = tf.convert_to_tensor(values, dtype=tf.float32)
bootstrap_value = tf.convert_to_tensor(bootstrap_value, dtype=tf.float32)
if clip_rho_threshold is not None:
clip_rho_threshold = tf.convert_to_tensor(
clip_rho_threshold, dtype=tf.float32)
if clip_pg_rho_threshold is not None:
clip_pg_rho_threshold = tf.convert_to_tensor(
clip_pg_rho_threshold, dtype=tf.float32)
# Make sure tensor ranks are consistent.
rho_rank = log_rhos.shape.ndims # Usually 2.
values.shape.assert_has_rank(rho_rank)
bootstrap_value.shape.assert_has_rank(rho_rank - 1)
discounts.shape.assert_has_rank(rho_rank)
rewards.shape.assert_has_rank(rho_rank)
if clip_rho_threshold is not None:
clip_rho_threshold.shape.assert_has_rank(0)
if clip_pg_rho_threshold is not None:
clip_pg_rho_threshold.shape.assert_has_rank(0)
with tf.name_scope(
name,
values=[log_rhos, discounts, rewards, values, bootstrap_value]):
rhos = tf.exp(log_rhos)
if clip_rho_threshold is not None:
clipped_rhos = tf.minimum(
clip_rho_threshold, rhos, name="clipped_rhos")
tf.summary.histogram("clipped_rhos_1000", tf.minimum(1000.0, rhos))
tf.summary.scalar(
"num_of_clipped_rhos",
tf.reduce_sum(
tf.cast(
tf.equal(clipped_rhos, clip_rho_threshold), tf.int32)))
tf.summary.scalar("size_of_clipped_rhos", tf.size(clipped_rhos))
else:
clipped_rhos = rhos
cs = tf.minimum(1.0, rhos, name="cs")
# Append bootstrapped value to get [v1, ..., v_t+1]
values_t_plus_1 = tf.concat(
[values[1:], tf.expand_dims(bootstrap_value, 0)], axis=0)
deltas = clipped_rhos * (
rewards + discounts * values_t_plus_1 - values)
# All sequences are reversed, computation starts from the back.
sequences = (
tf.reverse(discounts, axis=[0]),
tf.reverse(cs, axis=[0]),
tf.reverse(deltas, axis=[0]),
)
# V-trace vs are calculated through a scan from the back to the
# beginning of the given trajectory.
def scanfunc(acc, sequence_item):
discount_t, c_t, delta_t = sequence_item
return delta_t + discount_t * c_t * acc
initial_values = tf.zeros_like(bootstrap_value)
vs_minus_v_xs = tf.scan(
fn=scanfunc,
elems=sequences,
initializer=initial_values,
parallel_iterations=1,
back_prop=False,
name="scan")
# Reverse the results back to original order.
vs_minus_v_xs = tf.reverse(vs_minus_v_xs, [0], name="vs_minus_v_xs")
# Add V(x_s) to get v_s.
vs = tf.add(vs_minus_v_xs, values, name="vs")
# Advantage for policy gradient.
vs_t_plus_1 = tf.concat(
[vs[1:], tf.expand_dims(bootstrap_value, 0)], axis=0)
if clip_pg_rho_threshold is not None:
clipped_pg_rhos = tf.minimum(
clip_pg_rho_threshold, rhos, name="clipped_pg_rhos")
else:
clipped_pg_rhos = rhos
pg_advantages = (
clipped_pg_rhos * (rewards + discounts * vs_t_plus_1 - values))
# Make sure no gradients backpropagated through the returned values.
return VTraceReturns(
vs=tf.stop_gradient(vs),
pg_advantages=tf.stop_gradient(pg_advantages))
def get_log_rhos(target_action_log_probs, behaviour_action_log_probs):
"""With the selected log_probs for multi-discrete actions of behaviour
and target policies we compute the log_rhos for calculating the vtrace."""
t = tf.stack(target_action_log_probs)
b = tf.stack(behaviour_action_log_probs)
log_rhos = tf.reduce_sum(t - b, axis=0)
return log_rhos
@@ -1,303 +0,0 @@
"""Adapted from A3CTFPolicy to add V-trace.
Keep in sync with changes to A3CTFPolicy and VtraceSurrogatePolicy."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import logging
import gym
import ray
from ray.rllib.agents.impala import vtrace
from ray.rllib.models.tf.tf_action_dist import Categorical
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.policy.tf_policy_template import build_tf_policy
from ray.rllib.policy.tf_policy import LearningRateSchedule, \
EntropyCoeffSchedule
from ray.rllib.utils.explained_variance import explained_variance
from ray.rllib.utils import try_import_tf
tf = try_import_tf()
logger = logging.getLogger(__name__)
BEHAVIOUR_LOGITS = "behaviour_logits"
class VTraceLoss(object):
def __init__(self,
actions,
actions_logp,
actions_entropy,
dones,
behaviour_logits,
target_logits,
discount,
rewards,
values,
bootstrap_value,
dist_class,
valid_mask,
vf_loss_coeff=0.5,
entropy_coeff=0.01,
clip_rho_threshold=1.0,
clip_pg_rho_threshold=1.0):
"""Policy gradient loss with vtrace importance weighting.
VTraceLoss takes tensors of shape [T, B, ...], where `B` is the
batch_size. The reason we need to know `B` is for V-trace to properly
handle episode cut boundaries.
Args:
actions: An int|float32 tensor of shape [T, B, ACTION_SPACE].
actions_logp: A float32 tensor of shape [T, B].
actions_entropy: A float32 tensor of shape [T, B].
dones: A bool tensor of shape [T, B].
behaviour_logits: A list with length of ACTION_SPACE of float32
tensors of shapes
[T, B, ACTION_SPACE[0]],
...,
[T, B, ACTION_SPACE[-1]]
target_logits: A list with length of ACTION_SPACE of float32
tensors of shapes
[T, B, ACTION_SPACE[0]],
...,
[T, B, ACTION_SPACE[-1]]
discount: A float32 scalar.
rewards: A float32 tensor of shape [T, B].
values: A float32 tensor of shape [T, B].
bootstrap_value: A float32 tensor of shape [B].
dist_class: action distribution class for logits.
valid_mask: A bool tensor of valid RNN input elements (#2992).
"""
# Compute vtrace on the CPU for better perf.
with tf.device("/cpu:0"):
self.vtrace_returns = vtrace.multi_from_logits(
behaviour_policy_logits=behaviour_logits,
target_policy_logits=target_logits,
actions=tf.unstack(actions, axis=2),
discounts=tf.to_float(~dones) * discount,
rewards=rewards,
values=values,
bootstrap_value=bootstrap_value,
dist_class=dist_class,
clip_rho_threshold=tf.cast(clip_rho_threshold, tf.float32),
clip_pg_rho_threshold=tf.cast(clip_pg_rho_threshold,
tf.float32))
self.value_targets = self.vtrace_returns.vs
# The policy gradients loss
self.pi_loss = -tf.reduce_sum(
tf.boolean_mask(actions_logp * self.vtrace_returns.pg_advantages,
valid_mask))
# The baseline loss
delta = tf.boolean_mask(values - self.vtrace_returns.vs, valid_mask)
self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta))
# The entropy loss
self.entropy = tf.reduce_sum(
tf.boolean_mask(actions_entropy, valid_mask))
# The summed weighted loss
self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff -
self.entropy * entropy_coeff)
def _make_time_major(policy, tensor, drop_last=False):
"""Swaps batch and trajectory axis.
Arguments:
policy: Policy reference
tensor: A tensor or list of tensors to reshape.
drop_last: A bool indicating whether to drop the last
trajectory item.
Returns:
res: A tensor with swapped axes or a list of tensors with
swapped axes.
"""
if isinstance(tensor, list):
return [_make_time_major(policy, t, drop_last) for t in tensor]
if policy.state_in:
B = tf.shape(policy.seq_lens)[0]
T = tf.shape(tensor)[0] // B
else:
# Important: chop the tensor into batches at known episode cut
# boundaries. TODO(ekl) this is kind of a hack
T = policy.config["sample_batch_size"]
B = tf.shape(tensor)[0] // T
rs = tf.reshape(tensor, tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0))
# swap B and T axes
res = tf.transpose(
rs, [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0]))))
if drop_last:
return res[:-1]
return res
def build_vtrace_loss(policy, batch_tensors):
if isinstance(policy.action_space, gym.spaces.Discrete):
is_multidiscrete = False
output_hidden_shape = [policy.action_space.n]
elif isinstance(policy.action_space,
gym.spaces.multi_discrete.MultiDiscrete):
is_multidiscrete = True
output_hidden_shape = policy.action_space.nvec.astype(np.int32)
else:
is_multidiscrete = False
output_hidden_shape = 1
def make_time_major(*args, **kw):
return _make_time_major(policy, *args, **kw)
actions = batch_tensors[SampleBatch.ACTIONS]
dones = batch_tensors[SampleBatch.DONES]
rewards = batch_tensors[SampleBatch.REWARDS]
behaviour_logits = batch_tensors[BEHAVIOUR_LOGITS]
unpacked_behaviour_logits = tf.split(
behaviour_logits, output_hidden_shape, axis=1)
unpacked_outputs = tf.split(policy.model_out, output_hidden_shape, axis=1)
action_dist = policy.action_dist
values = policy.value_function
if policy.state_in:
max_seq_len = tf.reduce_max(policy.seq_lens) - 1
mask = tf.sequence_mask(policy.seq_lens, max_seq_len)
mask = tf.reshape(mask, [-1])
else:
mask = tf.ones_like(rewards)
# Prepare actions for loss
loss_actions = actions if is_multidiscrete else tf.expand_dims(
actions, axis=1)
# Inputs are reshaped from [B * T] => [T - 1, B] for V-trace calc.
policy.loss = VTraceLoss(
actions=make_time_major(loss_actions, drop_last=True),
actions_logp=make_time_major(
action_dist.logp(actions), drop_last=True),
actions_entropy=make_time_major(
action_dist.multi_entropy(), drop_last=True),
dones=make_time_major(dones, drop_last=True),
behaviour_logits=make_time_major(
unpacked_behaviour_logits, drop_last=True),
target_logits=make_time_major(unpacked_outputs, drop_last=True),
discount=policy.config["gamma"],
rewards=make_time_major(rewards, drop_last=True),
values=make_time_major(values, drop_last=True),
bootstrap_value=make_time_major(values)[-1],
dist_class=Categorical if is_multidiscrete else policy.dist_class,
valid_mask=make_time_major(mask, drop_last=True),
vf_loss_coeff=policy.config["vf_loss_coeff"],
entropy_coeff=policy.entropy_coeff,
clip_rho_threshold=policy.config["vtrace_clip_rho_threshold"],
clip_pg_rho_threshold=policy.config["vtrace_clip_pg_rho_threshold"])
return policy.loss.total_loss
def stats(policy, batch_tensors):
values_batched = _make_time_major(
policy, policy.value_function, drop_last=policy.config["vtrace"])
return {
"cur_lr": tf.cast(policy.cur_lr, tf.float64),
"policy_loss": policy.loss.pi_loss,
"entropy": policy.loss.entropy,
"entropy_coeff": tf.cast(policy.entropy_coeff, tf.float64),
"var_gnorm": tf.global_norm(policy.var_list),
"vf_loss": policy.loss.vf_loss,
"vf_explained_var": explained_variance(
tf.reshape(policy.loss.value_targets, [-1]),
tf.reshape(values_batched, [-1])),
}
def grad_stats(policy, grads):
return {
"grad_gnorm": tf.global_norm(grads),
}
def postprocess_trajectory(policy,
sample_batch,
other_agent_batches=None,
episode=None):
# not used, so save some bandwidth
del sample_batch.data[SampleBatch.NEXT_OBS]
return sample_batch
def add_behaviour_logits(policy):
return {BEHAVIOUR_LOGITS: policy.model_out}
def validate_config(policy, obs_space, action_space, config):
if config["vtrace"]:
assert config["batch_mode"] == "truncate_episodes", \
"Must use `truncate_episodes` batch mode with V-trace."
def choose_optimizer(policy, config):
if policy.config["opt_type"] == "adam":
return tf.train.AdamOptimizer(policy.cur_lr)
else:
return tf.train.RMSPropOptimizer(policy.cur_lr, config["decay"],
config["momentum"], config["epsilon"])
def clip_gradients(policy, optimizer, loss):
grads = tf.gradients(loss, policy.var_list)
policy.grads, _ = tf.clip_by_global_norm(grads, policy.config["grad_clip"])
clipped_grads = list(zip(policy.grads, policy.var_list))
return clipped_grads
class ValueNetworkMixin(object):
def __init__(self):
self.value_function = self.model.value_function()
self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
tf.get_variable_scope().name)
def value(self, ob, *args):
feed_dict = {
self.get_placeholder(SampleBatch.CUR_OBS): [ob],
self.seq_lens: [1]
}
assert len(args) == len(self.state_in), \
(args, self.state_in)
for k, v in zip(self.state_in, args):
feed_dict[k] = v
vf = self.get_session().run(self.value_function, feed_dict)
return vf[0]
def setup_mixins(policy, obs_space, action_space, config):
LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])
EntropyCoeffSchedule.__init__(policy, config["entropy_coeff"],
config["entropy_coeff_schedule"])
ValueNetworkMixin.__init__(policy)
VTraceTFPolicy = build_tf_policy(
name="VTraceTFPolicy",
get_default_config=lambda: ray.rllib.agents.impala.impala.DEFAULT_CONFIG,
loss_fn=build_vtrace_loss,
stats_fn=stats,
grad_stats_fn=grad_stats,
postprocess_fn=postprocess_trajectory,
optimizer_fn=choose_optimizer,
gradients_fn=clip_gradients,
extra_action_fetches_fn=add_behaviour_logits,
before_init=validate_config,
before_loss_init=setup_mixins,
mixins=[LearningRateSchedule, EntropyCoeffSchedule, ValueNetworkMixin],
get_batch_divisibility_req=lambda p: p.config["sample_batch_size"])
@@ -1,270 +0,0 @@
# Copyright 2018 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for V-trace.
For details and theory see:
"IMPALA: Scalable Distributed Deep-RL with
Importance Weighted Actor-Learner Architectures"
by Espeholt, Soyer, Munos et al.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from absl.testing import parameterized
import numpy as np
import vtrace
from ray.rllib.utils import try_import_tf
tf = try_import_tf()
def _shaped_arange(*shape):
"""Runs np.arange, converts to float and reshapes."""
return np.arange(np.prod(shape), dtype=np.float32).reshape(*shape)
def _softmax(logits):
"""Applies softmax non-linearity on inputs."""
return np.exp(logits) / np.sum(np.exp(logits), axis=-1, keepdims=True)
def _ground_truth_calculation(discounts, log_rhos, rewards, values,
bootstrap_value, clip_rho_threshold,
clip_pg_rho_threshold):
"""Calculates the ground truth for V-trace in Python/Numpy."""
vs = []
seq_len = len(discounts)
rhos = np.exp(log_rhos)
cs = np.minimum(rhos, 1.0)
clipped_rhos = rhos
if clip_rho_threshold:
clipped_rhos = np.minimum(rhos, clip_rho_threshold)
clipped_pg_rhos = rhos
if clip_pg_rho_threshold:
clipped_pg_rhos = np.minimum(rhos, clip_pg_rho_threshold)
# This is a very inefficient way to calculate the V-trace ground truth.
# We calculate it this way because it is close to the mathematical notation
# of
# V-trace.
# v_s = V(x_s)
# + \sum^{T-1}_{t=s} \gamma^{t-s}
# * \prod_{i=s}^{t-1} c_i
# * \rho_t (r_t + \gamma V(x_{t+1}) - V(x_t))
# Note that when we take the product over c_i, we write `s:t` as the
# notation
# of the paper is inclusive of the `t-1`, but Python is exclusive.
# Also note that np.prod([]) == 1.
values_t_plus_1 = np.concatenate(
[values, bootstrap_value[None, :]], axis=0)
for s in range(seq_len):
v_s = np.copy(values[s]) # Very important copy.
for t in range(s, seq_len):
v_s += (np.prod(discounts[s:t], axis=0) * np.prod(cs[s:t], axis=0)
* clipped_rhos[t] * (rewards[t] + discounts[t] *
values_t_plus_1[t + 1] - values[t]))
vs.append(v_s)
vs = np.stack(vs, axis=0)
pg_advantages = (clipped_pg_rhos * (rewards + discounts * np.concatenate(
[vs[1:], bootstrap_value[None, :]], axis=0) - values))
return vtrace.VTraceReturns(vs=vs, pg_advantages=pg_advantages)
class LogProbsFromLogitsAndActionsTest(tf.test.TestCase,
parameterized.TestCase):
@parameterized.named_parameters(("Batch1", 1), ("Batch2", 2))
def test_log_probs_from_logits_and_actions(self, batch_size):
"""Tests log_probs_from_logits_and_actions."""
seq_len = 7
num_actions = 3
policy_logits = _shaped_arange(seq_len, batch_size, num_actions) + 10
actions = np.random.randint(
0, num_actions - 1, size=(seq_len, batch_size), dtype=np.int32)
action_log_probs_tensor = vtrace.log_probs_from_logits_and_actions(
policy_logits, actions)
# Ground Truth
# Using broadcasting to create a mask that indexes action logits
action_index_mask = actions[..., None] == np.arange(num_actions)
def index_with_mask(array, mask):
return array[mask].reshape(*array.shape[:-1])
# Note: Normally log(softmax) is not a good idea because it's not
# numerically stable. However, in this test we have well-behaved
# values.
ground_truth_v = index_with_mask(
np.log(_softmax(policy_logits)), action_index_mask)
with self.test_session() as session:
self.assertAllClose(ground_truth_v,
session.run(action_log_probs_tensor))
class VtraceTest(tf.test.TestCase, parameterized.TestCase):
@parameterized.named_parameters(("Batch1", 1), ("Batch5", 5))
def test_vtrace(self, batch_size):
"""Tests V-trace against ground truth data calculated in python."""
seq_len = 5
# Create log_rhos such that rho will span from near-zero to above the
# clipping thresholds. In particular, calculate log_rhos in
# [-2.5, 2.5),
# so that rho is in approx [0.08, 12.2).
log_rhos = _shaped_arange(seq_len, batch_size) / (batch_size * seq_len)
log_rhos = 5 * (log_rhos - 0.5) # [0.0, 1.0) -> [-2.5, 2.5).
values = {
"log_rhos": log_rhos,
# T, B where B_i: [0.9 / (i+1)] * T
"discounts": np.array([[0.9 / (b + 1) for b in range(batch_size)]
for _ in range(seq_len)]),
"rewards": _shaped_arange(seq_len, batch_size),
"values": _shaped_arange(seq_len, batch_size) / batch_size,
"bootstrap_value": _shaped_arange(batch_size) + 1.0,
"clip_rho_threshold": 3.7,
"clip_pg_rho_threshold": 2.2,
}
output = vtrace.from_importance_weights(**values)
with self.test_session() as session:
output_v = session.run(output)
ground_truth_v = _ground_truth_calculation(**values)
for a, b in zip(ground_truth_v, output_v):
self.assertAllClose(a, b)
@parameterized.named_parameters(("Batch1", 1), ("Batch2", 2))
def test_vtrace_from_logits(self, batch_size):
"""Tests V-trace calculated from logits."""
seq_len = 5
num_actions = 3
clip_rho_threshold = None # No clipping.
clip_pg_rho_threshold = None # No clipping.
# Intentionally leaving shapes unspecified to test if V-trace can
# deal with that.
placeholders = {
# T, B, NUM_ACTIONS
"behaviour_policy_logits": tf.placeholder(
dtype=tf.float32, shape=[None, None, None]),
# T, B, NUM_ACTIONS
"target_policy_logits": tf.placeholder(
dtype=tf.float32, shape=[None, None, None]),
"actions": tf.placeholder(dtype=tf.int32, shape=[None, None]),
"discounts": tf.placeholder(dtype=tf.float32, shape=[None, None]),
"rewards": tf.placeholder(dtype=tf.float32, shape=[None, None]),
"values": tf.placeholder(dtype=tf.float32, shape=[None, None]),
"bootstrap_value": tf.placeholder(dtype=tf.float32, shape=[None]),
}
from_logits_output = vtrace.from_logits(
clip_rho_threshold=clip_rho_threshold,
clip_pg_rho_threshold=clip_pg_rho_threshold,
**placeholders)
target_log_probs = vtrace.log_probs_from_logits_and_actions(
placeholders["target_policy_logits"], placeholders["actions"])
behaviour_log_probs = vtrace.log_probs_from_logits_and_actions(
placeholders["behaviour_policy_logits"], placeholders["actions"])
log_rhos = target_log_probs - behaviour_log_probs
ground_truth = (log_rhos, behaviour_log_probs, target_log_probs)
values = {
"behaviour_policy_logits": _shaped_arange(seq_len, batch_size,
num_actions),
"target_policy_logits": _shaped_arange(seq_len, batch_size,
num_actions),
"actions": np.random.randint(
0, num_actions - 1, size=(seq_len, batch_size)),
"discounts": np.array( # T, B where B_i: [0.9 / (i+1)] * T
[[0.9 / (b + 1) for b in range(batch_size)]
for _ in range(seq_len)]),
"rewards": _shaped_arange(seq_len, batch_size),
"values": _shaped_arange(seq_len, batch_size) / batch_size,
"bootstrap_value": _shaped_arange(batch_size) + 1.0, # B
}
feed_dict = {placeholders[k]: v for k, v in values.items()}
with self.test_session() as session:
from_logits_output_v = session.run(
from_logits_output, feed_dict=feed_dict)
(ground_truth_log_rhos, ground_truth_behaviour_action_log_probs,
ground_truth_target_action_log_probs) = session.run(
ground_truth, feed_dict=feed_dict)
# Calculate V-trace using the ground truth logits.
from_iw = vtrace.from_importance_weights(
log_rhos=ground_truth_log_rhos,
discounts=values["discounts"],
rewards=values["rewards"],
values=values["values"],
bootstrap_value=values["bootstrap_value"],
clip_rho_threshold=clip_rho_threshold,
clip_pg_rho_threshold=clip_pg_rho_threshold)
with self.test_session() as session:
from_iw_v = session.run(from_iw)
self.assertAllClose(from_iw_v.vs, from_logits_output_v.vs)
self.assertAllClose(from_iw_v.pg_advantages,
from_logits_output_v.pg_advantages)
self.assertAllClose(ground_truth_behaviour_action_log_probs,
from_logits_output_v.behaviour_action_log_probs)
self.assertAllClose(ground_truth_target_action_log_probs,
from_logits_output_v.target_action_log_probs)
self.assertAllClose(ground_truth_log_rhos,
from_logits_output_v.log_rhos)
def test_higher_rank_inputs_for_importance_weights(self):
"""Checks support for additional dimensions in inputs."""
placeholders = {
"log_rhos": tf.placeholder(
dtype=tf.float32, shape=[None, None, 1]),
"discounts": tf.placeholder(
dtype=tf.float32, shape=[None, None, 1]),
"rewards": tf.placeholder(
dtype=tf.float32, shape=[None, None, 42]),
"values": tf.placeholder(dtype=tf.float32, shape=[None, None, 42]),
"bootstrap_value": tf.placeholder(
dtype=tf.float32, shape=[None, 42])
}
output = vtrace.from_importance_weights(**placeholders)
self.assertEqual(output.vs.shape.as_list()[-1], 42)
def test_inconsistent_rank_inputs_for_importance_weights(self):
"""Test one of many possible errors in shape of inputs."""
placeholders = {
"log_rhos": tf.placeholder(
dtype=tf.float32, shape=[None, None, 1]),
"discounts": tf.placeholder(
dtype=tf.float32, shape=[None, None, 1]),
"rewards": tf.placeholder(
dtype=tf.float32, shape=[None, None, 42]),
"values": tf.placeholder(dtype=tf.float32, shape=[None, None, 42]),
# Should be [None, 42].
"bootstrap_value": tf.placeholder(dtype=tf.float32, shape=[None])
}
with self.assertRaisesRegexp(ValueError, "must have rank 2"):
vtrace.from_importance_weights(**placeholders)
if __name__ == "__main__":
tf.test.main()
@@ -1,7 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ray.rllib.agents.marwil.marwil import MARWILTrainer, DEFAULT_CONFIG
__all__ = ["MARWILTrainer", "DEFAULT_CONFIG"]
-55
View File
@@ -1,55 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ray.rllib.agents.trainer import with_common_config
from ray.rllib.agents.trainer_template import build_trainer
from ray.rllib.agents.marwil.marwil_policy import MARWILPolicy
from ray.rllib.optimizers import SyncBatchReplayOptimizer
# yapf: disable
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_common_config({
# You should override this to point to an offline dataset (see agent.py).
"input": "sampler",
# Use importance sampling estimators for reward
"input_evaluation": ["is", "wis"],
# Scaling of advantages in exponential terms
# When beta is 0, MARWIL is reduced to imitation learning
"beta": 1.0,
# Balancing value estimation loss and policy optimization loss
"vf_coeff": 1.0,
# Whether to calculate cumulative rewards
"postprocess_inputs": True,
# Whether to rollout "complete_episodes" or "truncate_episodes"
"batch_mode": "complete_episodes",
# Learning rate for adam optimizer
"lr": 1e-4,
# Number of timesteps collected for each SGD round
"train_batch_size": 2000,
# Number of steps max to keep in the batch replay buffer
"replay_buffer_size": 100000,
# Number of steps to read before learning starts
"learning_starts": 0,
# === Parallelism ===
"num_workers": 0,
})
# __sphinx_doc_end__
# yapf: enable
def make_optimizer(workers, config):
return SyncBatchReplayOptimizer(
workers,
learning_starts=config["learning_starts"],
buffer_size=config["replay_buffer_size"],
train_batch_size=config["train_batch_size"],
)
MARWILTrainer = build_trainer(
name="MARWIL",
default_config=DEFAULT_CONFIG,
default_policy=MARWILPolicy,
make_policy_optimizer=make_optimizer)
@@ -1,175 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import ray
from ray.rllib.models import ModelCatalog
from ray.rllib.evaluation.postprocessing import compute_advantages, \
Postprocessing
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY
from ray.rllib.utils.annotations import override
from ray.rllib.policy.policy import Policy
from ray.rllib.policy.tf_policy import TFPolicy
from ray.rllib.utils.explained_variance import explained_variance
from ray.rllib.utils import try_import_tf
from ray.rllib.utils.tf_ops import scope_vars
tf = try_import_tf()
POLICY_SCOPE = "p_func"
VALUE_SCOPE = "v_func"
class ValueLoss(object):
def __init__(self, state_values, cumulative_rewards):
self.loss = 0.5 * tf.reduce_mean(
tf.square(state_values - cumulative_rewards))
class ReweightedImitationLoss(object):
def __init__(self, state_values, cumulative_rewards, logits, actions,
action_space, beta):
ma_adv_norm = tf.get_variable(
name="moving_average_of_advantage_norm",
dtype=tf.float32,
initializer=100.0,
trainable=False)
# advantage estimation
adv = cumulative_rewards - state_values
# update averaged advantage norm
update_adv_norm = tf.assign_add(
ref=ma_adv_norm,
value=1e-6 * (tf.reduce_mean(tf.square(adv)) - ma_adv_norm))
# exponentially weighted advantages
with tf.control_dependencies([update_adv_norm]):
exp_advs = tf.exp(
beta * tf.divide(adv, 1e-8 + tf.sqrt(ma_adv_norm)))
# log\pi_\theta(a|s)
dist_cls, _ = ModelCatalog.get_action_dist(action_space, {})
action_dist = dist_cls(logits)
logprobs = action_dist.logp(actions)
self.loss = -1.0 * tf.reduce_mean(
tf.stop_gradient(exp_advs) * logprobs)
class MARWILPostprocessing(object):
"""Adds the advantages field to the trajectory."""
@override(Policy)
def postprocess_trajectory(self,
sample_batch,
other_agent_batches=None,
episode=None):
completed = sample_batch["dones"][-1]
if completed:
last_r = 0.0
else:
raise NotImplementedError(
"last done mask in a batch should be True. "
"For now, we only support reading experience batches produced "
"with batch_mode='complete_episodes'.",
len(sample_batch[SampleBatch.DONES]),
sample_batch[SampleBatch.DONES][-1])
batch = compute_advantages(
sample_batch, last_r, gamma=self.config["gamma"], use_gae=False)
return batch
class MARWILPolicy(MARWILPostprocessing, TFPolicy):
def __init__(self, observation_space, action_space, config):
config = dict(ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG, **config)
self.config = config
dist_cls, logit_dim = ModelCatalog.get_action_dist(
action_space, self.config["model"])
# Action inputs
self.obs_t = tf.placeholder(
tf.float32, shape=(None, ) + observation_space.shape)
prev_actions_ph = ModelCatalog.get_action_placeholder(action_space)
prev_rewards_ph = tf.placeholder(
tf.float32, [None], name="prev_reward")
with tf.variable_scope(POLICY_SCOPE) as scope:
self.model = ModelCatalog.get_model({
"obs": self.obs_t,
"prev_actions": prev_actions_ph,
"prev_rewards": prev_rewards_ph,
"is_training": self._get_is_training_placeholder(),
}, observation_space, action_space, logit_dim,
self.config["model"])
logits = self.model.outputs
self.p_func_vars = scope_vars(scope.name)
# Action outputs
action_dist = dist_cls(logits)
self.output_actions = action_dist.sample()
# Training inputs
self.act_t = ModelCatalog.get_action_placeholder(action_space)
self.cum_rew_t = tf.placeholder(tf.float32, [None], name="reward")
# v network evaluation
with tf.variable_scope(VALUE_SCOPE) as scope:
state_values = self.model.value_function()
self.v_func_vars = scope_vars(scope.name)
self.v_loss = self._build_value_loss(state_values, self.cum_rew_t)
self.p_loss = self._build_policy_loss(state_values, self.cum_rew_t,
logits, self.act_t, action_space)
# which kind of objective to optimize
objective = (
self.p_loss.loss + self.config["vf_coeff"] * self.v_loss.loss)
self.explained_variance = tf.reduce_mean(
explained_variance(self.cum_rew_t, state_values))
# initialize TFPolicy
self.sess = tf.get_default_session()
self.loss_inputs = [
(SampleBatch.CUR_OBS, self.obs_t),
(SampleBatch.ACTIONS, self.act_t),
(Postprocessing.ADVANTAGES, self.cum_rew_t),
]
TFPolicy.__init__(
self,
observation_space,
action_space,
self.sess,
obs_input=self.obs_t,
action_sampler=self.output_actions,
action_prob=action_dist.sampled_action_prob(),
loss=objective,
model=self.model,
loss_inputs=self.loss_inputs,
state_inputs=self.model.state_in,
state_outputs=self.model.state_out,
prev_action_input=prev_actions_ph,
prev_reward_input=prev_rewards_ph)
self.sess.run(tf.global_variables_initializer())
self.stats_fetches = {
"total_loss": objective,
"vf_explained_var": self.explained_variance,
"policy_loss": self.p_loss.loss,
"vf_loss": self.v_loss.loss
}
def _build_value_loss(self, state_values, cum_rwds):
return ValueLoss(state_values, cum_rwds)
def _build_policy_loss(self, state_values, cum_rwds, logits, actions,
action_space):
return ReweightedImitationLoss(state_values, cum_rwds, logits, actions,
action_space, self.config["beta"])
@override(TFPolicy)
def extra_compute_grad_fetches(self):
return {LEARNER_STATS_KEY: self.stats_fetches}
@override(Policy)
def get_initial_state(self):
return self.model.state_init
-128
View File
@@ -1,128 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import pickle
import numpy as np
from ray.tune import result as tune_result
from ray.rllib.agents.trainer import Trainer, with_common_config
class _MockTrainer(Trainer):
"""Mock trainer for use in tests"""
_name = "MockTrainer"
_default_config = with_common_config({
"mock_error": False,
"persistent_error": False,
"test_variable": 1,
"num_workers": 0,
"user_checkpoint_freq": 0,
})
@classmethod
def default_resource_request(cls, config):
return None
def _init(self, config, env_creator):
self.info = None
self.restored = False
def _train(self):
if self.config["mock_error"] and self.iteration == 1 \
and (self.config["persistent_error"] or not self.restored):
raise Exception("mock error")
result = dict(
episode_reward_mean=10,
episode_len_mean=10,
timesteps_this_iter=10,
info={})
if self.config["user_checkpoint_freq"] > 0 and self.iteration > 0:
if self.iteration % self.config["user_checkpoint_freq"] == 0:
result.update({tune_result.SHOULD_CHECKPOINT: True})
return result
def _save(self, checkpoint_dir):
path = os.path.join(checkpoint_dir, "mock_agent.pkl")
with open(path, "wb") as f:
pickle.dump(self.info, f)
return path
def _restore(self, checkpoint_path):
with open(checkpoint_path, "rb") as f:
info = pickle.load(f)
self.info = info
self.restored = True
def _register_if_needed(self, env_object):
pass
def set_info(self, info):
self.info = info
return info
def get_info(self):
return self.info
class _SigmoidFakeData(_MockTrainer):
"""Trainer that returns sigmoid learning curves.
This can be helpful for evaluating early stopping algorithms."""
_name = "SigmoidFakeData"
_default_config = with_common_config({
"width": 100,
"height": 100,
"offset": 0,
"iter_time": 10,
"iter_timesteps": 1,
"num_workers": 0,
})
def _train(self):
i = max(0, self.iteration - self.config["offset"])
v = np.tanh(float(i) / self.config["width"])
v *= self.config["height"]
return dict(
episode_reward_mean=v,
episode_len_mean=v,
timesteps_this_iter=self.config["iter_timesteps"],
time_this_iter_s=self.config["iter_time"],
info={})
class _ParameterTuningTrainer(_MockTrainer):
_name = "ParameterTuningTrainer"
_default_config = with_common_config({
"reward_amt": 10,
"dummy_param": 10,
"dummy_param2": 15,
"iter_time": 10,
"iter_timesteps": 1,
"num_workers": 0,
})
def _train(self):
return dict(
episode_reward_mean=self.config["reward_amt"] * self.iteration,
episode_len_mean=self.config["reward_amt"],
timesteps_this_iter=self.config["iter_timesteps"],
time_this_iter_s=self.config["iter_time"],
info={})
def _agent_import_failed(trace):
"""Returns dummy agent class for if PyTorch etc. is not installed."""
class _AgentImportFailed(Trainer):
_name = "AgentImportFailed"
_default_config = with_common_config({})
def _setup(self, config):
raise ImportError(trace)
return _AgentImportFailed
-6
View File
@@ -1,6 +0,0 @@
from ray.rllib.agents.pg.pg import PGTrainer, DEFAULT_CONFIG
from ray.rllib.utils import renamed_agent
PGAgent = renamed_agent(PGTrainer)
__all__ = ["PGAgent", "PGTrainer", "DEFAULT_CONFIG"]
-35
View File
@@ -1,35 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ray.rllib.agents.trainer import with_common_config
from ray.rllib.agents.trainer_template import build_trainer
from ray.rllib.agents.pg.pg_policy import PGTFPolicy
# yapf: disable
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_common_config({
# No remote workers by default
"num_workers": 0,
# Learning rate
"lr": 0.0004,
# Use PyTorch as backend
"use_pytorch": False,
})
# __sphinx_doc_end__
# yapf: enable
def get_policy_class(config):
if config["use_pytorch"]:
from ray.rllib.agents.pg.torch_pg_policy import PGTorchPolicy
return PGTorchPolicy
else:
return PGTFPolicy
PGTrainer = build_trainer(
name="PG",
default_config=DEFAULT_CONFIG,
default_policy=PGTFPolicy,
get_policy_class=get_policy_class)
-35
View File
@@ -1,35 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import ray
from ray.rllib.evaluation.postprocessing import compute_advantages, \
Postprocessing
from ray.rllib.policy.tf_policy_template import build_tf_policy
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.utils import try_import_tf
tf = try_import_tf()
# The basic policy gradients loss
def policy_gradient_loss(policy, batch_tensors):
actions = batch_tensors[SampleBatch.ACTIONS]
advantages = batch_tensors[Postprocessing.ADVANTAGES]
return -tf.reduce_mean(policy.action_dist.logp(actions) * advantages)
# This adds the "advantages" column to the sample batch.
def postprocess_advantages(policy,
sample_batch,
other_agent_batches=None,
episode=None):
return compute_advantages(
sample_batch, 0.0, policy.config["gamma"], use_gae=False)
PGTFPolicy = build_tf_policy(
name="PGTFPolicy",
get_default_config=lambda: ray.rllib.agents.pg.pg.DEFAULT_CONFIG,
postprocess_fn=postprocess_advantages,
loss_fn=policy_gradient_loss)
@@ -1,42 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import ray
from ray.rllib.evaluation.postprocessing import compute_advantages, \
Postprocessing
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.policy.torch_policy_template import build_torch_policy
def pg_torch_loss(policy, batch_tensors):
logits, _ = policy.model({
SampleBatch.CUR_OBS: batch_tensors[SampleBatch.CUR_OBS]
})
action_dist = policy.dist_class(logits)
log_probs = action_dist.logp(batch_tensors[SampleBatch.ACTIONS])
# save the error in the policy object
policy.pi_err = -batch_tensors[Postprocessing.ADVANTAGES].dot(
log_probs.reshape(-1))
return policy.pi_err
def postprocess_advantages(policy,
sample_batch,
other_agent_batches=None,
episode=None):
return compute_advantages(
sample_batch, 0.0, policy.config["gamma"], use_gae=False)
def pg_loss_stats(policy, batch_tensors):
# the error is recorded when computing the loss
return {"policy_loss": policy.pi_err.item()}
PGTorchPolicy = build_torch_policy(
name="PGTorchPolicy",
get_default_config=lambda: ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG,
loss_fn=pg_torch_loss,
stats_fn=pg_loss_stats,
postprocess_fn=postprocess_advantages)
-7
View File
@@ -1,7 +0,0 @@
from ray.rllib.agents.ppo.ppo import PPOTrainer, DEFAULT_CONFIG
from ray.rllib.agents.ppo.appo import APPOTrainer
from ray.rllib.utils import renamed_agent
PPOAgent = renamed_agent(PPOTrainer)
__all__ = ["PPOAgent", "APPOTrainer", "PPOTrainer", "DEFAULT_CONFIG"]
-91
View File
@@ -1,91 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ray.rllib.agents.ppo.appo_policy import AsyncPPOTFPolicy
from ray.rllib.agents.trainer import with_base_config
from ray.rllib.agents.ppo.ppo import update_kl
from ray.rllib.agents import impala
# yapf: disable
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_base_config(impala.DEFAULT_CONFIG, {
# Whether to use V-trace weighted advantages. If false, PPO GAE advantages
# will be used instead.
"vtrace": False,
# == These two options only apply if vtrace: False ==
# If true, use the Generalized Advantage Estimator (GAE)
# with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
"use_gae": True,
# GAE(lambda) parameter
"lambda": 1.0,
# == PPO surrogate loss options ==
"clip_param": 0.4,
# == PPO KL Loss options ==
"use_kl_loss": False,
"kl_coeff": 1.0,
"kl_target": 0.01,
# == IMPALA optimizer params (see documentation in impala.py) ==
"sample_batch_size": 50,
"train_batch_size": 500,
"min_iter_time_s": 10,
"num_workers": 2,
"num_gpus": 0,
"num_data_loader_buffers": 1,
"minibatch_buffer_size": 1,
"num_sgd_iter": 1,
"replay_proportion": 0.0,
"replay_buffer_num_slots": 100,
"learner_queue_size": 16,
"learner_queue_timeout": 300,
"max_sample_requests_in_flight_per_worker": 2,
"broadcast_interval": 1,
"grad_clip": 40.0,
"opt_type": "adam",
"lr": 0.0005,
"lr_schedule": None,
"decay": 0.99,
"momentum": 0.0,
"epsilon": 0.1,
"vf_loss_coeff": 0.5,
"entropy_coeff": 0.01,
"entropy_coeff_schedule": None,
})
# __sphinx_doc_end__
# yapf: enable
def update_target_and_kl(trainer, fetches):
# Update the KL coeff depending on how many steps LearnerThread has stepped
# through
learner_steps = trainer.optimizer.learner.num_steps
if learner_steps >= trainer.target_update_frequency:
# Update Target Network
trainer.optimizer.learner.num_steps = 0
trainer.workers.local_worker().foreach_trainable_policy(
lambda p, _: p.update_target())
# Also update KL Coeff
if trainer.config["use_kl_loss"]:
update_kl(trainer, trainer.optimizer.learner.stats)
def initialize_target(trainer):
trainer.workers.local_worker().foreach_trainable_policy(
lambda p, _: p.update_target())
trainer.target_update_frequency = trainer.config["num_sgd_iter"] \
* trainer.config["minibatch_buffer_size"]
APPOTrainer = impala.ImpalaTrainer.with_updates(
name="APPO",
default_config=DEFAULT_CONFIG,
default_policy=AsyncPPOTFPolicy,
get_policy_class=lambda _: AsyncPPOTFPolicy,
after_init=initialize_target,
after_optimizer_step=update_target_and_kl)
-440
View File
@@ -1,440 +0,0 @@
"""Adapted from VTraceTFPolicy to use the PPO surrogate loss.
Keep in sync with changes to VTraceTFPolicy."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import logging
import gym
from ray.rllib.agents.impala import vtrace
from ray.rllib.agents.impala.vtrace_policy import _make_time_major, \
BEHAVIOUR_LOGITS, clip_gradients, \
validate_config, choose_optimizer, ValueNetworkMixin
from ray.rllib.evaluation.postprocessing import Postprocessing
from ray.rllib.models.tf.tf_action_dist import Categorical
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.evaluation.postprocessing import compute_advantages
from ray.rllib.utils import try_import_tf
from ray.rllib.policy.tf_policy_template import build_tf_policy
from ray.rllib.policy.tf_policy import LearningRateSchedule
from ray.rllib.agents.ppo.ppo_policy import KLCoeffMixin
from ray.rllib.models import ModelCatalog
from ray.rllib.utils.explained_variance import explained_variance
tf = try_import_tf()
POLICY_SCOPE = "func"
TARGET_POLICY_SCOPE = "target_func"
logger = logging.getLogger(__name__)
class PPOSurrogateLoss(object):
"""Loss used when V-trace is disabled.
Arguments:
prev_actions_logp: A float32 tensor of shape [T, B].
actions_logp: A float32 tensor of shape [T, B].
action_kl: A float32 tensor of shape [T, B].
actions_entropy: A float32 tensor of shape [T, B].
values: A float32 tensor of shape [T, B].
valid_mask: A bool tensor of valid RNN input elements (#2992).
advantages: A float32 tensor of shape [T, B].
value_targets: A float32 tensor of shape [T, B].
vf_loss_coeff (float): Coefficient of the value function loss.
entropy_coeff (float): Coefficient of the entropy regularizer.
clip_param (float): Clip parameter.
cur_kl_coeff (float): Coefficient for KL loss.
use_kl_loss (bool): If true, use KL loss.
"""
def __init__(self,
prev_actions_logp,
actions_logp,
action_kl,
actions_entropy,
values,
valid_mask,
advantages,
value_targets,
vf_loss_coeff=0.5,
entropy_coeff=0.01,
clip_param=0.3,
cur_kl_coeff=None,
use_kl_loss=False):
def reduce_mean_valid(t):
return tf.reduce_mean(tf.boolean_mask(t, valid_mask))
logp_ratio = tf.exp(actions_logp - prev_actions_logp)
surrogate_loss = tf.minimum(
advantages * logp_ratio,
advantages * tf.clip_by_value(logp_ratio, 1 - clip_param,
1 + clip_param))
self.mean_kl = reduce_mean_valid(action_kl)
self.pi_loss = -reduce_mean_valid(surrogate_loss)
# The baseline loss
delta = values - value_targets
self.value_targets = value_targets
self.vf_loss = 0.5 * reduce_mean_valid(tf.square(delta))
# The entropy loss
self.entropy = reduce_mean_valid(actions_entropy)
# The summed weighted loss
self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff -
self.entropy * entropy_coeff)
# Optional additional KL Loss
if use_kl_loss:
self.total_loss += cur_kl_coeff * self.mean_kl
class VTraceSurrogateLoss(object):
def __init__(self,
actions,
prev_actions_logp,
actions_logp,
old_policy_actions_logp,
action_kl,
actions_entropy,
dones,
behaviour_logits,
old_policy_behaviour_logits,
target_logits,
discount,
rewards,
values,
bootstrap_value,
dist_class,
valid_mask,
vf_loss_coeff=0.5,
entropy_coeff=0.01,
clip_rho_threshold=1.0,
clip_pg_rho_threshold=1.0,
clip_param=0.3,
cur_kl_coeff=None,
use_kl_loss=False):
"""APPO Loss, with IS modifications and V-trace for Advantage Estimation
VTraceLoss takes tensors of shape [T, B, ...], where `B` is the
batch_size. The reason we need to know `B` is for V-trace to properly
handle episode cut boundaries.
Arguments:
actions: An int|float32 tensor of shape [T, B, logit_dim].
prev_actions_logp: A float32 tensor of shape [T, B].
actions_logp: A float32 tensor of shape [T, B].
old_policy_actions_logp: A float32 tensor of shape [T, B].
action_kl: A float32 tensor of shape [T, B].
actions_entropy: A float32 tensor of shape [T, B].
dones: A bool tensor of shape [T, B].
behaviour_logits: A float32 tensor of shape [T, B, logit_dim].
old_policy_behaviour_logits: A float32 tensor of shape
[T, B, logit_dim].
target_logits: A float32 tensor of shape [T, B, logit_dim].
discount: A float32 scalar.
rewards: A float32 tensor of shape [T, B].
values: A float32 tensor of shape [T, B].
bootstrap_value: A float32 tensor of shape [B].
dist_class: action distribution class for logits.
valid_mask: A bool tensor of valid RNN input elements (#2992).
vf_loss_coeff (float): Coefficient of the value function loss.
entropy_coeff (float): Coefficient of the entropy regularizer.
clip_param (float): Clip parameter.
cur_kl_coeff (float): Coefficient for KL loss.
use_kl_loss (bool): If true, use KL loss.
"""
def reduce_mean_valid(t):
return tf.reduce_mean(tf.boolean_mask(t, valid_mask))
# Compute vtrace on the CPU for better perf.
with tf.device("/cpu:0"):
self.vtrace_returns = vtrace.multi_from_logits(
behaviour_policy_logits=behaviour_logits,
target_policy_logits=old_policy_behaviour_logits,
actions=tf.unstack(actions, axis=2),
discounts=tf.to_float(~dones) * discount,
rewards=rewards,
values=values,
bootstrap_value=bootstrap_value,
dist_class=dist_class,
clip_rho_threshold=tf.cast(clip_rho_threshold, tf.float32),
clip_pg_rho_threshold=tf.cast(clip_pg_rho_threshold,
tf.float32))
self.is_ratio = tf.clip_by_value(
tf.exp(prev_actions_logp - old_policy_actions_logp), 0.0, 2.0)
logp_ratio = self.is_ratio * tf.exp(actions_logp - prev_actions_logp)
advantages = self.vtrace_returns.pg_advantages
surrogate_loss = tf.minimum(
advantages * logp_ratio,
advantages * tf.clip_by_value(logp_ratio, 1 - clip_param,
1 + clip_param))
self.mean_kl = reduce_mean_valid(action_kl)
self.pi_loss = -reduce_mean_valid(surrogate_loss)
# The baseline loss
delta = values - self.vtrace_returns.vs
self.value_targets = self.vtrace_returns.vs
self.vf_loss = 0.5 * reduce_mean_valid(tf.square(delta))
# The entropy loss
self.entropy = reduce_mean_valid(actions_entropy)
# The summed weighted loss
self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff -
self.entropy * entropy_coeff)
# Optional additional KL Loss
if use_kl_loss:
self.total_loss += cur_kl_coeff * self.mean_kl
def build_appo_model(policy, obs_space, action_space, config):
policy.model = ModelCatalog.get_model_v2(
obs_space,
action_space,
policy.logit_dim,
config["model"],
name=POLICY_SCOPE,
framework="tf")
policy.target_model = ModelCatalog.get_model_v2(
obs_space,
action_space,
policy.logit_dim,
config["model"],
name=TARGET_POLICY_SCOPE,
framework="tf")
return policy.model
def build_appo_surrogate_loss(policy, batch_tensors):
if isinstance(policy.action_space, gym.spaces.Discrete):
is_multidiscrete = False
output_hidden_shape = [policy.action_space.n]
elif isinstance(policy.action_space,
gym.spaces.multi_discrete.MultiDiscrete):
is_multidiscrete = True
output_hidden_shape = policy.action_space.nvec.astype(np.int32)
else:
is_multidiscrete = False
output_hidden_shape = 1
def make_time_major(*args, **kw):
return _make_time_major(policy, *args, **kw)
actions = batch_tensors[SampleBatch.ACTIONS]
dones = batch_tensors[SampleBatch.DONES]
rewards = batch_tensors[SampleBatch.REWARDS]
behaviour_logits = batch_tensors[BEHAVIOUR_LOGITS]
policy.target_model_out, _ = policy.target_model(
policy.input_dict, policy.state_in, policy.seq_lens)
old_policy_behaviour_logits = tf.stop_gradient(policy.target_model_out)
unpacked_behaviour_logits = tf.split(
behaviour_logits, output_hidden_shape, axis=1)
unpacked_old_policy_behaviour_logits = tf.split(
old_policy_behaviour_logits, output_hidden_shape, axis=1)
unpacked_outputs = tf.split(policy.model_out, output_hidden_shape, axis=1)
action_dist = policy.action_dist
old_policy_action_dist = policy.dist_class(old_policy_behaviour_logits)
prev_action_dist = policy.dist_class(behaviour_logits)
values = policy.value_function
policy.model_vars = policy.model.variables()
policy.target_model_vars = policy.target_model.variables()
if policy.state_in:
max_seq_len = tf.reduce_max(policy.seq_lens) - 1
mask = tf.sequence_mask(policy.seq_lens, max_seq_len)
mask = tf.reshape(mask, [-1])
else:
mask = tf.ones_like(rewards)
if policy.config["vtrace"]:
logger.info("Using V-Trace surrogate loss (vtrace=True)")
# Prepare actions for loss
loss_actions = actions if is_multidiscrete else tf.expand_dims(
actions, axis=1)
# Prepare KL for Loss
mean_kl = make_time_major(
old_policy_action_dist.multi_kl(action_dist), drop_last=True)
policy.loss = VTraceSurrogateLoss(
actions=make_time_major(loss_actions, drop_last=True),
prev_actions_logp=make_time_major(
prev_action_dist.logp(actions), drop_last=True),
actions_logp=make_time_major(
action_dist.logp(actions), drop_last=True),
old_policy_actions_logp=make_time_major(
old_policy_action_dist.logp(actions), drop_last=True),
action_kl=tf.reduce_mean(mean_kl, axis=0)
if is_multidiscrete else mean_kl,
actions_entropy=make_time_major(
action_dist.multi_entropy(), drop_last=True),
dones=make_time_major(dones, drop_last=True),
behaviour_logits=make_time_major(
unpacked_behaviour_logits, drop_last=True),
old_policy_behaviour_logits=make_time_major(
unpacked_old_policy_behaviour_logits, drop_last=True),
target_logits=make_time_major(unpacked_outputs, drop_last=True),
discount=policy.config["gamma"],
rewards=make_time_major(rewards, drop_last=True),
values=make_time_major(values, drop_last=True),
bootstrap_value=make_time_major(values)[-1],
dist_class=Categorical if is_multidiscrete else policy.dist_class,
valid_mask=make_time_major(mask, drop_last=True),
vf_loss_coeff=policy.config["vf_loss_coeff"],
entropy_coeff=policy.config["entropy_coeff"],
clip_rho_threshold=policy.config["vtrace_clip_rho_threshold"],
clip_pg_rho_threshold=policy.config[
"vtrace_clip_pg_rho_threshold"],
clip_param=policy.config["clip_param"],
cur_kl_coeff=policy.kl_coeff,
use_kl_loss=policy.config["use_kl_loss"])
else:
logger.info("Using PPO surrogate loss (vtrace=False)")
# Prepare KL for Loss
mean_kl = make_time_major(prev_action_dist.multi_kl(action_dist))
policy.loss = PPOSurrogateLoss(
prev_actions_logp=make_time_major(prev_action_dist.logp(actions)),
actions_logp=make_time_major(action_dist.logp(actions)),
action_kl=tf.reduce_mean(mean_kl, axis=0)
if is_multidiscrete else mean_kl,
actions_entropy=make_time_major(action_dist.multi_entropy()),
values=make_time_major(values),
valid_mask=make_time_major(mask),
advantages=make_time_major(
batch_tensors[Postprocessing.ADVANTAGES]),
value_targets=make_time_major(
batch_tensors[Postprocessing.VALUE_TARGETS]),
vf_loss_coeff=policy.config["vf_loss_coeff"],
entropy_coeff=policy.config["entropy_coeff"],
clip_param=policy.config["clip_param"],
cur_kl_coeff=policy.kl_coeff,
use_kl_loss=policy.config["use_kl_loss"])
return policy.loss.total_loss
def stats(policy, batch_tensors):
values_batched = _make_time_major(
policy, policy.value_function, drop_last=policy.config["vtrace"])
stats_dict = {
"cur_lr": tf.cast(policy.cur_lr, tf.float64),
"policy_loss": policy.loss.pi_loss,
"entropy": policy.loss.entropy,
"var_gnorm": tf.global_norm(policy.var_list),
"vf_loss": policy.loss.vf_loss,
"vf_explained_var": explained_variance(
tf.reshape(policy.loss.value_targets, [-1]),
tf.reshape(values_batched, [-1])),
}
if policy.config["vtrace"]:
is_stat_mean, is_stat_var = tf.nn.moments(policy.loss.is_ratio, [0, 1])
stats_dict.update({"mean_IS": is_stat_mean})
stats_dict.update({"var_IS": is_stat_var})
if policy.config["use_kl_loss"]:
stats_dict.update({"kl": policy.loss.mean_kl})
stats_dict.update({"KL_Coeff": policy.kl_coeff})
return stats_dict
def postprocess_trajectory(policy,
sample_batch,
other_agent_batches=None,
episode=None):
if not policy.config["vtrace"]:
completed = sample_batch["dones"][-1]
if completed:
last_r = 0.0
else:
next_state = []
for i in range(len(policy.state_in)):
next_state.append([sample_batch["state_out_{}".format(i)][-1]])
last_r = policy.value(sample_batch["new_obs"][-1], *next_state)
batch = compute_advantages(
sample_batch,
last_r,
policy.config["gamma"],
policy.config["lambda"],
use_gae=policy.config["use_gae"])
else:
batch = sample_batch
del batch.data["new_obs"] # not used, so save some bandwidth
return batch
def add_values_and_logits(policy):
out = {BEHAVIOUR_LOGITS: policy.model_out}
if not policy.config["vtrace"]:
out[SampleBatch.VF_PREDS] = policy.value_function
return out
class TargetNetworkMixin(object):
def __init__(self, obs_space, action_space, config):
"""Target Network is updated by the master learner every
trainer.update_target_frequency steps. All worker batches
are importance sampled w.r. to the target network to ensure
a more stable pi_old in PPO.
"""
assign_ops = []
assert len(self.model_vars) == len(self.target_model_vars)
for var, var_target in zip(self.model_vars, self.target_model_vars):
assign_ops.append(var_target.assign(var))
self.update_target_network = tf.group(*assign_ops)
def update_target(self):
return self.get_session().run(self.update_target_network)
def setup_mixins(policy, obs_space, action_space, config):
LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])
KLCoeffMixin.__init__(policy, config)
ValueNetworkMixin.__init__(policy)
def setup_late_mixins(policy, obs_space, action_space, config):
TargetNetworkMixin.__init__(policy, obs_space, action_space, config)
AsyncPPOTFPolicy = build_tf_policy(
name="AsyncPPOTFPolicy",
make_model=build_appo_model,
loss_fn=build_appo_surrogate_loss,
stats_fn=stats,
postprocess_fn=postprocess_trajectory,
optimizer_fn=choose_optimizer,
gradients_fn=clip_gradients,
extra_action_fetches_fn=add_values_and_logits,
before_init=validate_config,
before_loss_init=setup_mixins,
after_init=setup_late_mixins,
mixins=[
LearningRateSchedule, KLCoeffMixin, TargetNetworkMixin,
ValueNetworkMixin
],
get_batch_divisibility_req=lambda p: p.config["sample_batch_size"])
-154
View File
@@ -1,154 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import logging
from ray.rllib.agents import with_common_config
from ray.rllib.agents.ppo.ppo_policy import PPOTFPolicy
from ray.rllib.agents.trainer_template import build_trainer
from ray.rllib.optimizers import SyncSamplesOptimizer, LocalMultiGPUOptimizer
logger = logging.getLogger(__name__)
# yapf: disable
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_common_config({
# If true, use the Generalized Advantage Estimator (GAE)
# with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
"use_gae": True,
# GAE(lambda) parameter
"lambda": 1.0,
# Initial coefficient for KL divergence
"kl_coeff": 0.2,
# Size of batches collected from each worker
"sample_batch_size": 200,
# Number of timesteps collected for each SGD round
"train_batch_size": 4000,
# Total SGD batch size across all devices for SGD
"sgd_minibatch_size": 128,
# Whether to shuffle sequences in the batch when training (recommended)
"shuffle_sequences": True,
# Number of SGD iterations in each outer loop
"num_sgd_iter": 30,
# Stepsize of SGD
"lr": 5e-5,
# Learning rate schedule
"lr_schedule": None,
# Share layers for value function. If you set this to True, it's important
# to tune vf_loss_coeff.
"vf_share_layers": False,
# Coefficient of the value function loss. It's important to tune this if
# you set vf_share_layers: True
"vf_loss_coeff": 1.0,
# Coefficient of the entropy regularizer
"entropy_coeff": 0.0,
# Decay schedule for the entropy regularizer
"entropy_coeff_schedule": None,
# PPO clip parameter
"clip_param": 0.3,
# Clip param for the value function. Note that this is sensitive to the
# scale of the rewards. If your expected V is large, increase this.
"vf_clip_param": 10.0,
# If specified, clip the global norm of gradients by this amount
"grad_clip": None,
# Target value for KL divergence
"kl_target": 0.01,
# Whether to rollout "complete_episodes" or "truncate_episodes"
"batch_mode": "truncate_episodes",
# Which observation filter to apply to the observation
"observation_filter": "NoFilter",
# Uses the sync samples optimizer instead of the multi-gpu one. This does
# not support minibatches.
"simple_optimizer": False,
})
# __sphinx_doc_end__
# yapf: enable
def choose_policy_optimizer(workers, config):
if config["simple_optimizer"]:
return SyncSamplesOptimizer(
workers,
num_sgd_iter=config["num_sgd_iter"],
train_batch_size=config["train_batch_size"])
return LocalMultiGPUOptimizer(
workers,
sgd_batch_size=config["sgd_minibatch_size"],
num_sgd_iter=config["num_sgd_iter"],
num_gpus=config["num_gpus"],
sample_batch_size=config["sample_batch_size"],
num_envs_per_worker=config["num_envs_per_worker"],
train_batch_size=config["train_batch_size"],
standardize_fields=["advantages"],
shuffle_sequences=config["shuffle_sequences"])
def update_kl(trainer, fetches):
if "kl" in fetches:
# single-agent
trainer.workers.local_worker().for_policy(
lambda pi: pi.update_kl(fetches["kl"]))
else:
def update(pi, pi_id):
if pi_id in fetches:
pi.update_kl(fetches[pi_id]["kl"])
else:
logger.debug("No data for {}, not updating kl".format(pi_id))
# multi-agent
trainer.workers.local_worker().foreach_trainable_policy(update)
def warn_about_bad_reward_scales(trainer, result):
# Warn about bad clipping configs
if trainer.config["vf_clip_param"] <= 0:
rew_scale = float("inf")
elif result["policy_reward_mean"]:
rew_scale = 0 # punt on handling multiagent case
else:
rew_scale = round(
abs(result["episode_reward_mean"]) /
trainer.config["vf_clip_param"], 0)
if rew_scale > 200:
logger.warning(
"The magnitude of your environment rewards are more than "
"{}x the scale of `vf_clip_param`. ".format(rew_scale) +
"This means that it will take more than "
"{} iterations for your value ".format(rew_scale) +
"function to converge. If this is not intended, consider "
"increasing `vf_clip_param`.")
def validate_config(config):
if config["entropy_coeff"] < 0:
raise DeprecationWarning("entropy_coeff must be >= 0")
if config["sgd_minibatch_size"] > config["train_batch_size"]:
raise ValueError(
"Minibatch size {} must be <= train batch size {}.".format(
config["sgd_minibatch_size"], config["train_batch_size"]))
if config["batch_mode"] == "truncate_episodes" and not config["use_gae"]:
raise ValueError(
"Episode truncation is not supported without a value "
"function. Consider setting batch_mode=complete_episodes.")
if config["multiagent"]["policies"] and not config["simple_optimizer"]:
logger.info(
"In multi-agent mode, policies will be optimized sequentially "
"by the multi-GPU optimizer. Consider setting "
"simple_optimizer=True if this doesn't work for you.")
if config["simple_optimizer"]:
logger.warning(
"Using the simple non-minibatch optimizer. This will greatly "
"reduce performance, consider simple_optimizer=False.")
PPOTrainer = build_trainer(
name="PPO",
default_config=DEFAULT_CONFIG,
default_policy=PPOTFPolicy,
make_policy_optimizer=choose_policy_optimizer,
validate_config=validate_config,
after_optimizer_step=update_kl,
after_train_result=warn_about_bad_reward_scales)
-270
View File
@@ -1,270 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import logging
import ray
from ray.rllib.evaluation.postprocessing import compute_advantages, \
Postprocessing
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.policy.tf_policy import LearningRateSchedule, \
EntropyCoeffSchedule
from ray.rllib.policy.tf_policy_template import build_tf_policy
from ray.rllib.models.catalog import ModelCatalog
from ray.rllib.utils.explained_variance import explained_variance
from ray.rllib.utils import try_import_tf
tf = try_import_tf()
logger = logging.getLogger(__name__)
# Frozen logits of the policy that computed the action
BEHAVIOUR_LOGITS = "behaviour_logits"
class PPOLoss(object):
def __init__(self,
action_space,
value_targets,
advantages,
actions,
logits,
vf_preds,
curr_action_dist,
value_fn,
cur_kl_coeff,
valid_mask,
entropy_coeff=0,
clip_param=0.1,
vf_clip_param=0.1,
vf_loss_coeff=1.0,
use_gae=True):
"""Constructs the loss for Proximal Policy Objective.
Arguments:
action_space: Environment observation space specification.
value_targets (Placeholder): Placeholder for target values; used
for GAE.
actions (Placeholder): Placeholder for actions taken
from previous model evaluation.
advantages (Placeholder): Placeholder for calculated advantages
from previous model evaluation.
logits (Placeholder): Placeholder for logits output from
previous model evaluation.
vf_preds (Placeholder): Placeholder for value function output
from previous model evaluation.
curr_action_dist (ActionDistribution): ActionDistribution
of the current model.
value_fn (Tensor): Current value function output Tensor.
cur_kl_coeff (Variable): Variable holding the current PPO KL
coefficient.
valid_mask (Tensor): A bool mask of valid input elements (#2992).
entropy_coeff (float): Coefficient of the entropy regularizer.
clip_param (float): Clip parameter
vf_clip_param (float): Clip parameter for the value function
vf_loss_coeff (float): Coefficient of the value function loss
use_gae (bool): If true, use the Generalized Advantage Estimator.
"""
def reduce_mean_valid(t):
return tf.reduce_mean(tf.boolean_mask(t, valid_mask))
dist_cls, _ = ModelCatalog.get_action_dist(action_space, {})
prev_dist = dist_cls(logits)
# Make loss functions.
logp_ratio = tf.exp(
curr_action_dist.logp(actions) - prev_dist.logp(actions))
action_kl = prev_dist.kl(curr_action_dist)
self.mean_kl = reduce_mean_valid(action_kl)
curr_entropy = curr_action_dist.entropy()
self.mean_entropy = reduce_mean_valid(curr_entropy)
surrogate_loss = tf.minimum(
advantages * logp_ratio,
advantages * tf.clip_by_value(logp_ratio, 1 - clip_param,
1 + clip_param))
self.mean_policy_loss = reduce_mean_valid(-surrogate_loss)
if use_gae:
vf_loss1 = tf.square(value_fn - value_targets)
vf_clipped = vf_preds + tf.clip_by_value(
value_fn - vf_preds, -vf_clip_param, vf_clip_param)
vf_loss2 = tf.square(vf_clipped - value_targets)
vf_loss = tf.maximum(vf_loss1, vf_loss2)
self.mean_vf_loss = reduce_mean_valid(vf_loss)
loss = reduce_mean_valid(
-surrogate_loss + cur_kl_coeff * action_kl +
vf_loss_coeff * vf_loss - entropy_coeff * curr_entropy)
else:
self.mean_vf_loss = tf.constant(0.0)
loss = reduce_mean_valid(-surrogate_loss +
cur_kl_coeff * action_kl -
entropy_coeff * curr_entropy)
self.loss = loss
def ppo_surrogate_loss(policy, batch_tensors):
if policy.state_in:
max_seq_len = tf.reduce_max(policy.seq_lens)
mask = tf.sequence_mask(policy.seq_lens, max_seq_len)
mask = tf.reshape(mask, [-1])
else:
mask = tf.ones_like(
batch_tensors[Postprocessing.ADVANTAGES], dtype=tf.bool)
policy.loss_obj = PPOLoss(
policy.action_space,
batch_tensors[Postprocessing.VALUE_TARGETS],
batch_tensors[Postprocessing.ADVANTAGES],
batch_tensors[SampleBatch.ACTIONS],
batch_tensors[BEHAVIOUR_LOGITS],
batch_tensors[SampleBatch.VF_PREDS],
policy.action_dist,
policy.value_function,
policy.kl_coeff,
mask,
entropy_coeff=policy.entropy_coeff,
clip_param=policy.config["clip_param"],
vf_clip_param=policy.config["vf_clip_param"],
vf_loss_coeff=policy.config["vf_loss_coeff"],
use_gae=policy.config["use_gae"])
return policy.loss_obj.loss
def kl_and_loss_stats(policy, batch_tensors):
return {
"cur_kl_coeff": tf.cast(policy.kl_coeff, tf.float64),
"cur_lr": tf.cast(policy.cur_lr, tf.float64),
"total_loss": policy.loss_obj.loss,
"policy_loss": policy.loss_obj.mean_policy_loss,
"vf_loss": policy.loss_obj.mean_vf_loss,
"vf_explained_var": explained_variance(
batch_tensors[Postprocessing.VALUE_TARGETS],
policy.value_function),
"kl": policy.loss_obj.mean_kl,
"entropy": policy.loss_obj.mean_entropy,
"entropy_coeff": tf.cast(policy.entropy_coeff, tf.float64),
}
def vf_preds_and_logits_fetches(policy):
"""Adds value function and logits outputs to experience batches."""
return {
SampleBatch.VF_PREDS: policy.value_function,
BEHAVIOUR_LOGITS: policy.model_out,
}
def postprocess_ppo_gae(policy,
sample_batch,
other_agent_batches=None,
episode=None):
"""Adds the policy logits, VF preds, and advantages to the trajectory."""
completed = sample_batch["dones"][-1]
if completed:
last_r = 0.0
else:
next_state = []
for i in range(len(policy.state_in)):
next_state.append([sample_batch["state_out_{}".format(i)][-1]])
last_r = policy._value(sample_batch[SampleBatch.NEXT_OBS][-1],
sample_batch[SampleBatch.ACTIONS][-1],
sample_batch[SampleBatch.REWARDS][-1],
*next_state)
batch = compute_advantages(
sample_batch,
last_r,
policy.config["gamma"],
policy.config["lambda"],
use_gae=policy.config["use_gae"])
return batch
def clip_gradients(policy, optimizer, loss):
if policy.config["grad_clip"] is not None:
policy.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
tf.get_variable_scope().name)
grads = tf.gradients(loss, policy.var_list)
policy.grads, _ = tf.clip_by_global_norm(grads,
policy.config["grad_clip"])
clipped_grads = list(zip(policy.grads, policy.var_list))
return clipped_grads
else:
return optimizer.compute_gradients(
loss, colocate_gradients_with_ops=True)
class KLCoeffMixin(object):
def __init__(self, config):
# KL Coefficient
self.kl_coeff_val = config["kl_coeff"]
self.kl_target = config["kl_target"]
self.kl_coeff = tf.get_variable(
initializer=tf.constant_initializer(self.kl_coeff_val),
name="kl_coeff",
shape=(),
trainable=False,
dtype=tf.float32)
def update_kl(self, sampled_kl):
if sampled_kl > 2.0 * self.kl_target:
self.kl_coeff_val *= 1.5
elif sampled_kl < 0.5 * self.kl_target:
self.kl_coeff_val *= 0.5
self.kl_coeff.load(self.kl_coeff_val, session=self.get_session())
return self.kl_coeff_val
class ValueNetworkMixin(object):
def __init__(self, obs_space, action_space, config):
if config["use_gae"]:
self.value_function = self.model.value_function()
else:
self.value_function = tf.zeros(
shape=tf.shape(self.get_placeholder(SampleBatch.CUR_OBS))[:1])
def _value(self, ob, prev_action, prev_reward, *args):
feed_dict = {
self.get_placeholder(SampleBatch.CUR_OBS): [ob],
self.get_placeholder(SampleBatch.PREV_ACTIONS): [prev_action],
self.get_placeholder(SampleBatch.PREV_REWARDS): [prev_reward],
self.seq_lens: [1]
}
assert len(args) == len(self.state_in), (args, self.state_in)
for k, v in zip(self.state_in, args):
feed_dict[k] = v
vf = self.get_session().run(self.value_function, feed_dict)
return vf[0]
def setup_config(policy, obs_space, action_space, config):
# auto set the model option for layer sharing
config["model"]["vf_share_layers"] = config["vf_share_layers"]
def setup_mixins(policy, obs_space, action_space, config):
ValueNetworkMixin.__init__(policy, obs_space, action_space, config)
KLCoeffMixin.__init__(policy, config)
EntropyCoeffSchedule.__init__(policy, config["entropy_coeff"],
config["entropy_coeff_schedule"])
LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])
PPOTFPolicy = build_tf_policy(
name="PPOTFPolicy",
get_default_config=lambda: ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG,
loss_fn=ppo_surrogate_loss,
stats_fn=kl_and_loss_stats,
extra_action_fetches_fn=vf_preds_and_logits_fetches,
postprocess_fn=postprocess_ppo_gae,
gradients_fn=clip_gradients,
before_init=setup_config,
before_loss_init=setup_mixins,
mixins=[
LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin,
ValueNetworkMixin
])
-64
View File
@@ -1,64 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import numpy as np
from numpy.testing import assert_allclose
from ray.rllib.models.tf.tf_action_dist import Categorical
from ray.rllib.agents.ppo.utils import flatten, concatenate
from ray.rllib.utils import try_import_tf
tf = try_import_tf()
# TODO(ekl): move to rllib/models dir
class DistributionsTest(unittest.TestCase):
def testCategorical(self):
num_samples = 100000
logits = tf.placeholder(tf.float32, shape=(None, 10))
z = 8 * (np.random.rand(10) - 0.5)
data = np.tile(z, (num_samples, 1))
c = Categorical(logits)
sample_op = c.sample()
sess = tf.Session()
sess.run(tf.global_variables_initializer())
samples = sess.run(sample_op, feed_dict={logits: data})
counts = np.zeros(10)
for sample in samples:
counts[sample] += 1.0
probs = np.exp(z) / np.sum(np.exp(z))
self.assertTrue(np.sum(np.abs(probs - counts / num_samples)) <= 0.01)
class UtilsTest(unittest.TestCase):
def testFlatten(self):
d = {
"s": np.array([[[1, -1], [2, -2]], [[3, -3], [4, -4]]]),
"a": np.array([[[5], [-5]], [[6], [-6]]])
}
flat = flatten(d.copy(), start=0, stop=2)
assert_allclose(d["s"][0][0][:], flat["s"][0][:])
assert_allclose(d["s"][0][1][:], flat["s"][1][:])
assert_allclose(d["s"][1][0][:], flat["s"][2][:])
assert_allclose(d["s"][1][1][:], flat["s"][3][:])
assert_allclose(d["a"][0][0], flat["a"][0])
assert_allclose(d["a"][0][1], flat["a"][1])
assert_allclose(d["a"][1][0], flat["a"][2])
assert_allclose(d["a"][1][1], flat["a"][3])
def testConcatenate(self):
d1 = {"s": np.array([0, 1]), "a": np.array([2, 3])}
d2 = {"s": np.array([4, 5]), "a": np.array([6, 7])}
d = concatenate([d1, d2])
assert_allclose(d["s"], np.array([0, 1, 4, 5]))
assert_allclose(d["a"], np.array([2, 3, 6, 7]))
D = concatenate([d])
assert_allclose(D["s"], np.array([0, 1, 4, 5]))
assert_allclose(D["a"], np.array([2, 3, 6, 7]))
if __name__ == "__main__":
unittest.main(verbosity=2)
-36
View File
@@ -1,36 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
def flatten(weights, start=0, stop=2):
"""This methods reshapes all values in a dictionary.
The indices from start to stop will be flattened into a single index.
Args:
weights: A dictionary mapping keys to numpy arrays.
start: The starting index.
stop: The ending index.
"""
for key, val in weights.items():
new_shape = val.shape[0:start] + (-1, ) + val.shape[stop:]
weights[key] = val.reshape(new_shape)
return weights
def concatenate(weights_list):
keys = weights_list[0].keys()
result = {}
for key in keys:
result[key] = np.concatenate([l[key] for l in weights_list])
return result
def shuffle(trajectory):
permutation = np.random.permutation(trajectory["actions"].shape[0])
for key, val in trajectory.items():
trajectory[key] = val[permutation]
return trajectory
-1
View File
@@ -1 +0,0 @@
Code in this package is adapted from https://github.com/oxwhirl/pymarl.
-8
View File
@@ -1,8 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ray.rllib.agents.qmix.qmix import QMixTrainer, DEFAULT_CONFIG
from ray.rllib.agents.qmix.apex import ApexQMixTrainer
__all__ = ["QMixTrainer", "ApexQMixTrainer", "DEFAULT_CONFIG"]
-39
View File
@@ -1,39 +0,0 @@
"""Experimental: scalable Ape-X variant of QMIX"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ray.rllib.agents.dqn.apex import APEX_TRAINER_PROPERTIES
from ray.rllib.agents.qmix.qmix import QMixTrainer, \
DEFAULT_CONFIG as QMIX_CONFIG
from ray.rllib.utils import merge_dicts
APEX_QMIX_DEFAULT_CONFIG = merge_dicts(
QMIX_CONFIG, # see also the options in qmix.py, which are also supported
{
"optimizer": merge_dicts(
QMIX_CONFIG["optimizer"],
{
"max_weight_sync_delay": 400,
"num_replay_buffer_shards": 4,
"batch_replay": True, # required for RNN. Disables prio.
"debug": False
}),
"num_gpus": 0,
"num_workers": 32,
"buffer_size": 2000000,
"learning_starts": 50000,
"train_batch_size": 512,
"sample_batch_size": 50,
"target_network_update_freq": 500000,
"timesteps_per_iteration": 25000,
"per_worker_exploration": True,
"min_iter_time_s": 30,
},
)
ApexQMixTrainer = QMixTrainer.with_updates(
name="APEX_QMIX",
default_config=APEX_QMIX_DEFAULT_CONFIG,
**APEX_TRAINER_PROPERTIES)
-64
View File
@@ -1,64 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import torch as th
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
class VDNMixer(nn.Module):
def __init__(self):
super(VDNMixer, self).__init__()
def forward(self, agent_qs, batch):
return th.sum(agent_qs, dim=2, keepdim=True)
class QMixer(nn.Module):
def __init__(self, n_agents, state_shape, mixing_embed_dim):
super(QMixer, self).__init__()
self.n_agents = n_agents
self.embed_dim = mixing_embed_dim
self.state_dim = int(np.prod(state_shape))
self.hyper_w_1 = nn.Linear(self.state_dim,
self.embed_dim * self.n_agents)
self.hyper_w_final = nn.Linear(self.state_dim, self.embed_dim)
# State dependent bias for hidden layer
self.hyper_b_1 = nn.Linear(self.state_dim, self.embed_dim)
# V(s) instead of a bias for the last layers
self.V = nn.Sequential(
nn.Linear(self.state_dim, self.embed_dim), nn.ReLU(),
nn.Linear(self.embed_dim, 1))
def forward(self, agent_qs, states):
"""Forward pass for the mixer.
Arguments:
agent_qs: Tensor of shape [B, T, n_agents, n_actions]
states: Tensor of shape [B, T, state_dim]
"""
bs = agent_qs.size(0)
states = states.reshape(-1, self.state_dim)
agent_qs = agent_qs.view(-1, 1, self.n_agents)
# First layer
w1 = th.abs(self.hyper_w_1(states))
b1 = self.hyper_b_1(states)
w1 = w1.view(-1, self.n_agents, self.embed_dim)
b1 = b1.view(-1, 1, self.embed_dim)
hidden = F.elu(th.bmm(agent_qs, w1) + b1)
# Second layer
w_final = th.abs(self.hyper_w_final(states))
w_final = w_final.view(-1, self.embed_dim, 1)
# State-dependent bias
v = self.V(states).view(-1, 1, 1)
# Compute final output
y = th.bmm(hidden, w_final) + v
# Reshape and return
q_tot = y.view(bs, -1, 1)
return q_tot
-42
View File
@@ -1,42 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from torch import nn
import torch.nn.functional as F
from ray.rllib.models.preprocessors import get_preprocessor
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.utils.annotations import override
class RNNModel(TorchModelV2, nn.Module):
"""The default RNN model for QMIX."""
def __init__(self, obs_space, action_space, num_outputs, model_config,
name):
TorchModelV2.__init__(self, obs_space, action_space, num_outputs,
model_config, name)
nn.Module.__init__(self)
self.obs_size = _get_size(obs_space)
self.rnn_hidden_dim = model_config["lstm_cell_size"]
self.fc1 = nn.Linear(self.obs_size, self.rnn_hidden_dim)
self.rnn = nn.GRUCell(self.rnn_hidden_dim, self.rnn_hidden_dim)
self.fc2 = nn.Linear(self.rnn_hidden_dim, num_outputs)
@override(TorchModelV2)
def get_initial_state(self):
# make hidden states on same device as model
return [self.fc1.weight.new(1, self.rnn_hidden_dim).zero_().squeeze(0)]
@override(TorchModelV2)
def forward(self, input_dict, hidden_state, seq_lens):
x = F.relu(self.fc1(input_dict["obs_flat"].float()))
h_in = hidden_state[0].reshape(-1, self.rnn_hidden_dim)
h = self.rnn(x, h_in)
q = self.fc2(h)
return q, [h]
def _get_size(obs_space):
return get_preprocessor(obs_space)(obs_space).size
-104
View File
@@ -1,104 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ray.rllib.agents.trainer import with_common_config
from ray.rllib.agents.dqn.dqn import GenericOffPolicyTrainer
from ray.rllib.agents.qmix.qmix_policy import QMixTorchPolicy
from ray.rllib.optimizers import SyncBatchReplayOptimizer
# yapf: disable
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_common_config({
# === QMix ===
# Mixing network. Either "qmix", "vdn", or None
"mixer": "qmix",
# Size of the mixing network embedding
"mixing_embed_dim": 32,
# Whether to use Double_Q learning
"double_q": True,
# Optimize over complete episodes by default.
"batch_mode": "complete_episodes",
# === Evaluation ===
# Evaluate with epsilon=0 every `evaluation_interval` training iterations.
# The evaluation stats will be reported under the "evaluation" metric key.
# Note that evaluation is currently not parallelized, and that for Ape-X
# metrics are already only reported for the lowest epsilon workers.
"evaluation_interval": None,
# Number of episodes to run per evaluation period.
"evaluation_num_episodes": 10,
# === Exploration ===
# Max num timesteps for annealing schedules. Exploration is annealed from
# 1.0 to exploration_fraction over this number of timesteps scaled by
# exploration_fraction
"schedule_max_timesteps": 100000,
# Number of env steps to optimize for before returning
"timesteps_per_iteration": 1000,
# Fraction of entire training period over which the exploration rate is
# annealed
"exploration_fraction": 0.1,
# Final value of random action probability
"exploration_final_eps": 0.02,
# Update the target network every `target_network_update_freq` steps.
"target_network_update_freq": 500,
# === Replay buffer ===
# Size of the replay buffer in steps.
"buffer_size": 10000,
# === Optimization ===
# Learning rate for adam optimizer
"lr": 0.0005,
# RMSProp alpha
"optim_alpha": 0.99,
# RMSProp epsilon
"optim_eps": 0.00001,
# If not None, clip gradients during optimization at this value
"grad_norm_clipping": 10,
# How many steps of the model to sample before learning starts.
"learning_starts": 1000,
# Update the replay buffer with this many samples at once. Note that
# this setting applies per-worker if num_workers > 1.
"sample_batch_size": 4,
# Size of a batched sampled from replay buffer for training. Note that
# if async_updates is set, then each worker returns gradients for a
# batch of this size.
"train_batch_size": 32,
# === Parallelism ===
# Number of workers for collecting samples with. This only makes sense
# to increase if your environment is particularly slow to sample, or if
# you"re using the Async or Ape-X optimizers.
"num_workers": 0,
# Whether to use a distribution of epsilons across workers for exploration.
"per_worker_exploration": False,
# Whether to compute priorities on workers.
"worker_side_prioritization": False,
# Prevent iterations from going lower than this time span
"min_iter_time_s": 1,
# === Model ===
"model": {
"lstm_cell_size": 64,
"max_seq_len": 999999,
},
})
# __sphinx_doc_end__
# yapf: enable
def make_sync_batch_optimizer(workers, config):
return SyncBatchReplayOptimizer(
workers,
learning_starts=config["learning_starts"],
buffer_size=config["buffer_size"],
train_batch_size=config["train_batch_size"])
QMixTrainer = GenericOffPolicyTrainer.with_updates(
name="QMIX",
default_config=DEFAULT_CONFIG,
default_policy=QMixTorchPolicy,
make_policy_optimizer=make_sync_batch_optimizer)
-450
View File
@@ -1,450 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from gym.spaces import Tuple, Discrete, Dict
import logging
import numpy as np
import torch as th
import torch.nn as nn
from torch.optim import RMSprop
from torch.distributions import Categorical
import ray
from ray.rllib.agents.qmix.mixers import VDNMixer, QMixer
from ray.rllib.agents.qmix.model import RNNModel, _get_size
from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY
from ray.rllib.policy.policy import Policy, TupleActions
from ray.rllib.policy.rnn_sequencing import chop_into_sequences
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.models.catalog import ModelCatalog
from ray.rllib.models.model import _unpack_obs
from ray.rllib.env.constants import GROUP_REWARDS
from ray.rllib.utils.annotations import override
logger = logging.getLogger(__name__)
class QMixLoss(nn.Module):
def __init__(self,
model,
target_model,
mixer,
target_mixer,
n_agents,
n_actions,
double_q=True,
gamma=0.99):
nn.Module.__init__(self)
self.model = model
self.target_model = target_model
self.mixer = mixer
self.target_mixer = target_mixer
self.n_agents = n_agents
self.n_actions = n_actions
self.double_q = double_q
self.gamma = gamma
def forward(self, rewards, actions, terminated, mask, obs, next_obs,
action_mask, next_action_mask):
"""Forward pass of the loss.
Arguments:
rewards: Tensor of shape [B, T, n_agents]
actions: Tensor of shape [B, T, n_agents]
terminated: Tensor of shape [B, T, n_agents]
mask: Tensor of shape [B, T, n_agents]
obs: Tensor of shape [B, T, n_agents, obs_size]
next_obs: Tensor of shape [B, T, n_agents, obs_size]
action_mask: Tensor of shape [B, T, n_agents, n_actions]
next_action_mask: Tensor of shape [B, T, n_agents, n_actions]
"""
B, T = obs.size(0), obs.size(1)
# Calculate estimated Q-Values
mac_out = []
h = [
s.expand([B, self.n_agents, -1])
for s in self.model.get_initial_state()
]
for t in range(T):
q, h = _mac(self.model, obs[:, t], h)
mac_out.append(q)
mac_out = th.stack(mac_out, dim=1) # Concat over time
# Pick the Q-Values for the actions taken -> [B * n_agents, T]
chosen_action_qvals = th.gather(
mac_out, dim=3, index=actions.unsqueeze(3)).squeeze(3)
# Calculate the Q-Values necessary for the target
target_mac_out = []
target_h = [
s.expand([B, self.n_agents, -1])
for s in self.target_model.get_initial_state()
]
for t in range(T):
target_q, target_h = _mac(self.target_model, next_obs[:, t],
target_h)
target_mac_out.append(target_q)
target_mac_out = th.stack(target_mac_out, dim=1) # Concat across time
# Mask out unavailable actions
ignore_action = (next_action_mask == 0) & (mask == 1).unsqueeze(-1)
target_mac_out[ignore_action] = -np.inf
# Max over target Q-Values
if self.double_q:
# Get actions that maximise live Q (for double q-learning)
ignore_action = (action_mask == 0) & (mask == 1).unsqueeze(-1)
mac_out = mac_out.clone() # issue 4742
mac_out[ignore_action] = -np.inf
cur_max_actions = mac_out.max(dim=3, keepdim=True)[1]
target_max_qvals = th.gather(target_mac_out, 3,
cur_max_actions).squeeze(3)
else:
target_max_qvals = target_mac_out.max(dim=3)[0]
assert target_max_qvals.min().item() != -np.inf, \
"target_max_qvals contains a masked action; \
there may be a state with no valid actions."
# Mix
if self.mixer is not None:
# TODO(ekl) add support for handling global state? This is just
# treating the stacked agent obs as the state.
chosen_action_qvals = self.mixer(chosen_action_qvals, obs)
target_max_qvals = self.target_mixer(target_max_qvals, next_obs)
# Calculate 1-step Q-Learning targets
targets = rewards + self.gamma * (1 - terminated) * target_max_qvals
# Td-error
td_error = (chosen_action_qvals - targets.detach())
mask = mask.expand_as(td_error)
# 0-out the targets that came from padded data
masked_td_error = td_error * mask
# Normal L2 loss, take mean over actual data
loss = (masked_td_error**2).sum() / mask.sum()
return loss, mask, masked_td_error, chosen_action_qvals, targets
class QMixTorchPolicy(Policy):
"""QMix impl. Assumes homogeneous agents for now.
You must use MultiAgentEnv.with_agent_groups() to group agents
together for QMix. This creates the proper Tuple obs/action spaces and
populates the '_group_rewards' info field.
Action masking: to specify an action mask for individual agents, use a
dict space with an action_mask key, e.g. {"obs": ob, "action_mask": mask}.
The mask space must be `Box(0, 1, (n_actions,))`.
"""
def __init__(self, obs_space, action_space, config):
_validate(obs_space, action_space)
config = dict(ray.rllib.agents.qmix.qmix.DEFAULT_CONFIG, **config)
self.config = config
self.observation_space = obs_space
self.action_space = action_space
self.n_agents = len(obs_space.original_space.spaces)
self.n_actions = action_space.spaces[0].n
self.h_size = config["model"]["lstm_cell_size"]
agent_obs_space = obs_space.original_space.spaces[0]
if isinstance(agent_obs_space, Dict):
space_keys = set(agent_obs_space.spaces.keys())
if space_keys != {"obs", "action_mask"}:
raise ValueError(
"Dict obs space for agent must have keyset "
"['obs', 'action_mask'], got {}".format(space_keys))
mask_shape = tuple(agent_obs_space.spaces["action_mask"].shape)
if mask_shape != (self.n_actions, ):
raise ValueError("Action mask shape must be {}, got {}".format(
(self.n_actions, ), mask_shape))
self.has_action_mask = True
self.obs_size = _get_size(agent_obs_space.spaces["obs"])
# The real agent obs space is nested inside the dict
agent_obs_space = agent_obs_space.spaces["obs"]
else:
self.has_action_mask = False
self.obs_size = _get_size(agent_obs_space)
self.model = ModelCatalog.get_model_v2(
agent_obs_space,
action_space.spaces[0],
self.n_actions,
config["model"],
framework="torch",
name="model",
default_model=RNNModel)
self.target_model = ModelCatalog.get_model_v2(
agent_obs_space,
action_space.spaces[0],
self.n_actions,
config["model"],
framework="torch",
name="target_model",
default_model=RNNModel)
# Setup the mixer network.
# The global state is just the stacked agent observations for now.
self.state_shape = [self.obs_size, self.n_agents]
if config["mixer"] is None:
self.mixer = None
self.target_mixer = None
elif config["mixer"] == "qmix":
self.mixer = QMixer(self.n_agents, self.state_shape,
config["mixing_embed_dim"])
self.target_mixer = QMixer(self.n_agents, self.state_shape,
config["mixing_embed_dim"])
elif config["mixer"] == "vdn":
self.mixer = VDNMixer()
self.target_mixer = VDNMixer()
else:
raise ValueError("Unknown mixer type {}".format(config["mixer"]))
self.cur_epsilon = 1.0
self.update_target() # initial sync
# Setup optimizer
self.params = list(self.model.parameters())
if self.mixer:
self.params += list(self.mixer.parameters())
self.loss = QMixLoss(self.model, self.target_model, self.mixer,
self.target_mixer, self.n_agents, self.n_actions,
self.config["double_q"], self.config["gamma"])
self.optimiser = RMSprop(
params=self.params,
lr=config["lr"],
alpha=config["optim_alpha"],
eps=config["optim_eps"])
@override(Policy)
def compute_actions(self,
obs_batch,
state_batches=None,
prev_action_batch=None,
prev_reward_batch=None,
info_batch=None,
episodes=None,
**kwargs):
obs_batch, action_mask = self._unpack_observation(obs_batch)
# Compute actions
with th.no_grad():
q_values, hiddens = _mac(
self.model, th.from_numpy(obs_batch),
[th.from_numpy(np.array(s)) for s in state_batches])
avail = th.from_numpy(action_mask).float()
masked_q_values = q_values.clone()
masked_q_values[avail == 0.0] = -float("inf")
# epsilon-greedy action selector
random_numbers = th.rand_like(q_values[:, :, 0])
pick_random = (random_numbers < self.cur_epsilon).long()
random_actions = Categorical(avail).sample().long()
actions = (pick_random * random_actions +
(1 - pick_random) * masked_q_values.max(dim=2)[1])
actions = actions.numpy()
hiddens = [s.numpy() for s in hiddens]
return TupleActions(list(actions.transpose([1, 0]))), hiddens, {}
@override(Policy)
def learn_on_batch(self, samples):
obs_batch, action_mask = self._unpack_observation(
samples[SampleBatch.CUR_OBS])
next_obs_batch, next_action_mask = self._unpack_observation(
samples[SampleBatch.NEXT_OBS])
group_rewards = self._get_group_rewards(samples[SampleBatch.INFOS])
# These will be padded to shape [B * T, ...]
[rew, action_mask, next_action_mask, act, dones, obs, next_obs], \
initial_states, seq_lens = \
chop_into_sequences(
samples[SampleBatch.EPS_ID],
samples[SampleBatch.UNROLL_ID],
samples[SampleBatch.AGENT_INDEX], [
group_rewards, action_mask, next_action_mask,
samples[SampleBatch.ACTIONS], samples[SampleBatch.DONES],
obs_batch, next_obs_batch
],
[samples["state_in_{}".format(k)]
for k in range(len(self.get_initial_state()))],
max_seq_len=self.config["model"]["max_seq_len"],
dynamic_max=True)
B, T = len(seq_lens), max(seq_lens)
def to_batches(arr):
new_shape = [B, T] + list(arr.shape[1:])
return th.from_numpy(np.reshape(arr, new_shape))
rewards = to_batches(rew).float()
actions = to_batches(act).long()
obs = to_batches(obs).reshape([B, T, self.n_agents,
self.obs_size]).float()
action_mask = to_batches(action_mask)
next_obs = to_batches(next_obs).reshape(
[B, T, self.n_agents, self.obs_size]).float()
next_action_mask = to_batches(next_action_mask)
# TODO(ekl) this treats group termination as individual termination
terminated = to_batches(dones.astype(np.float32)).unsqueeze(2).expand(
B, T, self.n_agents)
# Create mask for where index is < unpadded sequence length
filled = (np.reshape(np.tile(np.arange(T), B), [B, T]) <
np.expand_dims(seq_lens, 1)).astype(np.float32)
mask = th.from_numpy(filled).unsqueeze(2).expand(B, T, self.n_agents)
# Compute loss
loss_out, mask, masked_td_error, chosen_action_qvals, targets = \
self.loss(rewards, actions, terminated, mask, obs,
next_obs, action_mask, next_action_mask)
# Optimise
self.optimiser.zero_grad()
loss_out.backward()
grad_norm = th.nn.utils.clip_grad_norm_(
self.params, self.config["grad_norm_clipping"])
self.optimiser.step()
mask_elems = mask.sum().item()
stats = {
"loss": loss_out.item(),
"grad_norm": grad_norm
if isinstance(grad_norm, float) else grad_norm.item(),
"td_error_abs": masked_td_error.abs().sum().item() / mask_elems,
"q_taken_mean": (chosen_action_qvals * mask).sum().item() /
mask_elems,
"target_mean": (targets * mask).sum().item() / mask_elems,
}
return {LEARNER_STATS_KEY: stats}
@override(Policy)
def get_initial_state(self):
return [
s.expand([self.n_agents, -1]).numpy()
for s in self.model.get_initial_state()
]
@override(Policy)
def get_weights(self):
return {"model": self.model.state_dict()}
@override(Policy)
def set_weights(self, weights):
self.model.load_state_dict(weights["model"])
@override(Policy)
def get_state(self):
return {
"model": self.model.state_dict(),
"target_model": self.target_model.state_dict(),
"mixer": self.mixer.state_dict() if self.mixer else None,
"target_mixer": self.target_mixer.state_dict()
if self.mixer else None,
"cur_epsilon": self.cur_epsilon,
}
@override(Policy)
def set_state(self, state):
self.model.load_state_dict(state["model"])
self.target_model.load_state_dict(state["target_model"])
if state["mixer"] is not None:
self.mixer.load_state_dict(state["mixer"])
self.target_mixer.load_state_dict(state["target_mixer"])
self.set_epsilon(state["cur_epsilon"])
self.update_target()
def update_target(self):
self.target_model.load_state_dict(self.model.state_dict())
if self.mixer is not None:
self.target_mixer.load_state_dict(self.mixer.state_dict())
logger.debug("Updated target networks")
def set_epsilon(self, epsilon):
self.cur_epsilon = epsilon
def _get_group_rewards(self, info_batch):
group_rewards = np.array([
info.get(GROUP_REWARDS, [0.0] * self.n_agents)
for info in info_batch
])
return group_rewards
def _unpack_observation(self, obs_batch):
"""Unpacks the action mask / tuple obs from agent grouping.
Returns:
obs (Tensor): flattened obs tensor of shape [B, n_agents, obs_size]
mask (Tensor): action mask, if any
"""
unpacked = _unpack_obs(
np.array(obs_batch),
self.observation_space.original_space,
tensorlib=np)
if self.has_action_mask:
obs = np.concatenate(
[o["obs"] for o in unpacked],
axis=1).reshape([len(obs_batch), self.n_agents, self.obs_size])
action_mask = np.concatenate(
[o["action_mask"] for o in unpacked], axis=1).reshape(
[len(obs_batch), self.n_agents, self.n_actions])
else:
obs = np.concatenate(
unpacked,
axis=1).reshape([len(obs_batch), self.n_agents, self.obs_size])
action_mask = np.ones(
[len(obs_batch), self.n_agents, self.n_actions])
return obs, action_mask
def _validate(obs_space, action_space):
if not hasattr(obs_space, "original_space") or \
not isinstance(obs_space.original_space, Tuple):
raise ValueError("Obs space must be a Tuple, got {}. Use ".format(
obs_space) + "MultiAgentEnv.with_agent_groups() to group related "
"agents for QMix.")
if not isinstance(action_space, Tuple):
raise ValueError(
"Action space must be a Tuple, got {}. ".format(action_space) +
"Use MultiAgentEnv.with_agent_groups() to group related "
"agents for QMix.")
if not isinstance(action_space.spaces[0], Discrete):
raise ValueError(
"QMix requires a discrete action space, got {}".format(
action_space.spaces[0]))
if len({str(x) for x in obs_space.original_space.spaces}) > 1:
raise ValueError(
"Implementation limitation: observations of grouped agents "
"must be homogeneous, got {}".format(
obs_space.original_space.spaces))
if len({str(x) for x in action_space.spaces}) > 1:
raise ValueError(
"Implementation limitation: action space of grouped agents "
"must be homogeneous, got {}".format(action_space.spaces))
def _mac(model, obs, h):
"""Forward pass of the multi-agent controller.
Arguments:
model: TorchModelV2 class
obs: Tensor of shape [B, n_agents, obs_size]
h: List of tensors of shape [B, n_agents, h_size]
Returns:
q_vals: Tensor of shape [B, n_agents, n_actions]
h: Tensor of shape [B, n_agents, h_size]
"""
B, n_agents = obs.size(0), obs.size(1)
obs_flat = obs.reshape([B * n_agents, -1])
h_flat = [s.reshape([B * n_agents, -1]) for s in h]
q_flat, h_flat = model({"obs": obs_flat}, h_flat, None)
return q_flat.reshape(
[B, n_agents, -1]), [s.reshape([B, n_agents, -1]) for s in h_flat]
-152
View File
@@ -1,152 +0,0 @@
"""Registry of algorithm names for `rllib train --run=<alg_name>`"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import traceback
from ray.rllib.contrib.registry import CONTRIBUTED_ALGORITHMS
def _import_sac():
from ray.rllib.agents import sac
return sac.SACTrainer
def _import_appo():
from ray.rllib.agents import ppo
return ppo.APPOTrainer
def _import_qmix():
from ray.rllib.agents import qmix
return qmix.QMixTrainer
def _import_apex_qmix():
from ray.rllib.agents import qmix
return qmix.ApexQMixTrainer
def _import_ddpg():
from ray.rllib.agents import ddpg
return ddpg.DDPGTrainer
def _import_apex_ddpg():
from ray.rllib.agents import ddpg
return ddpg.ApexDDPGTrainer
def _import_td3():
from ray.rllib.agents import ddpg
return ddpg.TD3Trainer
def _import_ppo():
from ray.rllib.agents import ppo
return ppo.PPOTrainer
def _import_es():
from ray.rllib.agents import es
return es.ESTrainer
def _import_ars():
from ray.rllib.agents import ars
return ars.ARSTrainer
def _import_dqn():
from ray.rllib.agents import dqn
return dqn.DQNTrainer
def _import_simple_q():
from ray.rllib.agents import dqn
return dqn.SimpleQTrainer
def _import_apex():
from ray.rllib.agents import dqn
return dqn.ApexTrainer
def _import_a3c():
from ray.rllib.agents import a3c
return a3c.A3CTrainer
def _import_a2c():
from ray.rllib.agents import a3c
return a3c.A2CTrainer
def _import_pg():
from ray.rllib.agents import pg
return pg.PGTrainer
def _import_impala():
from ray.rllib.agents import impala
return impala.ImpalaTrainer
def _import_marwil():
from ray.rllib.agents import marwil
return marwil.MARWILTrainer
ALGORITHMS = {
"SAC": _import_sac,
"DDPG": _import_ddpg,
"APEX_DDPG": _import_apex_ddpg,
"TD3": _import_td3,
"PPO": _import_ppo,
"ES": _import_es,
"ARS": _import_ars,
"DQN": _import_dqn,
"SimpleQ": _import_simple_q,
"APEX": _import_apex,
"A3C": _import_a3c,
"A2C": _import_a2c,
"PG": _import_pg,
"IMPALA": _import_impala,
"QMIX": _import_qmix,
"APEX_QMIX": _import_apex_qmix,
"APPO": _import_appo,
"MARWIL": _import_marwil,
}
def get_agent_class(alg):
"""Returns the class of a known agent given its name."""
try:
return _get_agent_class(alg)
except ImportError:
from ray.rllib.agents.mock import _agent_import_failed
return _agent_import_failed(traceback.format_exc())
def _get_agent_class(alg):
if alg in ALGORITHMS:
return ALGORITHMS[alg]()
elif alg in CONTRIBUTED_ALGORITHMS:
return CONTRIBUTED_ALGORITHMS[alg]()
elif alg == "script":
from ray.tune import script_runner
return script_runner.ScriptRunner
elif alg == "__fake":
from ray.rllib.agents.mock import _MockTrainer
return _MockTrainer
elif alg == "__sigmoid_fake_data":
from ray.rllib.agents.mock import _SigmoidFakeData
return _SigmoidFakeData
elif alg == "__parameter_tuning":
from ray.rllib.agents.mock import _ParameterTuningTrainer
return _ParameterTuningTrainer
else:
raise Exception(("Unknown algorithm {}.").format(alg))
-1
View File
@@ -1 +0,0 @@
Implementation of Soft Actor-Critic (https://arxiv.org/abs/1812.05905.pdf).
-13
View File
@@ -1,13 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ray.rllib.agents.sac.sac import SACTrainer, DEFAULT_CONFIG
from ray.rllib.utils import renamed_agent
SACAgent = renamed_agent(SACTrainer)
__all__ = [
"SACTrainer",
"DEFAULT_CONFIG",
]
-119
View File
@@ -1,119 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ray.rllib.agents.trainer import with_common_config
from ray.rllib.agents.dqn.dqn import GenericOffPolicyTrainer
from ray.rllib.agents.sac.sac_policy import SACTFPolicy
OPTIMIZER_SHARED_CONFIGS = [
"buffer_size", "prioritized_replay", "prioritized_replay_alpha",
"prioritized_replay_beta", "prioritized_replay_eps", "sample_batch_size",
"train_batch_size", "learning_starts"
]
# yapf: disable
# __sphinx_doc_begin__
DEFAULT_CONFIG = with_common_config({
# === Model ===
"twin_q": True,
"use_state_preprocessor": False,
"policy": "GaussianLatentSpacePolicy",
# RLlib model options for the Q function
"Q_model": {
"hidden_activation": "relu",
"hidden_layer_sizes": (256, 256),
},
# RLlib model options for the policy function
"policy_model": {
"hidden_activation": "relu",
"hidden_layer_sizes": (256, 256),
},
# === Learning ===
# Update the target by \tau * policy + (1-\tau) * target_policy
"tau": 5e-3,
# Target entropy lower bound. This is the inverse of reward scale,
# and will be optimized automatically.
"target_entropy": "auto",
# Disable setting done=True at end of episode.
"no_done_at_end": True,
# N-step target updates
"n_step": 1,
# === Evaluation ===
# The evaluation stats will be reported under the "evaluation" metric key.
"evaluation_interval": 1,
# Number of episodes to run per evaluation period.
"evaluation_num_episodes": 1,
# Extra configuration that disables exploration.
"evaluation_config": {
"exploration_enabled": False,
},
# === Exploration ===
# Number of env steps to optimize for before returning
"timesteps_per_iteration": 1000,
"exploration_enabled": True,
# === Replay buffer ===
# Size of the replay buffer. Note that if async_updates is set, then
# each worker will have a replay buffer of this size.
"buffer_size": int(1e6),
# If True prioritized replay buffer will be used.
# TODO(hartikainen): Make sure this works or remove the option.
"prioritized_replay": False,
"prioritized_replay_alpha": 0.6,
"prioritized_replay_beta": 0.4,
"prioritized_replay_eps": 1e-6,
"beta_annealing_fraction": 0.2,
"final_prioritized_replay_beta": 0.4,
"compress_observations": False,
# === Optimization ===
"optimization": {
"actor_learning_rate": 3e-4,
"critic_learning_rate": 3e-4,
"entropy_learning_rate": 3e-4,
},
# If not None, clip gradients during optimization at this value
"grad_norm_clipping": None,
# How many steps of the model to sample before learning starts.
"learning_starts": 1500,
# Update the replay buffer with this many samples at once. Note that this
# setting applies per-worker if num_workers > 1.
"sample_batch_size": 1,
# Size of a batched sampled from replay buffer for training. Note that
# if async_updates is set, then each worker returns gradients for a
# batch of this size.
"train_batch_size": 256,
# Update the target network every `target_network_update_freq` steps.
"target_network_update_freq": 0,
# === Parallelism ===
# Whether to use a GPU for local optimization.
"num_gpus": 0,
# Number of workers for collecting samples with. This only makes sense
# to increase if your environment is particularly slow to sample, or if
# you"re using the Async or Ape-X optimizers.
"num_workers": 0,
# Whether to allocate GPUs for workers (if > 0).
"num_gpus_per_worker": 0,
# Whether to allocate CPUs for workers (if > 0).
"num_cpus_per_worker": 1,
# Whether to compute priorities on workers.
"worker_side_prioritization": False,
# Prevent iterations from going lower than this time span
"min_iter_time_s": 1,
# TODO(ekl) these are unused; remove them from sac config
"per_worker_exploration": False,
"exploration_fraction": 0.1,
"schedule_max_timesteps": 100000,
"exploration_final_eps": 0.02,
})
# __sphinx_doc_end__
# yapf: enable
SACTrainer = GenericOffPolicyTrainer.with_updates(
name="SAC", default_config=DEFAULT_CONFIG, default_policy=SACTFPolicy)
-232
View File
@@ -1,232 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.utils import try_import_tf, try_import_tfp
tf = try_import_tf()
tfp = try_import_tfp()
SCALE_DIAG_MIN_MAX = (-20, 2)
def SquashBijector():
# lazy def since it depends on tfp
class SquashBijector(tfp.bijectors.Bijector):
def __init__(self, validate_args=False, name="tanh"):
super(SquashBijector, self).__init__(
forward_min_event_ndims=0,
validate_args=validate_args,
name=name)
def _forward(self, x):
return tf.nn.tanh(x)
def _inverse(self, y):
return tf.atanh(y)
def _forward_log_det_jacobian(self, x):
return 2. * (np.log(2.) - x - tf.nn.softplus(-2. * x))
return SquashBijector()
class SACModel(TFModelV2):
"""Extension of standard TFModel for SAC.
Data flow:
obs -> forward() -> model_out
model_out -> get_policy_output() -> pi(s)
model_out, actions -> get_q_values() -> Q(s, a)
model_out, actions -> get_twin_q_values() -> Q_twin(s, a)
Note that this class by itself is not a valid model unless you
implement forward() in a subclass."""
def __init__(self,
obs_space,
action_space,
num_outputs,
model_config,
name,
actor_hidden_activation="relu",
actor_hiddens=(256, 256),
critic_hidden_activation="relu",
critic_hiddens=(256, 256),
twin_q=False):
"""Initialize variables of this model.
Extra model kwargs:
actor_hidden_activation (str): activation for actor network
actor_hiddens (list): hidden layers sizes for actor network
critic_hidden_activation (str): activation for critic network
critic_hiddens (list): hidden layers sizes for critic network
twin_q (bool): build twin Q networks
Note that the core layers for forward() are not defined here, this
only defines the layers for the output heads. Those layers for
forward() should be defined in subclasses of SACModel.
"""
if tfp is None:
raise ImportError("tensorflow-probability package not found")
super(SACModel, self).__init__(obs_space, action_space, num_outputs,
model_config, name)
self.action_dim = np.product(action_space.shape)
self.model_out = tf.keras.layers.Input(
shape=(num_outputs, ), name="model_out")
self.actions = tf.keras.layers.Input(
shape=(self.action_dim, ), name="actions")
shift_and_log_scale_diag = tf.keras.Sequential([
tf.keras.layers.Dense(
units=hidden,
activation=getattr(tf.nn, actor_hidden_activation),
name="action_hidden_{}".format(i))
for i, hidden in enumerate(actor_hiddens)
] + [
tf.keras.layers.Dense(
units=tfp.layers.MultivariateNormalTriL.params_size(
self.action_dim),
activation=None,
name="action_out")
])(self.model_out)
shift, log_scale_diag = tf.keras.layers.Lambda(
lambda shift_and_log_scale_diag: tf.split(
shift_and_log_scale_diag,
num_or_size_splits=2,
axis=-1)
)(shift_and_log_scale_diag)
log_scale_diag = tf.keras.layers.Lambda(
lambda log_sd: tf.clip_by_value(log_sd, *SCALE_DIAG_MIN_MAX))(
log_scale_diag)
shift_and_log_scale_diag = tf.keras.layers.Concatenate(axis=-1)(
[shift, log_scale_diag])
raw_action_distribution = tfp.layers.MultivariateNormalTriL(
self.action_dim)(shift_and_log_scale_diag)
action_distribution = tfp.layers.DistributionLambda(
make_distribution_fn=SquashBijector())(raw_action_distribution)
# TODO(hartikainen): Remove the unnecessary Model call here
self.action_distribution_model = tf.keras.Model(
self.model_out, action_distribution)
self.register_variables(self.action_distribution_model.variables)
def build_q_net(name, observations, actions):
q_net = tf.keras.Sequential([
tf.keras.layers.Concatenate(axis=1),
] + [
tf.keras.layers.Dense(
units=units,
activation=getattr(tf.nn, critic_hidden_activation),
name="{}_hidden_{}".format(name, i))
for i, units in enumerate(critic_hiddens)
] + [
tf.keras.layers.Dense(
units=1, activation=None, name="{}_out".format(name))
])
# TODO(hartikainen): Remove the unnecessary Model call here
q_net = tf.keras.Model([observations, actions],
q_net([observations, actions]))
return q_net
self.q_net = build_q_net("q", self.model_out, self.actions)
self.register_variables(self.q_net.variables)
if twin_q:
self.twin_q_net = build_q_net("twin_q", self.model_out,
self.actions)
self.register_variables(self.twin_q_net.variables)
else:
self.twin_q_net = None
self.log_alpha = tf.Variable(0.0, dtype=tf.float32, name="log_alpha")
self.alpha = tf.exp(self.log_alpha)
self.register_variables([self.log_alpha])
def forward(self, input_dict, state, seq_lens):
"""This generates the model_out tensor input.
You must implement this as documented in modelv2.py."""
raise NotImplementedError
def get_policy_output(self, model_out, deterministic=False):
"""Return the (unscaled) output of the policy network.
This returns the unscaled outputs of pi(s).
Arguments:
model_out (Tensor): obs embeddings from the model layers, of shape
[BATCH_SIZE, num_outputs].
Returns:
tensor of shape [BATCH_SIZE, action_dim] with range [-inf, inf].
"""
action_distribution = self.action_distribution_model(model_out)
if deterministic:
actions = action_distribution.bijector(
action_distribution.distribution.mean())
log_pis = None
else:
actions = action_distribution.sample()
log_pis = action_distribution.log_prob(actions)
return actions, log_pis
def get_q_values(self, model_out, actions):
"""Return the Q estimates for the most recent forward pass.
This implements Q(s, a).
Arguments:
model_out (Tensor): obs embeddings from the model layers, of shape
[BATCH_SIZE, num_outputs].
actions (Tensor): action values that correspond with the most
recent batch of observations passed through forward(), of shape
[BATCH_SIZE, action_dim].
Returns:
tensor of shape [BATCH_SIZE].
"""
return self.q_net([model_out, actions])
def get_twin_q_values(self, model_out, actions):
"""Same as get_q_values but using the twin Q net.
This implements the twin Q(s, a).
Arguments:
model_out (Tensor): obs embeddings from the model layers, of shape
[BATCH_SIZE, num_outputs].
actions (Tensor): action values that correspond with the most
recent batch of observations passed through forward(), of shape
[BATCH_SIZE, action_dim].
Returns:
tensor of shape [BATCH_SIZE].
"""
return self.twin_q_net([model_out, actions])
def policy_variables(self):
"""Return the list of variables for the policy net."""
return list(self.action_distribution_model.variables)
def q_variables(self):
"""Return the list of variables for Q / twin Q nets."""
return self.q_net.variables + (self.twin_q_net.variables
if self.twin_q_net else [])
-367
View File
@@ -1,367 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from gym.spaces import Box
import numpy as np
import logging
import ray
import ray.experimental.tf_utils
from ray.rllib.agents.sac.sac_model import SACModel
from ray.rllib.agents.ddpg.noop_model import NoopModel
from ray.rllib.agents.dqn.dqn_policy import _postprocess_dqn, PRIO_WEIGHTS
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.policy.tf_policy_template import build_tf_policy
from ray.rllib.models import ModelCatalog
from ray.rllib.utils.error import UnsupportedSpaceException
from ray.rllib.utils import try_import_tf, try_import_tfp
from ray.rllib.utils.tf_ops import minimize_and_clip
tf = try_import_tf()
tfp = try_import_tfp()
logger = logging.getLogger(__name__)
def build_sac_model(policy, obs_space, action_space, config):
if config["model"]["custom_model"]:
logger.warning(
"Setting use_state_preprocessor=True since a custom model "
"was specified.")
config["use_state_preprocessor"] = True
if not isinstance(action_space, Box):
raise UnsupportedSpaceException(
"Action space {} is not supported for SAC.".format(action_space))
if len(action_space.shape) > 1:
raise UnsupportedSpaceException(
"Action space has multiple dimensions "
"{}. ".format(action_space.shape) +
"Consider reshaping this into a single dimension, "
"using a Tuple action space, or the multi-agent API.")
if config["use_state_preprocessor"]:
default_model = None # catalog decides
num_outputs = 256 # arbitrary
config["model"]["no_final_linear"] = True
else:
default_model = NoopModel
num_outputs = int(np.product(obs_space.shape))
policy.model = ModelCatalog.get_model_v2(
obs_space,
action_space,
num_outputs,
config["model"],
framework="tf",
model_interface=SACModel,
default_model=default_model,
name="sac_model",
actor_hidden_activation=config["policy_model"]["hidden_activation"],
actor_hiddens=config["policy_model"]["hidden_layer_sizes"],
critic_hidden_activation=config["Q_model"]["hidden_activation"],
critic_hiddens=config["Q_model"]["hidden_layer_sizes"],
twin_q=config["twin_q"])
policy.target_model = ModelCatalog.get_model_v2(
obs_space,
action_space,
num_outputs,
config["model"],
framework="tf",
model_interface=SACModel,
default_model=default_model,
name="target_sac_model",
actor_hidden_activation=config["policy_model"]["hidden_activation"],
actor_hiddens=config["policy_model"]["hidden_layer_sizes"],
critic_hidden_activation=config["Q_model"]["hidden_activation"],
critic_hiddens=config["Q_model"]["hidden_layer_sizes"],
twin_q=config["twin_q"])
return policy.model
def postprocess_trajectory(policy,
sample_batch,
other_agent_batches=None,
episode=None):
return _postprocess_dqn(policy, sample_batch)
def exploration_setting_inputs(policy):
return {
policy.stochastic: policy.config["exploration_enabled"],
}
def build_action_output(policy, model, input_dict, obs_space, action_space,
config):
model_out, _ = model({
"obs": input_dict[SampleBatch.CUR_OBS],
"is_training": policy._get_is_training_placeholder(),
}, [], None)
def unsquash_actions(actions):
# Use sigmoid to scale to [0,1], but also double magnitude of input to
# emulate behaviour of tanh activation used in SAC and TD3 papers.
sigmoid_out = tf.nn.sigmoid(2 * actions)
# Rescale to actual env policy scale
# (shape of sigmoid_out is [batch_size, dim_actions], so we reshape to
# get same dims)
action_range = (action_space.high - action_space.low)[None]
low_action = action_space.low[None]
unsquashed_actions = action_range * sigmoid_out + low_action
return unsquashed_actions
squashed_stochastic_actions, log_pis = policy.model.get_policy_output(
model_out, deterministic=False)
stochastic_actions = unsquash_actions(squashed_stochastic_actions)
squashed_deterministic_actions, _ = policy.model.get_policy_output(
model_out, deterministic=True)
deterministic_actions = unsquash_actions(squashed_deterministic_actions)
actions = tf.cond(policy.stochastic, lambda: stochastic_actions,
lambda: deterministic_actions)
action_probabilities = tf.cond(policy.stochastic, lambda: log_pis,
lambda: tf.zeros_like(log_pis))
policy.output_actions = actions
return actions, action_probabilities
def actor_critic_loss(policy, batch_tensors):
model_out_t, _ = policy.model({
"obs": batch_tensors[SampleBatch.CUR_OBS],
"is_training": policy._get_is_training_placeholder(),
}, [], None)
model_out_tp1, _ = policy.model({
"obs": batch_tensors[SampleBatch.NEXT_OBS],
"is_training": policy._get_is_training_placeholder(),
}, [], None)
target_model_out_tp1, _ = policy.target_model({
"obs": batch_tensors[SampleBatch.NEXT_OBS],
"is_training": policy._get_is_training_placeholder(),
}, [], None)
# TODO(hartikainen): figure actions and log pis
policy_t, log_pis_t = policy.model.get_policy_output(model_out_t)
policy_tp1, log_pis_tp1 = policy.model.get_policy_output(model_out_tp1)
log_alpha = policy.model.log_alpha
alpha = policy.model.alpha
# q network evaluation
q_t = policy.model.get_q_values(model_out_t,
batch_tensors[SampleBatch.ACTIONS])
if policy.config["twin_q"]:
twin_q_t = policy.model.get_twin_q_values(
model_out_t, batch_tensors[SampleBatch.ACTIONS])
# Q-values for current policy (no noise) in given current state
q_t_det_policy = policy.model.get_q_values(model_out_t, policy_t)
# target q network evaluation
q_tp1 = policy.target_model.get_q_values(target_model_out_tp1, policy_tp1)
if policy.config["twin_q"]:
twin_q_tp1 = policy.target_model.get_twin_q_values(
target_model_out_tp1, policy_tp1)
q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1)
if policy.config["twin_q"]:
twin_q_t_selected = tf.squeeze(twin_q_t, axis=len(q_t.shape) - 1)
q_tp1 = tf.minimum(q_tp1, twin_q_tp1)
q_tp1 -= tf.expand_dims(alpha * log_pis_t, 1)
q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1)
q_tp1_best_masked = (1.0 - tf.cast(batch_tensors[SampleBatch.DONES],
tf.float32)) * q_tp1_best
assert policy.config["n_step"] == 1, "TODO(hartikainen) n_step > 1"
# compute RHS of bellman equation
q_t_selected_target = tf.stop_gradient(
batch_tensors[SampleBatch.REWARDS] +
policy.config["gamma"]**policy.config["n_step"] * q_tp1_best_masked)
# compute the error (potentially clipped)
if policy.config["twin_q"]:
td_error = q_t_selected - q_t_selected_target
twin_td_error = twin_q_t_selected - q_t_selected_target
td_error = td_error + twin_td_error
errors = 0.5 * (tf.square(td_error) + tf.square(twin_td_error))
else:
td_error = q_t_selected - q_t_selected_target
errors = 0.5 * tf.square(td_error)
critic_loss = policy.model.custom_loss(
tf.reduce_mean(batch_tensors[PRIO_WEIGHTS] * errors), batch_tensors)
actor_loss = tf.reduce_mean(alpha * log_pis_t - q_t_det_policy)
target_entropy = (-np.prod(policy.action_space.shape)
if policy.config["target_entropy"] == "auto" else
policy.config["target_entropy"])
alpha_loss = -tf.reduce_mean(
log_alpha * tf.stop_gradient(log_pis_t + target_entropy))
# save for stats function
policy.q_t = q_t
policy.td_error = td_error
policy.actor_loss = actor_loss
policy.critic_loss = critic_loss
policy.alpha_loss = alpha_loss
# in a custom apply op we handle the losses separately, but return them
# combined in one loss for now
return actor_loss + critic_loss + alpha_loss
def gradients(policy, optimizer, loss):
if policy.config["grad_norm_clipping"] is not None:
actor_grads_and_vars = minimize_and_clip(
policy._actor_optimizer,
policy.actor_loss,
var_list=policy.model.policy_variables(),
clip_val=policy.config["grad_norm_clipping"])
critic_grads_and_vars = minimize_and_clip(
policy._critic_optimizer,
policy.critic_loss,
var_list=policy.model.q_variables(),
clip_val=policy.config["grad_norm_clipping"])
alpha_grads_and_vars = minimize_and_clip(
policy._alpha_optimizer,
policy.alpha_loss,
var_list=policy.model.alpha,
clip_val=policy.config["grad_norm_clipping"])
else:
actor_grads_and_vars = policy._actor_optimizer.compute_gradients(
policy.actor_loss, var_list=policy.model.policy_variables())
critic_grads_and_vars = policy._critic_optimizer.compute_gradients(
policy.critic_loss, var_list=policy.model.q_variables())
alpha_grads_and_vars = policy._critic_optimizer.compute_gradients(
policy.alpha_loss, var_list=policy.model.alpha)
# save these for later use in build_apply_op
policy._actor_grads_and_vars = [(g, v) for (g, v) in actor_grads_and_vars
if g is not None]
policy._critic_grads_and_vars = [(g, v) for (g, v) in critic_grads_and_vars
if g is not None]
policy._alpha_grads_and_vars = [(g, v) for (g, v) in alpha_grads_and_vars
if g is not None]
grads_and_vars = (
policy._actor_grads_and_vars + policy._critic_grads_and_vars +
policy._alpha_grads_and_vars)
return grads_and_vars
def stats(policy, batch_tensors):
return {
"td_error": tf.reduce_mean(policy.td_error),
"actor_loss": tf.reduce_mean(policy.actor_loss),
"critic_loss": tf.reduce_mean(policy.critic_loss),
"mean_q": tf.reduce_mean(policy.q_t),
"max_q": tf.reduce_max(policy.q_t),
"min_q": tf.reduce_min(policy.q_t),
}
class ExplorationStateMixin(object):
def __init__(self, obs_space, action_space, config):
self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
def set_epsilon(self, epsilon):
pass
class TargetNetworkMixin(object):
def __init__(self, config):
# update_target_fn will be called periodically to copy Q network to
# target Q network
self.tau_value = config.get("tau")
self.tau = tf.placeholder(tf.float32, (), name="tau")
update_target_expr = []
model_vars = self.model.trainable_variables()
target_model_vars = self.target_model.trainable_variables()
assert len(model_vars) == len(target_model_vars), \
(model_vars, target_model_vars)
for var, var_target in zip(model_vars, target_model_vars):
update_target_expr.append(
var_target.assign(self.tau * var +
(1.0 - self.tau) * var_target))
logger.debug("Update target op {}".format(var_target))
self.update_target_expr = tf.group(*update_target_expr)
# Hard initial update
self.update_target(tau=1.0)
# support both hard and soft sync
def update_target(self, tau=None):
tau = tau or self.tau_value
return self.get_session().run(
self.update_target_expr, feed_dict={self.tau: tau})
class ActorCriticOptimizerMixin(object):
def __init__(self, config):
# create global step for counting the number of update operations
self.global_step = tf.train.get_or_create_global_step()
# use separate optimizers for actor & critic
self._actor_optimizer = tf.train.AdamOptimizer(
learning_rate=config["optimization"]["actor_learning_rate"])
self._critic_optimizer = tf.train.AdamOptimizer(
learning_rate=config["optimization"]["critic_learning_rate"])
self._alpha_optimizer = tf.train.AdamOptimizer(
learning_rate=config["optimization"]["entropy_learning_rate"])
class ComputeTDErrorMixin(object):
def compute_td_error(self, obs_t, act_t, rew_t, obs_tp1, done_mask,
importance_weights):
if not self.loss_initialized():
return np.zeros_like(rew_t)
td_err = self.get_session().run(
self.td_error,
feed_dict={
self.get_placeholder(SampleBatch.CUR_OBS): [
np.array(ob) for ob in obs_t
],
self.get_placeholder(SampleBatch.ACTIONS): act_t,
self.get_placeholder(SampleBatch.REWARDS): rew_t,
self.get_placeholder(SampleBatch.NEXT_OBS): [
np.array(ob) for ob in obs_tp1
],
self.get_placeholder(SampleBatch.DONES): done_mask,
self.get_placeholder(PRIO_WEIGHTS): importance_weights
})
return td_err
def setup_early_mixins(policy, obs_space, action_space, config):
ExplorationStateMixin.__init__(policy, obs_space, action_space, config)
ActorCriticOptimizerMixin.__init__(policy, config)
def setup_late_mixins(policy, obs_space, action_space, config):
TargetNetworkMixin.__init__(policy, config)
SACTFPolicy = build_tf_policy(
name="SACTFPolicy",
get_default_config=lambda: ray.rllib.agents.sac.sac.DEFAULT_CONFIG,
make_model=build_sac_model,
postprocess_fn=postprocess_trajectory,
extra_action_feed_fn=exploration_setting_inputs,
action_sampler_fn=build_action_output,
loss_fn=actor_critic_loss,
stats_fn=stats,
gradients_fn=gradients,
extra_learn_fetches_fn=lambda policy: {"td_error": policy.td_error},
mixins=[
TargetNetworkMixin, ExplorationStateMixin, ActorCriticOptimizerMixin,
ComputeTDErrorMixin
],
before_init=setup_early_mixins,
after_init=setup_late_mixins,
obs_include_prev_action_reward=False)
-797
View File
@@ -1,797 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from datetime import datetime
import copy
import logging
import os
import pickle
import six
import time
import tempfile
import ray
from ray.exceptions import RayError
from ray.rllib.models import MODEL_DEFAULTS
from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID
from ray.rllib.evaluation.metrics import collect_metrics
from ray.rllib.optimizers.policy_optimizer import PolicyOptimizer
from ray.rllib.evaluation.worker_set import WorkerSet
from ray.rllib.utils.annotations import override, PublicAPI, DeveloperAPI
from ray.rllib.utils import FilterManager, deep_update, merge_dicts
from ray.rllib.utils.memory import ray_get_and_free
from ray.rllib.utils import try_import_tf
from ray.tune.registry import ENV_CREATOR, register_env, _global_registry
from ray.tune.trainable import Trainable
from ray.tune.trial import ExportFormat
from ray.tune.resources import Resources
from ray.tune.logger import UnifiedLogger
from ray.tune.result import DEFAULT_RESULTS_DIR
tf = try_import_tf()
logger = logging.getLogger(__name__)
# Max number of times to retry a worker failure. We shouldn't try too many
# times in a row since that would indicate a persistent cluster issue.
MAX_WORKER_FAILURE_RETRIES = 3
# yapf: disable
# __sphinx_doc_begin__
COMMON_CONFIG = {
# === Debugging ===
# Whether to write episode stats and videos to the agent log dir
"monitor": False,
# Set the ray.rllib.* log level for the agent process and its workers.
# Should be one of DEBUG, INFO, WARN, or ERROR. The DEBUG level will also
# periodically print out summaries of relevant internal dataflow (this is
# also printed out once at startup at the INFO level).
"log_level": "INFO",
# Callbacks that will be run during various phases of training. These all
# take a single "info" dict as an argument. For episode callbacks, custom
# metrics can be attached to the episode by updating the episode object's
# custom metrics dict (see examples/custom_metrics_and_callbacks.py). You
# may also mutate the passed in batch data in your callback.
"callbacks": {
"on_episode_start": None, # arg: {"env": .., "episode": ...}
"on_episode_step": None, # arg: {"env": .., "episode": ...}
"on_episode_end": None, # arg: {"env": .., "episode": ...}
"on_sample_end": None, # arg: {"samples": .., "worker": ...}
"on_train_result": None, # arg: {"trainer": ..., "result": ...}
"on_postprocess_traj": None, # arg: {
# "agent_id": ..., "episode": ...,
# "pre_batch": (before processing),
# "post_batch": (after processing),
# "all_pre_batches": (other agent ids),
# }
},
# Whether to attempt to continue training if a worker crashes.
"ignore_worker_failures": False,
# Log system resource metrics to results.
"log_sys_usage": True,
# === Policy ===
# Arguments to pass to model. See models/catalog.py for a full list of the
# available model options.
"model": MODEL_DEFAULTS,
# Arguments to pass to the policy optimizer. These vary by optimizer.
"optimizer": {},
# === Environment ===
# Discount factor of the MDP
"gamma": 0.99,
# Number of steps after which the episode is forced to terminate. Defaults
# to `env.spec.max_episode_steps` (if present) for Gym envs.
"horizon": None,
# Calculate rewards but don't reset the environment when the horizon is
# hit. This allows value estimation and RNN state to span across logical
# episodes denoted by horizon. This only has an effect if horizon != inf.
"soft_horizon": False,
# Don't set 'done' at the end of the episode. Note that you still need to
# set this if soft_horizon=True, unless your env is actually running
# forever without returning done=True.
"no_done_at_end": False,
# Arguments to pass to the env creator
"env_config": {},
# Environment name can also be passed via config
"env": None,
# Whether to clip rewards prior to experience postprocessing. Setting to
# None means clip for Atari only.
"clip_rewards": None,
# Whether to np.clip() actions to the action space low/high range spec.
"clip_actions": True,
# Whether to use rllib or deepmind preprocessors by default
"preprocessor_pref": "deepmind",
# The default learning rate
"lr": 0.0001,
# === Evaluation ===
# Evaluate with every `evaluation_interval` training iterations.
# The evaluation stats will be reported under the "evaluation" metric key.
# Note that evaluation is currently not parallelized, and that for Ape-X
# metrics are already only reported for the lowest epsilon workers.
"evaluation_interval": None,
# Number of episodes to run per evaluation period.
"evaluation_num_episodes": 10,
# Extra arguments to pass to evaluation workers.
# Typical usage is to pass extra args to evaluation env creator
# and to disable exploration by computing deterministic actions
# TODO(kismuz): implement determ. actions and include relevant keys hints
"evaluation_config": {},
# === Resources ===
# Number of actors used for parallelism
"num_workers": 2,
# Number of GPUs to allocate to the driver. Note that not all algorithms
# can take advantage of driver GPUs. This can be fraction (e.g., 0.3 GPUs).
"num_gpus": 0,
# Number of CPUs to allocate per worker.
"num_cpus_per_worker": 1,
# Number of GPUs to allocate per worker. This can be fractional.
"num_gpus_per_worker": 0,
# Any custom resources to allocate per worker.
"custom_resources_per_worker": {},
# Number of CPUs to allocate for the driver. Note: this only takes effect
# when running in Tune.
"num_cpus_for_driver": 1,
# === Execution ===
# Number of environments to evaluate vectorwise per worker.
"num_envs_per_worker": 1,
# Default sample batch size (unroll length). Batches of this size are
# collected from workers until train_batch_size is met. When using
# multiple envs per worker, this is multiplied by num_envs_per_worker.
"sample_batch_size": 200,
# Training batch size, if applicable. Should be >= sample_batch_size.
# Samples batches will be concatenated together to this size for training.
"train_batch_size": 200,
# Whether to rollout "complete_episodes" or "truncate_episodes"
"batch_mode": "truncate_episodes",
# Use a background thread for sampling (slightly off-policy, usually not
# advisable to turn on unless your env specifically requires it)
"sample_async": False,
# Element-wise observation filter, either "NoFilter" or "MeanStdFilter"
"observation_filter": "NoFilter",
# Whether to synchronize the statistics of remote filters.
"synchronize_filters": True,
# Configure TF for single-process operation by default
"tf_session_args": {
# note: overriden by `local_tf_session_args`
"intra_op_parallelism_threads": 2,
"inter_op_parallelism_threads": 2,
"gpu_options": {
"allow_growth": True,
},
"log_device_placement": False,
"device_count": {
"CPU": 1
},
"allow_soft_placement": True, # required by PPO multi-gpu
},
# Override the following tf session args on the local worker
"local_tf_session_args": {
# Allow a higher level of parallelism by default, but not unlimited
# since that can cause crashes with many concurrent drivers.
"intra_op_parallelism_threads": 8,
"inter_op_parallelism_threads": 8,
},
# Whether to LZ4 compress individual observations
"compress_observations": False,
# Wait for metric batches for at most this many seconds. Those that
# have not returned in time will be collected in the next iteration.
"collect_metrics_timeout": 180,
# Smooth metrics over this many episodes.
"metrics_smoothing_episodes": 100,
# If using num_envs_per_worker > 1, whether to create those new envs in
# remote processes instead of in the same worker. This adds overheads, but
# can make sense if your envs can take much time to step / reset
# (e.g., for StarCraft). Use this cautiously; overheads are significant.
"remote_worker_envs": False,
# Timeout that remote workers are waiting when polling environments.
# 0 (continue when at least one env is ready) is a reasonable default,
# but optimal value could be obtained by measuring your environment
# step / reset and model inference perf.
"remote_env_batch_wait_ms": 0,
# Minimum time per iteration
"min_iter_time_s": 0,
# Minimum env steps to optimize for per train call. This value does
# not affect learning, only the length of iterations.
"timesteps_per_iteration": 0,
# This argument, in conjunction with worker_index, sets the random seed of
# each worker, so that identically configured trials will have identical
# results. This makes experiments reproducible.
"seed": None,
# === Offline Datasets ===
# Specify how to generate experiences:
# - "sampler": generate experiences via online simulation (default)
# - a local directory or file glob expression (e.g., "/tmp/*.json")
# - a list of individual file paths/URIs (e.g., ["/tmp/1.json",
# "s3://bucket/2.json"])
# - a dict with string keys and sampling probabilities as values (e.g.,
# {"sampler": 0.4, "/tmp/*.json": 0.4, "s3://bucket/expert.json": 0.2}).
# - a function that returns a rllib.offline.InputReader
"input": "sampler",
# Specify how to evaluate the current policy. This only has an effect when
# reading offline experiences. Available options:
# - "wis": the weighted step-wise importance sampling estimator.
# - "is": the step-wise importance sampling estimator.
# - "simulation": run the environment in the background, but use
# this data for evaluation only and not for learning.
"input_evaluation": ["is", "wis"],
# Whether to run postprocess_trajectory() on the trajectory fragments from
# offline inputs. Note that postprocessing will be done using the *current*
# policy, not the *behaviour* policy, which is typically undesirable for
# on-policy algorithms.
"postprocess_inputs": False,
# If positive, input batches will be shuffled via a sliding window buffer
# of this number of batches. Use this if the input data is not in random
# enough order. Input is delayed until the shuffle buffer is filled.
"shuffle_buffer_size": 0,
# Specify where experiences should be saved:
# - None: don't save any experiences
# - "logdir" to save to the agent log dir
# - a path/URI to save to a custom output directory (e.g., "s3://bucket/")
# - a function that returns a rllib.offline.OutputWriter
"output": None,
# What sample batch columns to LZ4 compress in the output data.
"output_compress_columns": ["obs", "new_obs"],
# Max output file size before rolling over to a new file.
"output_max_file_size": 64 * 1024 * 1024,
# === Multiagent ===
"multiagent": {
# Map from policy ids to tuples of (policy_cls, obs_space,
# act_space, config). See rollout_worker.py for more info.
"policies": {},
# Function mapping agent ids to policy ids.
"policy_mapping_fn": None,
# Optional whitelist of policies to train, or None for all policies.
"policies_to_train": None,
},
}
# __sphinx_doc_end__
# yapf: enable
@DeveloperAPI
def with_common_config(extra_config):
"""Returns the given config dict merged with common agent confs."""
return with_base_config(COMMON_CONFIG, extra_config)
def with_base_config(base_config, extra_config):
"""Returns the given config dict merged with a base agent conf."""
config = copy.deepcopy(base_config)
config.update(extra_config)
return config
@PublicAPI
class Trainer(Trainable):
"""A trainer coordinates the optimization of one or more RL policies.
All RLlib trainers extend this base class, e.g., the A3CTrainer implements
the A3C algorithm for single and multi-agent training.
Trainer objects retain internal model state between calls to train(), so
you should create a new trainer instance for each training session.
Attributes:
env_creator (func): Function that creates a new training env.
config (obj): Algorithm-specific configuration data.
logdir (str): Directory in which training outputs should be placed.
"""
_allow_unknown_configs = False
_allow_unknown_subkeys = [
"tf_session_args", "env_config", "model", "optimizer", "multiagent",
"custom_resources_per_worker", "evaluation_config"
]
@PublicAPI
def __init__(self, config=None, env=None, logger_creator=None):
"""Initialize an RLLib trainer.
Args:
config (dict): Algorithm-specific configuration data.
env (str): Name of the environment to use. Note that this can also
be specified as the `env` key in config.
logger_creator (func): Function that creates a ray.tune.Logger
object. If unspecified, a default logger is created.
"""
config = config or {}
# Vars to synchronize to workers on each train call
self.global_vars = {"timestep": 0}
# Trainers allow env ids to be passed directly to the constructor.
self._env_id = self._register_if_needed(env or config.get("env"))
# Create a default logger creator if no logger_creator is specified
if logger_creator is None:
timestr = datetime.today().strftime("%Y-%m-%d_%H-%M-%S")
logdir_prefix = "{}_{}_{}".format(self._name, self._env_id,
timestr)
def default_logger_creator(config):
"""Creates a Unified logger with a default logdir prefix
containing the agent name and the env id
"""
if not os.path.exists(DEFAULT_RESULTS_DIR):
os.makedirs(DEFAULT_RESULTS_DIR)
logdir = tempfile.mkdtemp(
prefix=logdir_prefix, dir=DEFAULT_RESULTS_DIR)
return UnifiedLogger(config, logdir, None)
logger_creator = default_logger_creator
Trainable.__init__(self, config, logger_creator)
@classmethod
@override(Trainable)
def default_resource_request(cls, config):
cf = dict(cls._default_config, **config)
Trainer._validate_config(cf)
# TODO(ekl): add custom resources here once tune supports them
return Resources(
cpu=cf["num_cpus_for_driver"],
gpu=cf["num_gpus"],
extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"],
extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])
@override(Trainable)
@PublicAPI
def train(self):
"""Overrides super.train to synchronize global vars."""
if self._has_policy_optimizer():
self.global_vars["timestep"] = self.optimizer.num_steps_sampled
self.optimizer.workers.local_worker().set_global_vars(
self.global_vars)
for w in self.optimizer.workers.remote_workers():
w.set_global_vars.remote(self.global_vars)
logger.debug("updated global vars: {}".format(self.global_vars))
result = None
for _ in range(1 + MAX_WORKER_FAILURE_RETRIES):
try:
result = Trainable.train(self)
except RayError as e:
if self.config["ignore_worker_failures"]:
logger.exception(
"Error in train call, attempting to recover")
self._try_recover()
else:
logger.info(
"Worker crashed during call to train(). To attempt to "
"continue training without the failed worker, set "
"`'ignore_worker_failures': True`.")
raise e
except Exception as e:
time.sleep(0.5) # allow logs messages to propagate
raise e
else:
break
if result is None:
raise RuntimeError("Failed to recover from worker crash")
if (self.config.get("observation_filter", "NoFilter") != "NoFilter"
and hasattr(self, "workers")
and isinstance(self.workers, WorkerSet)):
FilterManager.synchronize(
self.workers.local_worker().filters,
self.workers.remote_workers(),
update_remote=self.config["synchronize_filters"])
logger.debug("synchronized filters: {}".format(
self.workers.local_worker().filters))
if self._has_policy_optimizer():
result["num_healthy_workers"] = len(
self.optimizer.workers.remote_workers())
if self.config["evaluation_interval"]:
if self._iteration % self.config["evaluation_interval"] == 0:
evaluation_metrics = self._evaluate()
assert isinstance(evaluation_metrics, dict), \
"_evaluate() needs to return a dict."
result.update(evaluation_metrics)
return result
@override(Trainable)
def _log_result(self, result):
if self.config["callbacks"].get("on_train_result"):
self.config["callbacks"]["on_train_result"]({
"trainer": self,
"result": result,
})
# log after the callback is invoked, so that the user has a chance
# to mutate the result
Trainable._log_result(self, result)
@override(Trainable)
def _setup(self, config):
env = self._env_id
if env:
config["env"] = env
if _global_registry.contains(ENV_CREATOR, env):
self.env_creator = _global_registry.get(ENV_CREATOR, env)
else:
import gym # soft dependency
self.env_creator = lambda env_config: gym.make(env)
else:
self.env_creator = lambda env_config: None
# Merge the supplied config with the class default
merged_config = copy.deepcopy(self._default_config)
merged_config = deep_update(merged_config, config,
self._allow_unknown_configs,
self._allow_unknown_subkeys)
self.raw_user_config = config
self.config = merged_config
Trainer._validate_config(self.config)
if self.config.get("log_level"):
logging.getLogger("ray.rllib").setLevel(self.config["log_level"])
def get_scope():
if tf:
return tf.Graph().as_default()
else:
return open("/dev/null") # fake a no-op scope
with get_scope():
self._init(self.config, self.env_creator)
# Evaluation related
if self.config.get("evaluation_interval"):
# Update env_config with evaluation settings:
extra_config = copy.deepcopy(self.config["evaluation_config"])
extra_config.update({
"batch_mode": "complete_episodes",
"batch_steps": 1,
})
logger.debug(
"using evaluation_config: {}".format(extra_config))
self.evaluation_workers = self._make_workers(
self.env_creator,
self._policy,
merge_dicts(self.config, extra_config),
num_workers=0)
self.evaluation_metrics = self._evaluate()
@override(Trainable)
def _stop(self):
if hasattr(self, "workers"):
self.workers.stop()
if hasattr(self, "optimizer"):
self.optimizer.stop()
@override(Trainable)
def _save(self, checkpoint_dir):
checkpoint_path = os.path.join(checkpoint_dir,
"checkpoint-{}".format(self.iteration))
pickle.dump(self.__getstate__(), open(checkpoint_path, "wb"))
return checkpoint_path
@override(Trainable)
def _restore(self, checkpoint_path):
extra_data = pickle.load(open(checkpoint_path, "rb"))
self.__setstate__(extra_data)
@DeveloperAPI
def _make_workers(self, env_creator, policy, config, num_workers):
return WorkerSet(
env_creator,
policy,
config,
num_workers=num_workers,
logdir=self.logdir)
@DeveloperAPI
def _init(self, config, env_creator):
"""Subclasses should override this for custom initialization."""
raise NotImplementedError
@DeveloperAPI
def _evaluate(self):
"""Evaluates current policy under `evaluation_config` settings.
Note that this default implementation does not do anything beyond
merging evaluation_config with the normal trainer config.
"""
if not self.config["evaluation_config"]:
raise ValueError(
"No evaluation_config specified. It doesn't make sense "
"to enable evaluation without specifying any config "
"overrides, since the results will be the "
"same as reported during normal policy evaluation.")
logger.info("Evaluating current policy for {} episodes".format(
self.config["evaluation_num_episodes"]))
self._before_evaluate()
self.evaluation_workers.local_worker().restore(
self.workers.local_worker().save())
for _ in range(self.config["evaluation_num_episodes"]):
self.evaluation_workers.local_worker().sample()
metrics = collect_metrics(self.evaluation_workers.local_worker())
return {"evaluation": metrics}
@DeveloperAPI
def _before_evaluate(self):
"""Pre-evaluation callback."""
pass
@PublicAPI
def compute_action(self,
observation,
state=None,
prev_action=None,
prev_reward=None,
info=None,
policy_id=DEFAULT_POLICY_ID,
full_fetch=False):
"""Computes an action for the specified policy.
Note that you can also access the policy object through
self.get_policy(policy_id) and call compute_actions() on it directly.
Arguments:
observation (obj): observation from the environment.
state (list): RNN hidden state, if any. If state is not None,
then all of compute_single_action(...) is returned
(computed action, rnn state, logits dictionary).
Otherwise compute_single_action(...)[0] is
returned (computed action).
prev_action (obj): previous action value, if any
prev_reward (int): previous reward, if any
info (dict): info object, if any
policy_id (str): policy to query (only applies to multi-agent).
full_fetch (bool): whether to return extra action fetch results.
This is always set to true if RNN state is specified.
Returns:
Just the computed action if full_fetch=False, or the full output
of policy.compute_actions() otherwise.
"""
if state is None:
state = []
preprocessed = self.workers.local_worker().preprocessors[
policy_id].transform(observation)
filtered_obs = self.workers.local_worker().filters[policy_id](
preprocessed, update=False)
if state:
return self.get_policy(policy_id).compute_single_action(
filtered_obs,
state,
prev_action,
prev_reward,
info,
clip_actions=self.config["clip_actions"])
res = self.get_policy(policy_id).compute_single_action(
filtered_obs,
state,
prev_action,
prev_reward,
info,
clip_actions=self.config["clip_actions"])
if full_fetch:
return res
else:
return res[0] # backwards compatibility
@property
def _name(self):
"""Subclasses should override this to declare their name."""
raise NotImplementedError
@property
def _default_config(self):
"""Subclasses should override this to declare their default config."""
raise NotImplementedError
@PublicAPI
def get_policy(self, policy_id=DEFAULT_POLICY_ID):
"""Return policy for the specified id, or None.
Arguments:
policy_id (str): id of policy to return.
"""
return self.workers.local_worker().get_policy(policy_id)
@PublicAPI
def get_weights(self, policies=None):
"""Return a dictionary of policy ids to weights.
Arguments:
policies (list): Optional list of policies to return weights for,
or None for all policies.
"""
return self.workers.local_worker().get_weights(policies)
@PublicAPI
def set_weights(self, weights):
"""Set policy weights by policy id.
Arguments:
weights (dict): Map of policy ids to weights to set.
"""
self.workers.local_worker().set_weights(weights)
@DeveloperAPI
def export_policy_model(self, export_dir, policy_id=DEFAULT_POLICY_ID):
"""Export policy model with given policy_id to local directory.
Arguments:
export_dir (string): Writable local directory.
policy_id (string): Optional policy id to export.
Example:
>>> trainer = MyTrainer()
>>> for _ in range(10):
>>> trainer.train()
>>> trainer.export_policy_model("/tmp/export_dir")
"""
self.workers.local_worker().export_policy_model(export_dir, policy_id)
@DeveloperAPI
def export_policy_checkpoint(self,
export_dir,
filename_prefix="model",
policy_id=DEFAULT_POLICY_ID):
"""Export tensorflow policy model checkpoint to local directory.
Arguments:
export_dir (string): Writable local directory.
filename_prefix (string): file name prefix of checkpoint files.
policy_id (string): Optional policy id to export.
Example:
>>> trainer = MyTrainer()
>>> for _ in range(10):
>>> trainer.train()
>>> trainer.export_policy_checkpoint("/tmp/export_dir")
"""
self.workers.local_worker().export_policy_checkpoint(
export_dir, filename_prefix, policy_id)
@DeveloperAPI
def collect_metrics(self, selected_workers=None):
"""Collects metrics from the remote workers of this agent.
This is the same data as returned by a call to train().
"""
return self.optimizer.collect_metrics(
self.config["collect_metrics_timeout"],
min_history=self.config["metrics_smoothing_episodes"],
selected_workers=selected_workers)
@classmethod
def resource_help(cls, config):
return ("\n\nYou can adjust the resource requests of RLlib agents by "
"setting `num_workers`, `num_gpus`, and other configs. See "
"the DEFAULT_CONFIG defined by each agent for more info.\n\n"
"The config of this agent is: {}".format(config))
@staticmethod
def _validate_config(config):
if "policy_graphs" in config["multiagent"]:
logger.warning(
"The `policy_graphs` config has been renamed to `policies`.")
# Backwards compatibility
config["multiagent"]["policies"] = config["multiagent"][
"policy_graphs"]
del config["multiagent"]["policy_graphs"]
if "gpu" in config:
raise ValueError(
"The `gpu` config is deprecated, please use `num_gpus=0|1` "
"instead.")
if "gpu_fraction" in config:
raise ValueError(
"The `gpu_fraction` config is deprecated, please use "
"`num_gpus=<fraction>` instead.")
if "use_gpu_for_workers" in config:
raise ValueError(
"The `use_gpu_for_workers` config is deprecated, please use "
"`num_gpus_per_worker=1` instead.")
if type(config["input_evaluation"]) != list:
raise ValueError(
"`input_evaluation` must be a list of strings, got {}".format(
config["input_evaluation"]))
def _try_recover(self):
"""Try to identify and blacklist any unhealthy workers.
This method is called after an unexpected remote error is encountered
from a worker. It issues check requests to all current workers and
blacklists any that respond with error. If no healthy workers remain,
an error is raised.
"""
if not self._has_policy_optimizer():
raise NotImplementedError(
"Recovery is not supported for this algorithm")
logger.info("Health checking all workers...")
checks = []
for ev in self.optimizer.workers.remote_workers():
_, obj_id = ev.sample_with_count.remote()
checks.append(obj_id)
healthy_workers = []
for i, obj_id in enumerate(checks):
w = self.optimizer.workers.remote_workers()[i]
try:
ray_get_and_free(obj_id)
healthy_workers.append(w)
logger.info("Worker {} looks healthy".format(i + 1))
except RayError:
logger.exception("Blacklisting worker {}".format(i + 1))
try:
w.__ray_terminate__.remote()
except Exception:
logger.exception("Error terminating unhealthy worker")
if len(healthy_workers) < 1:
raise RuntimeError(
"Not enough healthy workers remain to continue.")
self.optimizer.reset(healthy_workers)
def _has_policy_optimizer(self):
return hasattr(self, "optimizer") and isinstance(
self.optimizer, PolicyOptimizer)
@override(Trainable)
def _export_model(self, export_formats, export_dir):
ExportFormat.validate(export_formats)
exported = {}
if ExportFormat.CHECKPOINT in export_formats:
path = os.path.join(export_dir, ExportFormat.CHECKPOINT)
self.export_policy_checkpoint(path)
exported[ExportFormat.CHECKPOINT] = path
if ExportFormat.MODEL in export_formats:
path = os.path.join(export_dir, ExportFormat.MODEL)
self.export_policy_model(path)
exported[ExportFormat.MODEL] = path
return exported
def __getstate__(self):
state = {}
if hasattr(self, "workers"):
state["worker"] = self.workers.local_worker().save()
if hasattr(self, "optimizer") and hasattr(self.optimizer, "save"):
state["optimizer"] = self.optimizer.save()
return state
def __setstate__(self, state):
if "worker" in state:
self.workers.local_worker().restore(state["worker"])
remote_state = ray.put(state["worker"])
for r in self.workers.remote_workers():
r.restore.remote(remote_state)
if "optimizer" in state:
self.optimizer.restore(state["optimizer"])
def _register_if_needed(self, env_object):
if isinstance(env_object, six.string_types):
return env_object
elif isinstance(env_object, type):
name = env_object.__name__
register_env(name, lambda config: env_object(config))
return name
raise ValueError(
"{} is an invalid env specification. ".format(env_object) +
"You can specify a custom env as either a class "
"(e.g., YourEnvCls) or a registered env id (e.g., \"your_env\").")
-174
View File
@@ -1,174 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import time
from ray.rllib.agents.trainer import Trainer, COMMON_CONFIG
from ray.rllib.optimizers import SyncSamplesOptimizer
from ray.rllib.utils import add_mixins
from ray.rllib.utils.annotations import override, DeveloperAPI
@DeveloperAPI
def build_trainer(name,
default_policy,
default_config=None,
validate_config=None,
get_initial_state=None,
get_policy_class=None,
before_init=None,
make_workers=None,
make_policy_optimizer=None,
after_init=None,
before_train_step=None,
after_optimizer_step=None,
after_train_result=None,
collect_metrics_fn=None,
before_evaluate_fn=None,
mixins=None):
"""Helper function for defining a custom trainer.
Functions will be run in this order to initialize the trainer:
1. Config setup: validate_config, get_initial_state, get_policy
2. Worker setup: before_init, make_workers, make_policy_optimizer
3. Post setup: after_init
Arguments:
name (str): name of the trainer (e.g., "PPO")
default_policy (cls): the default Policy class to use
default_config (dict): the default config dict of the algorithm,
otherwises uses the Trainer default config
validate_config (func): optional callback that checks a given config
for correctness. It may mutate the config as needed.
get_initial_state (func): optional function that returns the initial
state dict given the trainer instance as an argument. The state
dict must be serializable so that it can be checkpointed, and will
be available as the `trainer.state` variable.
get_policy_class (func): optional callback that takes a config and
returns the policy class to override the default with
before_init (func): optional function to run at the start of trainer
init that takes the trainer instance as argument
make_workers (func): override the method that creates rollout workers.
This takes in (trainer, env_creator, policy, config) as args.
make_policy_optimizer (func): optional function that returns a
PolicyOptimizer instance given (WorkerSet, config)
after_init (func): optional function to run at the end of trainer init
that takes the trainer instance as argument
before_train_step (func): optional callback to run before each train()
call. It takes the trainer instance as an argument.
after_optimizer_step (func): optional callback to run after each
step() call to the policy optimizer. It takes the trainer instance
and the policy gradient fetches as arguments.
after_train_result (func): optional callback to run at the end of each
train() call. It takes the trainer instance and result dict as
arguments, and may mutate the result dict as needed.
collect_metrics_fn (func): override the method used to collect metrics.
It takes the trainer instance as argumnt.
before_evaluate_fn (func): callback to run before evaluation. This
takes the trainer instance as argument.
mixins (list): list of any class mixins for the returned trainer class.
These mixins will be applied in order and will have higher
precedence than the Trainer class
Returns:
a Trainer instance that uses the specified args.
"""
original_kwargs = locals().copy()
base = add_mixins(Trainer, mixins)
class trainer_cls(base):
_name = name
_default_config = default_config or COMMON_CONFIG
_policy = default_policy
def __init__(self, config=None, env=None, logger_creator=None):
Trainer.__init__(self, config, env, logger_creator)
def _init(self, config, env_creator):
if validate_config:
validate_config(config)
if get_initial_state:
self.state = get_initial_state(self)
else:
self.state = {}
if get_policy_class is None:
policy = default_policy
else:
policy = get_policy_class(config)
if before_init:
before_init(self)
if make_workers:
self.workers = make_workers(self, env_creator, policy, config)
else:
self.workers = self._make_workers(env_creator, policy, config,
self.config["num_workers"])
if make_policy_optimizer:
self.optimizer = make_policy_optimizer(self.workers, config)
else:
optimizer_config = dict(
config["optimizer"],
**{"train_batch_size": config["train_batch_size"]})
self.optimizer = SyncSamplesOptimizer(self.workers,
**optimizer_config)
if after_init:
after_init(self)
@override(Trainer)
def _train(self):
if before_train_step:
before_train_step(self)
prev_steps = self.optimizer.num_steps_sampled
start = time.time()
while True:
fetches = self.optimizer.step()
if after_optimizer_step:
after_optimizer_step(self, fetches)
if (time.time() - start >= self.config["min_iter_time_s"]
and self.optimizer.num_steps_sampled - prev_steps >=
self.config["timesteps_per_iteration"]):
break
if collect_metrics_fn:
res = collect_metrics_fn(self)
else:
res = self.collect_metrics()
res.update(
timesteps_this_iter=self.optimizer.num_steps_sampled -
prev_steps,
info=res.get("info", {}))
if after_train_result:
after_train_result(self, res)
return res
@override(Trainer)
def _before_evaluate(self):
if before_evaluate_fn:
before_evaluate_fn(self)
def __getstate__(self):
state = Trainer.__getstate__(self)
state["trainer_state"] = self.state.copy()
return state
def __setstate__(self, state):
Trainer.__setstate__(self, state)
self.state = state["trainer_state"].copy()
@staticmethod
def with_updates(**overrides):
"""Build a copy of this trainer with the specified overrides.
Arguments:
overrides (dict): use this to override any of the arguments
originally passed to build_trainer() for this policy.
"""
return build_trainer(**dict(original_kwargs, **overrides))
trainer_cls.with_updates = with_updates
trainer_cls.__name__ = name
trainer_cls.__qualname__ = name
return trainer_cls
-141
View File
@@ -1,141 +0,0 @@
{
// The version of the config file format. Do not change, unless
// you know what you are doing.
"version": 1,
// The name of the project being benchmarked
"project": "rllib",
// The project's homepage
"project_url": "http://rllib.io",
// The URL or local path of the source code repository for the
// project being benchmarked
"repo": "../../../",
// List of branches to benchmark. If not provided, defaults to "master"
// (for git) or "default" (for mercurial).
"branches": ["master"], // for git
// "branches": ["default"], // for mercurial
// The DVCS being used. If not set, it will be automatically
// determined from "repo" by looking at the protocol in the URL
// (if remote), or by looking for special directories, such as
// ".git" (if local).
"dvcs": "git",
// The tool to use to create environments. May be "conda",
// "virtualenv" or other value depending on the plugins in use.
// If missing or the empty string, the tool will be automatically
// determined by looking for tools on the PATH environment
// variable.
"environment_type": "conda",
// timeout in seconds for installing any dependencies in environment
// defaults to 10 min
//"install_timeout": 600,
// the base URL to show a commit for the project.
"show_commit_url": "http://github.com/ray-project/ray/commit/",
// The Pythons you'd like to test against. If not provided, defaults
// to the current version of Python used to run `asv`.
"pythons": ["3.6"],
// The matrix of dependencies to test. Each key is the name of a
// package (in PyPI) and the values are version numbers. An empty
// list or empty string indicates to just test against the default
// (latest) version. null indicates that the package is to not be
// installed. If the package to be tested is only available from
// PyPi, and the 'environment_type' is conda, then you can preface
// the package name by 'pip+', and the package will be installed via
// pip (with all the conda available packages installed first,
// followed by the pip installed packages).
//
// "matrix": {
// "numpy": ["1.6", "1.7"],
// "six": ["", null], // test with and without six installed
// "pip+emcee": [""], // emcee is only available for install with pip.
// },
// Combinations of libraries/python versions can be excluded/included
// from the set to test. Each entry is a dictionary containing additional
// key-value pairs to include/exclude.
//
// An exclude entry excludes entries where all values match. The
// values are regexps that should match the whole string.
//
// An include entry adds an environment. Only the packages listed
// are installed. The 'python' key is required. The exclude rules
// do not apply to includes.
//
// In addition to package names, the following keys are available:
//
// - python
// Python version, as in the *pythons* variable above.
// - environment_type
// Environment type, as above.
// - sys_platform
// Platform, as in sys.platform. Possible values for the common
// cases: 'linux2', 'win32', 'cygwin', 'darwin'.
//
// "exclude": [
// {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows
// {"environment_type": "conda", "six": null}, // don't run without six on conda
// ],
//
// "include": [
// // additional env for python2.7
// {"python": "2.7", "numpy": "1.8"},
// // additional env if run on windows+conda
// {"platform": "win32", "environment_type": "conda", "python": "2.7", "libpython": ""},
// ],
// The directory (relative to the current directory) that benchmarks are
// stored in. If not provided, defaults to "benchmarks"
"benchmark_dir": "tuned_examples/regression_tests",
// The directory (relative to the current directory) to cache the Python
// environments in. If not provided, defaults to "env"
// "env_dir": "env",
// The directory (relative to the current directory) that raw benchmark
// results are stored in. If not provided, defaults to "results".
"results_dir": "RLLIB_RESULTS",
// The directory (relative to the current directory) that the html tree
// should be written to. If not provided, defaults to "html".
// "html_dir": "html",
// The number of characters to retain in the commit hashes.
// "hash_length": 8,
// `asv` will cache wheels of the recent builds in each
// environment, making them faster to install next time. This is
// number of builds to keep, per environment.
// "wheel_cache_size": 0
// The commits after which the regression search in `asv publish`
// should start looking for regressions. Dictionary whose keys are
// regexps matching to benchmark names, and values corresponding to
// the commit (exclusive) after which to start looking for
// regressions. The default is to start from the first commit
// with results. If the commit is `null`, regression detection is
// skipped for the matching benchmark.
//
// "regressions_first_commits": {
// "some_benchmark": "352cdf", // Consider regressions only after this commit
// "another_benchmark": null, // Skip regression detection altogether
// }
// The thresholds for relative change in results, after which `asv
// publish` starts reporting regressions. Dictionary of the same
// form as in ``regressions_first_commits``, with values
// indicating the thresholds. If multiple entries match, the
// maximum is taken. If no entry matches, the default is 5%.
//
// "regressions_thresholds": {
// "some_benchmark": 0.01, // Threshold of 1%
// "another_benchmark": 0.5, // Threshold of 50%
// }
}
-3
View File
@@ -1,3 +0,0 @@
Contributed algorithms, which can be run via ``rllib train --run=contrib/<alg_name>``
See https://ray.readthedocs.io/en/latest/rllib-dev.html for guidelines.
@@ -1,52 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from ray.rllib.agents.trainer import Trainer, with_common_config
from ray.rllib.utils.annotations import override
# yapf: disable
# __sphinx_doc_begin__
class RandomAgent(Trainer):
"""Policy that takes random actions and never learns."""
_name = "RandomAgent"
_default_config = with_common_config({
"rollouts_per_iteration": 10,
})
@override(Trainer)
def _init(self, config, env_creator):
self.env = env_creator(config["env_config"])
@override(Trainer)
def _train(self):
rewards = []
steps = 0
for _ in range(self.config["rollouts_per_iteration"]):
obs = self.env.reset()
done = False
reward = 0.0
while not done:
action = self.env.action_space.sample()
obs, r, done, info = self.env.step(action)
reward += r
steps += 1
rewards.append(reward)
return {
"episode_reward_mean": np.mean(rewards),
"timesteps_this_iter": steps,
}
# __sphinx_doc_end__
# don't enable yapf after, it's buggy here
if __name__ == "__main__":
trainer = RandomAgent(
env="CartPole-v0", config={"rollouts_per_iteration": 10})
result = trainer.train()
assert result["episode_reward_mean"] > 10, result
print("Test: OK")
-15
View File
@@ -1,15 +0,0 @@
"""Registry of algorithm names for `rllib train --run=<alg_name>`"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
def _import_random_agent():
from ray.rllib.contrib.random_agent.random_agent import RandomAgent
return RandomAgent
CONTRIBUTED_ALGORITHMS = {
"contrib/RandomAgent": _import_random_agent,
}
-11
View File
@@ -1,11 +0,0 @@
from ray.rllib.env.base_env import BaseEnv
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from ray.rllib.env.external_env import ExternalEnv
from ray.rllib.env.serving_env import ServingEnv
from ray.rllib.env.vector_env import VectorEnv
from ray.rllib.env.env_context import EnvContext
__all__ = [
"BaseEnv", "MultiAgentEnv", "ExternalEnv", "VectorEnv", "ServingEnv",
"EnvContext"
]
-291
View File
@@ -1,291 +0,0 @@
import numpy as np
from collections import deque
import gym
from gym import spaces
import cv2
cv2.ocl.setUseOpenCL(False)
def is_atari(env):
if (hasattr(env.observation_space, "shape")
and env.observation_space.shape is not None
and len(env.observation_space.shape) <= 2):
return False
return hasattr(env, "unwrapped") and hasattr(env.unwrapped, "ale")
def get_wrapper_by_cls(env, cls):
"""Returns the gym env wrapper of the given class, or None."""
currentenv = env
while True:
if isinstance(currentenv, cls):
return currentenv
elif isinstance(currentenv, gym.Wrapper):
currentenv = currentenv.env
else:
return None
class MonitorEnv(gym.Wrapper):
def __init__(self, env=None):
"""Record episodes stats prior to EpisodicLifeEnv, etc."""
gym.Wrapper.__init__(self, env)
self._current_reward = None
self._num_steps = None
self._total_steps = None
self._episode_rewards = []
self._episode_lengths = []
self._num_episodes = 0
self._num_returned = 0
def reset(self, **kwargs):
obs = self.env.reset(**kwargs)
if self._total_steps is None:
self._total_steps = sum(self._episode_lengths)
if self._current_reward is not None:
self._episode_rewards.append(self._current_reward)
self._episode_lengths.append(self._num_steps)
self._num_episodes += 1
self._current_reward = 0
self._num_steps = 0
return obs
def step(self, action):
obs, rew, done, info = self.env.step(action)
self._current_reward += rew
self._num_steps += 1
self._total_steps += 1
return (obs, rew, done, info)
def get_episode_rewards(self):
return self._episode_rewards
def get_episode_lengths(self):
return self._episode_lengths
def get_total_steps(self):
return self._total_steps
def next_episode_results(self):
for i in range(self._num_returned, len(self._episode_rewards)):
yield (self._episode_rewards[i], self._episode_lengths[i])
self._num_returned = len(self._episode_rewards)
class NoopResetEnv(gym.Wrapper):
def __init__(self, env, noop_max=30):
"""Sample initial states by taking random number of no-ops on reset.
No-op is assumed to be action 0.
"""
gym.Wrapper.__init__(self, env)
self.noop_max = noop_max
self.override_num_noops = None
self.noop_action = 0
assert env.unwrapped.get_action_meanings()[0] == "NOOP"
def reset(self, **kwargs):
""" Do no-op action for a number of steps in [1, noop_max]."""
self.env.reset(**kwargs)
if self.override_num_noops is not None:
noops = self.override_num_noops
else:
noops = self.unwrapped.np_random.randint(1, self.noop_max + 1)
assert noops > 0
obs = None
for _ in range(noops):
obs, _, done, _ = self.env.step(self.noop_action)
if done:
obs = self.env.reset(**kwargs)
return obs
def step(self, ac):
return self.env.step(ac)
class ClipRewardEnv(gym.RewardWrapper):
def __init__(self, env):
gym.RewardWrapper.__init__(self, env)
def reward(self, reward):
"""Bin reward to {+1, 0, -1} by its sign."""
return np.sign(reward)
class FireResetEnv(gym.Wrapper):
def __init__(self, env):
"""Take action on reset.
For environments that are fixed until firing."""
gym.Wrapper.__init__(self, env)
assert env.unwrapped.get_action_meanings()[1] == "FIRE"
assert len(env.unwrapped.get_action_meanings()) >= 3
def reset(self, **kwargs):
self.env.reset(**kwargs)
obs, _, done, _ = self.env.step(1)
if done:
self.env.reset(**kwargs)
obs, _, done, _ = self.env.step(2)
if done:
self.env.reset(**kwargs)
return obs
def step(self, ac):
return self.env.step(ac)
class EpisodicLifeEnv(gym.Wrapper):
def __init__(self, env):
"""Make end-of-life == end-of-episode, but only reset on true game over.
Done by DeepMind for the DQN and co. since it helps value estimation.
"""
gym.Wrapper.__init__(self, env)
self.lives = 0
self.was_real_done = True
def step(self, action):
obs, reward, done, info = self.env.step(action)
self.was_real_done = done
# check current lives, make loss of life terminal,
# then update lives to handle bonus lives
lives = self.env.unwrapped.ale.lives()
if lives < self.lives and lives > 0:
# for Qbert sometimes we stay in lives == 0 condtion for a few fr
# so its important to keep lives > 0, so that we only reset once
# the environment advertises done.
done = True
self.lives = lives
return obs, reward, done, info
def reset(self, **kwargs):
"""Reset only when lives are exhausted.
This way all states are still reachable even though lives are episodic,
and the learner need not know about any of this behind-the-scenes.
"""
if self.was_real_done:
obs = self.env.reset(**kwargs)
else:
# no-op step to advance from terminal/lost life state
obs, _, _, _ = self.env.step(0)
self.lives = self.env.unwrapped.ale.lives()
return obs
class MaxAndSkipEnv(gym.Wrapper):
def __init__(self, env, skip=4):
"""Return only every `skip`-th frame"""
gym.Wrapper.__init__(self, env)
# most recent raw observations (for max pooling across time steps)
self._obs_buffer = np.zeros(
(2, ) + env.observation_space.shape, dtype=np.uint8)
self._skip = skip
def step(self, action):
"""Repeat action, sum reward, and max over last observations."""
total_reward = 0.0
done = None
for i in range(self._skip):
obs, reward, done, info = self.env.step(action)
if i == self._skip - 2:
self._obs_buffer[0] = obs
if i == self._skip - 1:
self._obs_buffer[1] = obs
total_reward += reward
if done:
break
# Note that the observation on the done=True frame
# doesn't matter
max_frame = self._obs_buffer.max(axis=0)
return max_frame, total_reward, done, info
def reset(self, **kwargs):
return self.env.reset(**kwargs)
class WarpFrame(gym.ObservationWrapper):
def __init__(self, env, dim):
"""Warp frames to the specified size (dim x dim)."""
gym.ObservationWrapper.__init__(self, env)
self.width = dim
self.height = dim
self.observation_space = spaces.Box(
low=0,
high=255,
shape=(self.height, self.width, 1),
dtype=np.uint8)
def observation(self, frame):
frame = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
frame = cv2.resize(
frame, (self.width, self.height), interpolation=cv2.INTER_AREA)
return frame[:, :, None]
class FrameStack(gym.Wrapper):
def __init__(self, env, k):
"""Stack k last frames."""
gym.Wrapper.__init__(self, env)
self.k = k
self.frames = deque([], maxlen=k)
shp = env.observation_space.shape
self.observation_space = spaces.Box(
low=0,
high=255,
shape=(shp[0], shp[1], shp[2] * k),
dtype=env.observation_space.dtype)
def reset(self):
ob = self.env.reset()
for _ in range(self.k):
self.frames.append(ob)
return self._get_ob()
def step(self, action):
ob, reward, done, info = self.env.step(action)
self.frames.append(ob)
return self._get_ob(), reward, done, info
def _get_ob(self):
assert len(self.frames) == self.k
return np.concatenate(self.frames, axis=2)
class ScaledFloatFrame(gym.ObservationWrapper):
def __init__(self, env):
gym.ObservationWrapper.__init__(self, env)
self.observation_space = gym.spaces.Box(
low=0, high=1, shape=env.observation_space.shape, dtype=np.float32)
def observation(self, observation):
# careful! This undoes the memory optimization, use
# with smaller replay buffers only.
return np.array(observation).astype(np.float32) / 255.0
def wrap_deepmind(env, dim=84, framestack=True):
"""Configure environment for DeepMind-style Atari.
Note that we assume reward clipping is done outside the wrapper.
Args:
dim (int): Dimension to resize observations to (dim x dim).
framestack (bool): Whether to framestack observations.
"""
env = MonitorEnv(env)
env = NoopResetEnv(env, noop_max=30)
if "NoFrameskip" in env.spec.id:
env = MaxAndSkipEnv(env, skip=4)
env = EpisodicLifeEnv(env)
if "FIRE" in env.unwrapped.get_action_meanings():
env = FireResetEnv(env)
env = WarpFrame(env, dim)
# env = ScaledFloatFrame(env) # TODO: use for dqn?
# env = ClipRewardEnv(env) # reward clipping is handled by policy eval
if framestack:
env = FrameStack(env, 4)
return env
-451
View File
@@ -1,451 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ray.rllib.env.external_env import ExternalEnv
from ray.rllib.env.external_multi_agent_env import ExternalMultiAgentEnv
from ray.rllib.env.vector_env import VectorEnv
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from ray.rllib.utils.annotations import override, PublicAPI
ASYNC_RESET_RETURN = "async_reset_return"
@PublicAPI
class BaseEnv(object):
"""The lowest-level env interface used by RLlib for sampling.
BaseEnv models multiple agents executing asynchronously in multiple
environments. A call to poll() returns observations from ready agents
keyed by their environment and agent ids, and actions for those agents
can be sent back via send_actions().
All other env types can be adapted to BaseEnv. RLlib handles these
conversions internally in RolloutWorker, for example:
gym.Env => rllib.VectorEnv => rllib.BaseEnv
rllib.MultiAgentEnv => rllib.BaseEnv
rllib.ExternalEnv => rllib.BaseEnv
Attributes:
action_space (gym.Space): Action space. This must be defined for
single-agent envs. Multi-agent envs can set this to None.
observation_space (gym.Space): Observation space. This must be defined
for single-agent envs. Multi-agent envs can set this to None.
Examples:
>>> env = MyBaseEnv()
>>> obs, rewards, dones, infos, off_policy_actions = env.poll()
>>> print(obs)
{
"env_0": {
"car_0": [2.4, 1.6],
"car_1": [3.4, -3.2],
},
"env_1": {
"car_0": [8.0, 4.1],
},
"env_2": {
"car_0": [2.3, 3.3],
"car_1": [1.4, -0.2],
"car_3": [1.2, 0.1],
},
}
>>> env.send_actions(
actions={
"env_0": {
"car_0": 0,
"car_1": 1,
}, ...
})
>>> obs, rewards, dones, infos, off_policy_actions = env.poll()
>>> print(obs)
{
"env_0": {
"car_0": [4.1, 1.7],
"car_1": [3.2, -4.2],
}, ...
}
>>> print(dones)
{
"env_0": {
"__all__": False,
"car_0": False,
"car_1": True,
}, ...
}
"""
@staticmethod
def to_base_env(env,
make_env=None,
num_envs=1,
remote_envs=False,
remote_env_batch_wait_ms=0):
"""Wraps any env type as needed to expose the async interface."""
from ray.rllib.env.remote_vector_env import RemoteVectorEnv
if remote_envs and num_envs == 1:
raise ValueError(
"Remote envs only make sense to use if num_envs > 1 "
"(i.e. vectorization is enabled).")
if not isinstance(env, BaseEnv):
if isinstance(env, MultiAgentEnv):
if remote_envs:
env = RemoteVectorEnv(
make_env,
num_envs,
multiagent=True,
remote_env_batch_wait_ms=remote_env_batch_wait_ms)
else:
env = _MultiAgentEnvToBaseEnv(
make_env=make_env,
existing_envs=[env],
num_envs=num_envs)
elif isinstance(env, ExternalMultiAgentEnv):
if num_envs != 1:
raise ValueError(
"ExternalMultiAgentEnv does not currently support "
"num_envs > 1.")
env = _ExternalEnvToBaseEnv(env, multiagent=True)
elif isinstance(env, ExternalEnv):
if num_envs != 1:
raise ValueError(
"ExternalEnv does not currently support num_envs > 1.")
env = _ExternalEnvToBaseEnv(env)
elif isinstance(env, VectorEnv):
env = _VectorEnvToBaseEnv(env)
else:
if remote_envs:
env = RemoteVectorEnv(
make_env,
num_envs,
multiagent=False,
remote_env_batch_wait_ms=remote_env_batch_wait_ms)
else:
env = VectorEnv.wrap(
make_env=make_env,
existing_envs=[env],
num_envs=num_envs,
action_space=env.action_space,
observation_space=env.observation_space)
env = _VectorEnvToBaseEnv(env)
assert isinstance(env, BaseEnv), env
return env
@PublicAPI
def poll(self):
"""Returns observations from ready agents.
The returns are two-level dicts mapping from env_id to a dict of
agent_id to values. The number of agents and envs can vary over time.
Returns
-------
obs (dict): New observations for each ready agent.
rewards (dict): Reward values for each ready agent. If the
episode is just started, the value will be None.
dones (dict): Done values for each ready agent. The special key
"__all__" is used to indicate env termination.
infos (dict): Info values for each ready agent.
off_policy_actions (dict): Agents may take off-policy actions. When
that happens, there will be an entry in this dict that contains
the taken action. There is no need to send_actions() for agents
that have already chosen off-policy actions.
"""
raise NotImplementedError
@PublicAPI
def send_actions(self, action_dict):
"""Called to send actions back to running agents in this env.
Actions should be sent for each ready agent that returned observations
in the previous poll() call.
Arguments:
action_dict (dict): Actions values keyed by env_id and agent_id.
"""
raise NotImplementedError
@PublicAPI
def try_reset(self, env_id):
"""Attempt to reset the env with the given id.
If the environment does not support synchronous reset, None can be
returned here.
Returns:
obs (dict|None): Resetted observation or None if not supported.
"""
return None
@PublicAPI
def get_unwrapped(self):
"""Return a reference to the underlying gym envs, if any.
Returns:
envs (list): Underlying gym envs or [].
"""
return []
@PublicAPI
def stop(self):
"""Releases all resources used."""
for env in self.get_unwrapped():
if hasattr(env, "close"):
env.close()
# Fixed agent identifier when there is only the single agent in the env
_DUMMY_AGENT_ID = "agent0"
def _with_dummy_agent_id(env_id_to_values, dummy_id=_DUMMY_AGENT_ID):
return {k: {dummy_id: v} for (k, v) in env_id_to_values.items()}
class _ExternalEnvToBaseEnv(BaseEnv):
"""Internal adapter of ExternalEnv to BaseEnv."""
def __init__(self, external_env, preprocessor=None, multiagent=False):
self.external_env = external_env
self.prep = preprocessor
self.multiagent = multiagent
self.action_space = external_env.action_space
if preprocessor:
self.observation_space = preprocessor.observation_space
else:
self.observation_space = external_env.observation_space
external_env.start()
@override(BaseEnv)
def poll(self):
with self.external_env._results_avail_condition:
results = self._poll()
while len(results[0]) == 0:
self.external_env._results_avail_condition.wait()
results = self._poll()
if not self.external_env.isAlive():
raise Exception("Serving thread has stopped.")
limit = self.external_env._max_concurrent_episodes
assert len(results[0]) < limit, \
("Too many concurrent episodes, were some leaked? This "
"ExternalEnv was created with max_concurrent={}".format(limit))
return results
@override(BaseEnv)
def send_actions(self, action_dict):
if self.multiagent:
for env_id, actions in action_dict.items():
self.external_env._episodes[env_id].action_queue.put(actions)
else:
for env_id, action in action_dict.items():
self.external_env._episodes[env_id].action_queue.put(
action[_DUMMY_AGENT_ID])
def _poll(self):
all_obs, all_rewards, all_dones, all_infos = {}, {}, {}, {}
off_policy_actions = {}
for eid, episode in self.external_env._episodes.copy().items():
data = episode.get_data()
cur_done = episode.cur_done_dict[
"__all__"] if self.multiagent else episode.cur_done
if cur_done:
del self.external_env._episodes[eid]
if data:
if self.prep:
all_obs[eid] = self.prep.transform(data["obs"])
else:
all_obs[eid] = data["obs"]
all_rewards[eid] = data["reward"]
all_dones[eid] = data["done"]
all_infos[eid] = data["info"]
if "off_policy_action" in data:
off_policy_actions[eid] = data["off_policy_action"]
if self.multiagent:
# ensure a consistent set of keys
# rely on all_obs having all possible keys for now
for eid, eid_dict in all_obs.items():
for agent_id in eid_dict.keys():
def fix(d, zero_val):
if agent_id not in d[eid]:
d[eid][agent_id] = zero_val
fix(all_rewards, 0.0)
fix(all_dones, False)
fix(all_infos, {})
return (all_obs, all_rewards, all_dones, all_infos,
off_policy_actions)
else:
return _with_dummy_agent_id(all_obs), \
_with_dummy_agent_id(all_rewards), \
_with_dummy_agent_id(all_dones, "__all__"), \
_with_dummy_agent_id(all_infos), \
_with_dummy_agent_id(off_policy_actions)
class _VectorEnvToBaseEnv(BaseEnv):
"""Internal adapter of VectorEnv to BaseEnv.
We assume the caller will always send the full vector of actions in each
call to send_actions(), and that they call reset_at() on all completed
environments before calling send_actions().
"""
def __init__(self, vector_env):
self.vector_env = vector_env
self.action_space = vector_env.action_space
self.observation_space = vector_env.observation_space
self.num_envs = vector_env.num_envs
self.new_obs = None # lazily initialized
self.cur_rewards = [None for _ in range(self.num_envs)]
self.cur_dones = [False for _ in range(self.num_envs)]
self.cur_infos = [None for _ in range(self.num_envs)]
@override(BaseEnv)
def poll(self):
if self.new_obs is None:
self.new_obs = self.vector_env.vector_reset()
new_obs = dict(enumerate(self.new_obs))
rewards = dict(enumerate(self.cur_rewards))
dones = dict(enumerate(self.cur_dones))
infos = dict(enumerate(self.cur_infos))
self.new_obs = []
self.cur_rewards = []
self.cur_dones = []
self.cur_infos = []
return _with_dummy_agent_id(new_obs), \
_with_dummy_agent_id(rewards), \
_with_dummy_agent_id(dones, "__all__"), \
_with_dummy_agent_id(infos), {}
@override(BaseEnv)
def send_actions(self, action_dict):
action_vector = [None] * self.num_envs
for i in range(self.num_envs):
action_vector[i] = action_dict[i][_DUMMY_AGENT_ID]
self.new_obs, self.cur_rewards, self.cur_dones, self.cur_infos = \
self.vector_env.vector_step(action_vector)
@override(BaseEnv)
def try_reset(self, env_id):
return {_DUMMY_AGENT_ID: self.vector_env.reset_at(env_id)}
@override(BaseEnv)
def get_unwrapped(self):
return self.vector_env.get_unwrapped()
class _MultiAgentEnvToBaseEnv(BaseEnv):
"""Internal adapter of MultiAgentEnv to BaseEnv.
This also supports vectorization if num_envs > 1.
"""
def __init__(self, make_env, existing_envs, num_envs):
"""Wrap existing multi-agent envs.
Arguments:
make_env (func|None): Factory that produces a new multiagent env.
Must be defined if the number of existing envs is less than
num_envs.
existing_envs (list): List of existing multiagent envs.
num_envs (int): Desired num multiagent envs to keep total.
"""
self.make_env = make_env
self.envs = existing_envs
self.num_envs = num_envs
self.dones = set()
while len(self.envs) < self.num_envs:
self.envs.append(self.make_env(len(self.envs)))
for env in self.envs:
assert isinstance(env, MultiAgentEnv)
self.env_states = [_MultiAgentEnvState(env) for env in self.envs]
@override(BaseEnv)
def poll(self):
obs, rewards, dones, infos = {}, {}, {}, {}
for i, env_state in enumerate(self.env_states):
obs[i], rewards[i], dones[i], infos[i] = env_state.poll()
return obs, rewards, dones, infos, {}
@override(BaseEnv)
def send_actions(self, action_dict):
for env_id, agent_dict in action_dict.items():
if env_id in self.dones:
raise ValueError("Env {} is already done".format(env_id))
env = self.envs[env_id]
obs, rewards, dones, infos = env.step(agent_dict)
assert isinstance(obs, dict), "Not a multi-agent obs"
assert isinstance(rewards, dict), "Not a multi-agent reward"
assert isinstance(dones, dict), "Not a multi-agent return"
assert isinstance(infos, dict), "Not a multi-agent info"
if set(obs.keys()) != set(rewards.keys()):
raise ValueError(
"Key set for obs and rewards must be the same: "
"{} vs {}".format(obs.keys(), rewards.keys()))
if set(infos).difference(set(obs)):
raise ValueError("Key set for infos must be a subset of obs: "
"{} vs {}".format(infos.keys(), obs.keys()))
if "__all__" not in dones:
raise ValueError(
"In multi-agent environments, '__all__': True|False must "
"be included in the 'done' dict: got {}.".format(dones))
if dones["__all__"]:
self.dones.add(env_id)
self.env_states[env_id].observe(obs, rewards, dones, infos)
@override(BaseEnv)
def try_reset(self, env_id):
obs = self.env_states[env_id].reset()
assert isinstance(obs, dict), "Not a multi-agent obs"
if obs is not None and env_id in self.dones:
self.dones.remove(env_id)
return obs
@override(BaseEnv)
def get_unwrapped(self):
return [state.env for state in self.env_states]
class _MultiAgentEnvState(object):
def __init__(self, env):
assert isinstance(env, MultiAgentEnv)
self.env = env
self.initialized = False
def poll(self):
if not self.initialized:
self.reset()
self.initialized = True
obs, rew, dones, info = (self.last_obs, self.last_rewards,
self.last_dones, self.last_infos)
self.last_obs = {}
self.last_rewards = {}
self.last_dones = {"__all__": False}
self.last_infos = {}
return obs, rew, dones, info
def observe(self, obs, rewards, dones, infos):
self.last_obs = obs
self.last_rewards = rewards
self.last_dones = dones
self.last_infos = infos
def reset(self):
self.last_obs = self.env.reset()
self.last_rewards = {
agent_id: None
for agent_id in self.last_obs.keys()
}
self.last_dones = {
agent_id: False
for agent_id in self.last_obs.keys()
}
self.last_infos = {agent_id: {} for agent_id in self.last_obs.keys()}
self.last_dones["__all__"] = False
return self.last_obs
-19
View File
@@ -1,19 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
# info key for the individual rewards of an agent, for example:
# info: {
# group_1: {
# _group_rewards: [5, -1, 1], # 3 agents in this group
# }
# }
GROUP_REWARDS = "_group_rewards"
# info key for the individual infos of an agent, for example:
# info: {
# group_1: {
# _group_infos: [{"foo": ...}, {}], # 2 agents in this group
# }
# }
GROUP_INFO = "_group_info"
-42
View File
@@ -1,42 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ray.rllib.utils.annotations import PublicAPI
@PublicAPI
class EnvContext(dict):
"""Wraps env configurations to include extra rllib metadata.
These attributes can be used to parameterize environments per process.
For example, one might use `worker_index` to control which data file an
environment reads in on initialization.
RLlib auto-sets these attributes when constructing registered envs.
Attributes:
worker_index (int): When there are multiple workers created, this
uniquely identifies the worker the env is created in.
vector_index (int): When there are multiple envs per worker, this
uniquely identifies the env index within the worker.
remote (bool): Whether environment should be remote or not.
"""
def __init__(self, env_config, worker_index, vector_index=0, remote=False):
dict.__init__(self, env_config)
self.worker_index = worker_index
self.vector_index = vector_index
self.remote = remote
def copy_with_overrides(self,
env_config=None,
worker_index=None,
vector_index=None,
remote=None):
return EnvContext(
env_config if env_config is not None else self,
worker_index if worker_index is not None else self.worker_index,
vector_index if vector_index is not None else self.vector_index,
remote if remote is not None else self.remote,
)
-272
View File
@@ -1,272 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from six.moves import queue
import threading
import uuid
from ray.rllib.utils.annotations import PublicAPI
@PublicAPI
class ExternalEnv(threading.Thread):
"""An environment that interfaces with external agents.
Unlike simulator envs, control is inverted. The environment queries the
policy to obtain actions and logs observations and rewards for training.
This is in contrast to gym.Env, where the algorithm drives the simulation
through env.step() calls.
You can use ExternalEnv as the backend for policy serving (by serving HTTP
requests in the run loop), for ingesting offline logs data (by reading
offline transitions in the run loop), or other custom use cases not easily
expressed through gym.Env.
ExternalEnv supports both on-policy actions (through self.get_action()),
and off-policy actions (through self.log_action()).
This env is thread-safe, but individual episodes must be executed serially.
Attributes:
action_space (gym.Space): Action space.
observation_space (gym.Space): Observation space.
Examples:
>>> register_env("my_env", lambda config: YourExternalEnv(config))
>>> trainer = DQNTrainer(env="my_env")
>>> while True:
print(trainer.train())
"""
@PublicAPI
def __init__(self, action_space, observation_space, max_concurrent=100):
"""Initialize an external env.
ExternalEnv subclasses must call this during their __init__.
Arguments:
action_space (gym.Space): Action space of the env.
observation_space (gym.Space): Observation space of the env.
max_concurrent (int): Max number of active episodes to allow at
once. Exceeding this limit raises an error.
"""
threading.Thread.__init__(self)
self.daemon = True
self.action_space = action_space
self.observation_space = observation_space
self._episodes = {}
self._finished = set()
self._results_avail_condition = threading.Condition()
self._max_concurrent_episodes = max_concurrent
@PublicAPI
def run(self):
"""Override this to implement the run loop.
Your loop should continuously:
1. Call self.start_episode(episode_id)
2. Call self.get_action(episode_id, obs)
-or-
self.log_action(episode_id, obs, action)
3. Call self.log_returns(episode_id, reward)
4. Call self.end_episode(episode_id, obs)
5. Wait if nothing to do.
Multiple episodes may be started at the same time.
"""
raise NotImplementedError
@PublicAPI
def start_episode(self, episode_id=None, training_enabled=True):
"""Record the start of an episode.
Arguments:
episode_id (str): Unique string id for the episode or None for
it to be auto-assigned.
training_enabled (bool): Whether to use experiences for this
episode to improve the policy.
Returns:
episode_id (str): Unique string id for the episode.
"""
if episode_id is None:
episode_id = uuid.uuid4().hex
if episode_id in self._finished:
raise ValueError(
"Episode {} has already completed.".format(episode_id))
if episode_id in self._episodes:
raise ValueError(
"Episode {} is already started".format(episode_id))
self._episodes[episode_id] = _ExternalEnvEpisode(
episode_id, self._results_avail_condition, training_enabled)
return episode_id
@PublicAPI
def get_action(self, episode_id, observation):
"""Record an observation and get the on-policy action.
Arguments:
episode_id (str): Episode id returned from start_episode().
observation (obj): Current environment observation.
Returns:
action (obj): Action from the env action space.
"""
episode = self._get(episode_id)
return episode.wait_for_action(observation)
@PublicAPI
def log_action(self, episode_id, observation, action):
"""Record an observation and (off-policy) action taken.
Arguments:
episode_id (str): Episode id returned from start_episode().
observation (obj): Current environment observation.
action (obj): Action for the observation.
"""
episode = self._get(episode_id)
episode.log_action(observation, action)
@PublicAPI
def log_returns(self, episode_id, reward, info=None):
"""Record returns from the environment.
The reward will be attributed to the previous action taken by the
episode. Rewards accumulate until the next action. If no reward is
logged before the next action, a reward of 0.0 is assumed.
Arguments:
episode_id (str): Episode id returned from start_episode().
reward (float): Reward from the environment.
info (dict): Optional info dict.
"""
episode = self._get(episode_id)
episode.cur_reward += reward
if info:
episode.cur_info = info or {}
@PublicAPI
def end_episode(self, episode_id, observation):
"""Record the end of an episode.
Arguments:
episode_id (str): Episode id returned from start_episode().
observation (obj): Current environment observation.
"""
episode = self._get(episode_id)
self._finished.add(episode.episode_id)
episode.done(observation)
def _get(self, episode_id):
"""Get a started episode or raise an error."""
if episode_id in self._finished:
raise ValueError(
"Episode {} has already completed.".format(episode_id))
if episode_id not in self._episodes:
raise ValueError("Episode {} not found.".format(episode_id))
return self._episodes[episode_id]
class _ExternalEnvEpisode(object):
"""Tracked state for each active episode."""
def __init__(self,
episode_id,
results_avail_condition,
training_enabled,
multiagent=False):
self.episode_id = episode_id
self.results_avail_condition = results_avail_condition
self.training_enabled = training_enabled
self.multiagent = multiagent
self.data_queue = queue.Queue()
self.action_queue = queue.Queue()
if multiagent:
self.new_observation_dict = None
self.new_action_dict = None
self.cur_reward_dict = {}
self.cur_done_dict = {"__all__": False}
self.cur_info_dict = {}
else:
self.new_observation = None
self.new_action = None
self.cur_reward = 0.0
self.cur_done = False
self.cur_info = {}
def get_data(self):
if self.data_queue.empty():
return None
return self.data_queue.get_nowait()
def log_action(self, observation, action):
if self.multiagent:
self.new_observation_dict = observation
self.new_action_dict = action
else:
self.new_observation = observation
self.new_action = action
self._send()
self.action_queue.get(True, timeout=60.0)
def wait_for_action(self, observation):
if self.multiagent:
self.new_observation_dict = observation
else:
self.new_observation = observation
self._send()
return self.action_queue.get(True, timeout=60.0)
def done(self, observation):
if self.multiagent:
self.new_observation_dict = observation
self.cur_done_dict = {"__all__": True}
else:
self.new_observation = observation
self.cur_done = True
self._send()
def _send(self):
if self.multiagent:
item = {
"obs": self.new_observation_dict,
"reward": self.cur_reward_dict,
"done": self.cur_done_dict,
"info": self.cur_info_dict,
}
if self.new_action_dict is not None:
item["off_policy_action"] = self.new_action_dict
self.new_observation_dict = None
self.new_action_dict = None
self.cur_reward_dict = {}
else:
item = {
"obs": self.new_observation,
"reward": self.cur_reward,
"done": self.cur_done,
"info": self.cur_info,
}
if self.new_action is not None:
item["off_policy_action"] = self.new_action
self.new_observation = None
self.new_action = None
self.cur_reward = 0.0
if not self.training_enabled:
item["info"]["training_enabled"] = False
with self.results_avail_condition:
self.data_queue.put_nowait(item)
self.results_avail_condition.notify()
-149
View File
@@ -1,149 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import uuid
from ray.rllib.utils.annotations import override, PublicAPI
from ray.rllib.env.external_env import ExternalEnv, _ExternalEnvEpisode
@PublicAPI
class ExternalMultiAgentEnv(ExternalEnv):
"""This is the multi-agent version of ExternalEnv."""
@PublicAPI
def __init__(self, action_space, observation_space, max_concurrent=100):
"""Initialize a multi-agent external env.
ExternalMultiAgentEnv subclasses must call this during their __init__.
Arguments:
action_space (gym.Space): Action space of the env.
observation_space (gym.Space): Observation space of the env.
max_concurrent (int): Max number of active episodes to allow at
once. Exceeding this limit raises an error.
"""
ExternalEnv.__init__(self, action_space, observation_space,
max_concurrent)
# we require to know all agents' spaces
if isinstance(self.action_space, dict) or isinstance(
self.observation_space, dict):
if not (self.action_space.keys() == self.observation_space.keys()):
raise ValueError("Agent ids disagree for action space and obs "
"space dict: {} {}".format(
self.action_space.keys(),
self.observation_space.keys()))
@PublicAPI
def run(self):
"""Override this to implement the multi-agent run loop.
Your loop should continuously:
1. Call self.start_episode(episode_id)
2. Call self.get_action(episode_id, obs_dict)
-or-
self.log_action(episode_id, obs_dict, action_dict)
3. Call self.log_returns(episode_id, reward_dict)
4. Call self.end_episode(episode_id, obs_dict)
5. Wait if nothing to do.
Multiple episodes may be started at the same time.
"""
raise NotImplementedError
@PublicAPI
@override(ExternalEnv)
def start_episode(self, episode_id=None, training_enabled=True):
if episode_id is None:
episode_id = uuid.uuid4().hex
if episode_id in self._finished:
raise ValueError(
"Episode {} has already completed.".format(episode_id))
if episode_id in self._episodes:
raise ValueError(
"Episode {} is already started".format(episode_id))
self._episodes[episode_id] = _ExternalEnvEpisode(
episode_id,
self._results_avail_condition,
training_enabled,
multiagent=True)
return episode_id
@PublicAPI
@override(ExternalEnv)
def get_action(self, episode_id, observation_dict):
"""Record an observation and get the on-policy action.
observation_dict is expected to contain the observation
of all agents acting in this episode step.
Arguments:
episode_id (str): Episode id returned from start_episode().
observation_dict (dict): Current environment observation.
Returns:
action (dict): Action from the env action space.
"""
episode = self._get(episode_id)
return episode.wait_for_action(observation_dict)
@PublicAPI
@override(ExternalEnv)
def log_action(self, episode_id, observation_dict, action_dict):
"""Record an observation and (off-policy) action taken.
Arguments:
episode_id (str): Episode id returned from start_episode().
observation_dict (dict): Current environment observation.
action_dict (dict): Action for the observation.
"""
episode = self._get(episode_id)
episode.log_action(observation_dict, action_dict)
@PublicAPI
@override(ExternalEnv)
def log_returns(self, episode_id, reward_dict, info_dict=None):
"""Record returns from the environment.
The reward will be attributed to the previous action taken by the
episode. Rewards accumulate until the next action. If no reward is
logged before the next action, a reward of 0.0 is assumed.
Arguments:
episode_id (str): Episode id returned from start_episode().
reward_dict (dict): Reward from the environment agents.
info (dict): Optional info dict.
"""
episode = self._get(episode_id)
# accumulate reward by agent
# for existing agents, we want to add the reward up
for agent, rew in reward_dict.items():
if agent in episode.cur_reward_dict:
episode.cur_reward_dict[agent] += rew
else:
episode.cur_reward_dict[agent] = rew
if info_dict:
episode.cur_info_dict = info_dict or {}
@PublicAPI
@override(ExternalEnv)
def end_episode(self, episode_id, observation_dict):
"""Record the end of an episode.
Arguments:
episode_id (str): Episode id returned from start_episode().
observation_dict (dict): Current environment observation.
"""
episode = self._get(episode_id)
self._finished.add(episode.episode_id)
episode.done(observation_dict)
-107
View File
@@ -1,107 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import OrderedDict
from ray.rllib.env.constants import GROUP_REWARDS, GROUP_INFO
from ray.rllib.env.multi_agent_env import MultiAgentEnv
# TODO(ekl) we should add some unit tests for this
class _GroupAgentsWrapper(MultiAgentEnv):
"""Wraps a MultiAgentEnv environment with agents grouped as specified.
See multi_agent_env.py for the specification of groups.
This API is experimental.
"""
def __init__(self, env, groups, obs_space=None, act_space=None):
"""Wrap an existing multi-agent env to group agents together.
See MultiAgentEnv.with_agent_groups() for usage info.
Arguments:
env (MultiAgentEnv): env to wrap
groups (dict): Grouping spec as documented in MultiAgentEnv
obs_space (Space): Optional observation space for the grouped
env. Must be a tuple space.
act_space (Space): Optional action space for the grouped env.
Must be a tuple space.
"""
self.env = env
self.groups = groups
self.agent_id_to_group = {}
for group_id, agent_ids in groups.items():
for agent_id in agent_ids:
if agent_id in self.agent_id_to_group:
raise ValueError(
"Agent id {} is in multiple groups".format(
agent_id, groups))
self.agent_id_to_group[agent_id] = group_id
if obs_space is not None:
self.observation_space = obs_space
if act_space is not None:
self.action_space = act_space
def reset(self):
obs = self.env.reset()
return self._group_items(obs)
def step(self, action_dict):
# Ungroup and send actions
action_dict = self._ungroup_items(action_dict)
obs, rewards, dones, infos = self.env.step(action_dict)
# Apply grouping transforms to the env outputs
obs = self._group_items(obs)
rewards = self._group_items(
rewards, agg_fn=lambda gvals: list(gvals.values()))
dones = self._group_items(
dones, agg_fn=lambda gvals: all(gvals.values()))
infos = self._group_items(
infos, agg_fn=lambda gvals: {GROUP_INFO: list(gvals.values())})
# Aggregate rewards, but preserve the original values in infos
for agent_id, rew in rewards.items():
if isinstance(rew, list):
rewards[agent_id] = sum(rew)
if agent_id not in infos:
infos[agent_id] = {}
infos[agent_id][GROUP_REWARDS] = rew
return obs, rewards, dones, infos
def _ungroup_items(self, items):
out = {}
for agent_id, value in items.items():
if agent_id in self.groups:
assert len(value) == len(self.groups[agent_id]), \
(agent_id, value, self.groups)
for a, v in zip(self.groups[agent_id], value):
out[a] = v
else:
out[agent_id] = value
return out
def _group_items(self, items, agg_fn=lambda gvals: list(gvals.values())):
grouped_items = {}
for agent_id, item in items.items():
if agent_id in self.agent_id_to_group:
group_id = self.agent_id_to_group[agent_id]
if group_id in grouped_items:
continue # already added
group_out = OrderedDict()
for a in self.groups[group_id]:
if a in items:
group_out[a] = items[a]
else:
raise ValueError(
"Missing member of group {}: {}: {}".format(
group_id, a, items))
grouped_items[group_id] = agg_fn(group_out)
else:
grouped_items[agent_id] = item
return grouped_items
-114
View File
@@ -1,114 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ray.rllib.utils.annotations import PublicAPI
@PublicAPI
class MultiAgentEnv(object):
"""An environment that hosts multiple independent agents.
Agents are identified by (string) agent ids. Note that these "agents" here
are not to be confused with RLlib agents.
Examples:
>>> env = MyMultiAgentEnv()
>>> obs = env.reset()
>>> print(obs)
{
"car_0": [2.4, 1.6],
"car_1": [3.4, -3.2],
"traffic_light_1": [0, 3, 5, 1],
}
>>> obs, rewards, dones, infos = env.step(
action_dict={
"car_0": 1, "car_1": 0, "traffic_light_1": 2,
})
>>> print(rewards)
{
"car_0": 3,
"car_1": -1,
"traffic_light_1": 0,
}
>>> print(dones)
{
"car_0": False, # car_0 is still running
"car_1": True, # car_1 is done
"__all__": False, # the env is not done
}
>>> print(infos)
{
"car_0": {}, # info for car_0
"car_1": {}, # info for car_1
}
"""
@PublicAPI
def reset(self):
"""Resets the env and returns observations from ready agents.
Returns:
obs (dict): New observations for each ready agent.
"""
raise NotImplementedError
@PublicAPI
def step(self, action_dict):
"""Returns observations from ready agents.
The returns are dicts mapping from agent_id strings to values. The
number of agents in the env can vary over time.
Returns
-------
obs (dict): New observations for each ready agent.
rewards (dict): Reward values for each ready agent. If the
episode is just started, the value will be None.
dones (dict): Done values for each ready agent. The special key
"__all__" (required) is used to indicate env termination.
infos (dict): Optional info values for each agent id.
"""
raise NotImplementedError
# yapf: disable
# __grouping_doc_begin__
@PublicAPI
def with_agent_groups(self, groups, obs_space=None, act_space=None):
"""Convenience method for grouping together agents in this env.
An agent group is a list of agent ids that are mapped to a single
logical agent. All agents of the group must act at the same time in the
environment. The grouped agent exposes Tuple action and observation
spaces that are the concatenated action and obs spaces of the
individual agents.
The rewards of all the agents in a group are summed. The individual
agent rewards are available under the "individual_rewards" key of the
group info return.
Agent grouping is required to leverage algorithms such as Q-Mix.
This API is experimental.
Arguments:
groups (dict): Mapping from group id to a list of the agent ids
of group members. If an agent id is not present in any group
value, it will be left ungrouped.
obs_space (Space): Optional observation space for the grouped
env. Must be a tuple space.
act_space (Space): Optional action space for the grouped env.
Must be a tuple space.
Examples:
>>> env = YourMultiAgentEnv(...)
>>> grouped_env = env.with_agent_groups(env, {
... "group1": ["agent1", "agent2", "agent3"],
... "group2": ["agent4", "agent5"],
... })
"""
from ray.rllib.env.group_agents_wrapper import _GroupAgentsWrapper
return _GroupAgentsWrapper(self, groups, obs_space, act_space)
# __grouping_doc_end__
# yapf: enable
-130
View File
@@ -1,130 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import logging
import ray
from ray.rllib.env.base_env import BaseEnv, _DUMMY_AGENT_ID, ASYNC_RESET_RETURN
from ray.rllib.utils.memory import ray_get_and_free
logger = logging.getLogger(__name__)
class RemoteVectorEnv(BaseEnv):
"""Vector env that executes envs in remote workers.
This provides dynamic batching of inference as observations are returned
from the remote simulator actors. Both single and multi-agent child envs
are supported, and envs can be stepped synchronously or async.
"""
def __init__(self, make_env, num_envs, multiagent,
remote_env_batch_wait_ms):
self.make_local_env = make_env
self.num_envs = num_envs
self.multiagent = multiagent
self.poll_timeout = remote_env_batch_wait_ms / 1000
self.actors = None # lazy init
self.pending = None # lazy init
def poll(self):
if self.actors is None:
def make_remote_env(i):
logger.info("Launching env {} in remote actor".format(i))
if self.multiagent:
return _RemoteMultiAgentEnv.remote(self.make_local_env, i)
else:
return _RemoteSingleAgentEnv.remote(self.make_local_env, i)
self.actors = [make_remote_env(i) for i in range(self.num_envs)]
if self.pending is None:
self.pending = {a.reset.remote(): a for a in self.actors}
# each keyed by env_id in [0, num_remote_envs)
obs, rewards, dones, infos = {}, {}, {}, {}
ready = []
# Wait for at least 1 env to be ready here
while not ready:
ready, _ = ray.wait(
list(self.pending),
num_returns=len(self.pending),
timeout=self.poll_timeout)
# Get and return observations for each of the ready envs
env_ids = set()
for obj_id in ready:
actor = self.pending.pop(obj_id)
env_id = self.actors.index(actor)
env_ids.add(env_id)
ob, rew, done, info = ray_get_and_free(obj_id)
obs[env_id] = ob
rewards[env_id] = rew
dones[env_id] = done
infos[env_id] = info
logger.debug("Got obs batch for actors {}".format(env_ids))
return obs, rewards, dones, infos, {}
def send_actions(self, action_dict):
for env_id, actions in action_dict.items():
actor = self.actors[env_id]
obj_id = actor.step.remote(actions)
self.pending[obj_id] = actor
def try_reset(self, env_id):
actor = self.actors[env_id]
obj_id = actor.reset.remote()
self.pending[obj_id] = actor
return ASYNC_RESET_RETURN
def stop(self):
if self.actors is not None:
for actor in self.actors:
actor.__ray_terminate__.remote()
@ray.remote(num_cpus=0)
class _RemoteMultiAgentEnv(object):
"""Wrapper class for making a multi-agent env a remote actor."""
def __init__(self, make_env, i):
self.env = make_env(i)
def reset(self):
obs = self.env.reset()
# each keyed by agent_id in the env
rew = {agent_id: 0 for agent_id in obs.keys()}
info = {agent_id: {} for agent_id in obs.keys()}
done = {"__all__": False}
return obs, rew, done, info
def step(self, action_dict):
return self.env.step(action_dict)
@ray.remote(num_cpus=0)
class _RemoteSingleAgentEnv(object):
"""Wrapper class for making a gym env a remote actor."""
def __init__(self, make_env, i):
self.env = make_env(i)
def reset(self):
obs = {_DUMMY_AGENT_ID: self.env.reset()}
rew = {agent_id: 0 for agent_id in obs.keys()}
info = {agent_id: {} for agent_id in obs.keys()}
done = {"__all__": False}
return obs, rew, done, info
def step(self, action):
obs, rew, done, info = self.env.step(action[_DUMMY_AGENT_ID])
obs, rew, done, info = [{
_DUMMY_AGENT_ID: x
} for x in [obs, rew, done, info]]
done["__all__"] = done[_DUMMY_AGENT_ID]
return obs, rew, done, info
-8
View File
@@ -1,8 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ray.rllib.env.external_env import ExternalEnv
# renamed to ExternalEnv in 0.6
ServingEnv = ExternalEnv
-126
View File
@@ -1,126 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import logging
import numpy as np
from ray.rllib.utils.annotations import override, PublicAPI
logger = logging.getLogger(__name__)
@PublicAPI
class VectorEnv(object):
"""An environment that supports batch evaluation.
Subclasses must define the following attributes:
Attributes:
action_space (gym.Space): Action space of individual envs.
observation_space (gym.Space): Observation space of individual envs.
num_envs (int): Number of envs in this vector env.
"""
@staticmethod
def wrap(make_env=None,
existing_envs=None,
num_envs=1,
action_space=None,
observation_space=None):
return _VectorizedGymEnv(make_env, existing_envs or [], num_envs,
action_space, observation_space)
@PublicAPI
def vector_reset(self):
"""Resets all environments.
Returns:
obs (list): Vector of observations from each environment.
"""
raise NotImplementedError
@PublicAPI
def reset_at(self, index):
"""Resets a single environment.
Returns:
obs (obj): Observations from the resetted environment.
"""
raise NotImplementedError
@PublicAPI
def vector_step(self, actions):
"""Vectorized step.
Arguments:
actions (list): Actions for each env.
Returns:
obs (list): New observations for each env.
rewards (list): Reward values for each env.
dones (list): Done values for each env.
infos (list): Info values for each env.
"""
raise NotImplementedError
@PublicAPI
def get_unwrapped(self):
"""Returns the underlying env instances."""
raise NotImplementedError
class _VectorizedGymEnv(VectorEnv):
"""Internal wrapper for gym envs to implement VectorEnv.
Arguments:
make_env (func|None): Factory that produces a new gym env. Must be
defined if the number of existing envs is less than num_envs.
existing_envs (list): List of existing gym envs.
num_envs (int): Desired num gym envs to keep total.
"""
def __init__(self,
make_env,
existing_envs,
num_envs,
action_space=None,
observation_space=None):
self.make_env = make_env
self.envs = existing_envs
self.num_envs = num_envs
while len(self.envs) < self.num_envs:
self.envs.append(self.make_env(len(self.envs)))
self.action_space = action_space or self.envs[0].action_space
self.observation_space = observation_space or \
self.envs[0].observation_space
@override(VectorEnv)
def vector_reset(self):
return [e.reset() for e in self.envs]
@override(VectorEnv)
def reset_at(self, index):
return self.envs[index].reset()
@override(VectorEnv)
def vector_step(self, actions):
obs_batch, rew_batch, done_batch, info_batch = [], [], [], []
for i in range(self.num_envs):
obs, r, done, info = self.envs[i].step(actions[i])
if not np.isscalar(r) or not np.isreal(r) or not np.isfinite(r):
raise ValueError(
"Reward should be finite scalar, got {} ({})".format(
r, type(r)))
if type(info) is not dict:
raise ValueError("Info should be a dict, got {} ({})".format(
info, type(info)))
obs_batch.append(obs)
rew_batch.append(r)
done_batch.append(done)
info_batch.append(info)
return obs_batch, rew_batch, done_batch, info_batch
@override(VectorEnv)
def get_unwrapped(self):
return self.envs
-31
View File
@@ -1,31 +0,0 @@
from ray.rllib.evaluation.episode import MultiAgentEpisode
from ray.rllib.evaluation.rollout_worker import RolloutWorker
from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator
from ray.rllib.evaluation.interface import EvaluatorInterface
from ray.rllib.evaluation.policy_graph import PolicyGraph
from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph
from ray.rllib.evaluation.torch_policy_graph import TorchPolicyGraph
from ray.rllib.evaluation.sample_batch import SampleBatch, MultiAgentBatch
from ray.rllib.evaluation.sample_batch_builder import (
SampleBatchBuilder, MultiAgentSampleBatchBuilder)
from ray.rllib.evaluation.sampler import SyncSampler, AsyncSampler
from ray.rllib.evaluation.postprocessing import compute_advantages
from ray.rllib.evaluation.metrics import collect_metrics
__all__ = [
"EvaluatorInterface",
"RolloutWorker",
"PolicyGraph",
"TFPolicyGraph",
"TorchPolicyGraph",
"SampleBatch",
"MultiAgentBatch",
"SampleBatchBuilder",
"MultiAgentSampleBatchBuilder",
"SyncSampler",
"AsyncSampler",
"compute_advantages",
"collect_metrics",
"MultiAgentEpisode",
"PolicyEvaluator",
]
-201
View File
@@ -1,201 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from collections import defaultdict
import random
import numpy as np
from ray.rllib.env.base_env import _DUMMY_AGENT_ID
from ray.rllib.utils.annotations import DeveloperAPI
@DeveloperAPI
class MultiAgentEpisode(object):
"""Tracks the current state of a (possibly multi-agent) episode.
Attributes:
new_batch_builder (func): Create a new MultiAgentSampleBatchBuilder.
add_extra_batch (func): Return a built MultiAgentBatch to the sampler.
batch_builder (obj): Batch builder for the current episode.
total_reward (float): Summed reward across all agents in this episode.
length (int): Length of this episode.
episode_id (int): Unique id identifying this trajectory.
agent_rewards (dict): Summed rewards broken down by agent.
custom_metrics (dict): Dict where the you can add custom metrics.
user_data (dict): Dict that you can use for temporary storage.
Use case 1: Model-based rollouts in multi-agent:
A custom compute_actions() function in a policy can inspect the
current episode state and perform a number of rollouts based on the
policies and state of other agents in the environment.
Use case 2: Returning extra rollouts data.
The model rollouts can be returned back to the sampler by calling:
>>> batch = episode.new_batch_builder()
>>> for each transition:
batch.add_values(...) # see sampler for usage
>>> episode.extra_batches.add(batch.build_and_reset())
"""
def __init__(self, policies, policy_mapping_fn, batch_builder_factory,
extra_batch_callback):
self.new_batch_builder = batch_builder_factory
self.add_extra_batch = extra_batch_callback
self.batch_builder = batch_builder_factory()
self.total_reward = 0.0
self.length = 0
self.episode_id = random.randrange(2e9)
self.agent_rewards = defaultdict(float)
self.custom_metrics = {}
self.user_data = {}
self._policies = policies
self._policy_mapping_fn = policy_mapping_fn
self._next_agent_index = 0
self._agent_to_index = {}
self._agent_to_policy = {}
self._agent_to_rnn_state = {}
self._agent_to_last_obs = {}
self._agent_to_last_raw_obs = {}
self._agent_to_last_info = {}
self._agent_to_last_action = {}
self._agent_to_last_pi_info = {}
self._agent_to_prev_action = {}
self._agent_reward_history = defaultdict(list)
@DeveloperAPI
def soft_reset(self):
"""Clears rewards and metrics, but retains RNN and other state.
This is used to carry state across multiple logical episodes in the
same env (i.e., if `soft_horizon` is set).
"""
self.length = 0
self.episode_id = random.randrange(2e9)
self.total_reward = 0.0
self.agent_rewards = defaultdict(float)
self._agent_reward_history = defaultdict(list)
@DeveloperAPI
def policy_for(self, agent_id=_DUMMY_AGENT_ID):
"""Returns the policy for the specified agent.
If the agent is new, the policy mapping fn will be called to bind the
agent to a policy for the duration of the episode.
"""
if agent_id not in self._agent_to_policy:
self._agent_to_policy[agent_id] = self._policy_mapping_fn(agent_id)
return self._agent_to_policy[agent_id]
@DeveloperAPI
def last_observation_for(self, agent_id=_DUMMY_AGENT_ID):
"""Returns the last observation for the specified agent."""
return self._agent_to_last_obs.get(agent_id)
@DeveloperAPI
def last_raw_obs_for(self, agent_id=_DUMMY_AGENT_ID):
"""Returns the last un-preprocessed obs for the specified agent."""
return self._agent_to_last_raw_obs.get(agent_id)
@DeveloperAPI
def last_info_for(self, agent_id=_DUMMY_AGENT_ID):
"""Returns the last info for the specified agent."""
return self._agent_to_last_info.get(agent_id)
@DeveloperAPI
def last_action_for(self, agent_id=_DUMMY_AGENT_ID):
"""Returns the last action for the specified agent, or zeros."""
if agent_id in self._agent_to_last_action:
return _flatten_action(self._agent_to_last_action[agent_id])
else:
policy = self._policies[self.policy_for(agent_id)]
flat = _flatten_action(policy.action_space.sample())
return np.zeros_like(flat)
@DeveloperAPI
def prev_action_for(self, agent_id=_DUMMY_AGENT_ID):
"""Returns the previous action for the specified agent."""
if agent_id in self._agent_to_prev_action:
return _flatten_action(self._agent_to_prev_action[agent_id])
else:
# We're at t=0, so return all zeros.
return np.zeros_like(self.last_action_for(agent_id))
@DeveloperAPI
def prev_reward_for(self, agent_id=_DUMMY_AGENT_ID):
"""Returns the previous reward for the specified agent."""
history = self._agent_reward_history[agent_id]
if len(history) >= 2:
return history[-2]
else:
# We're at t=0, so there is no previous reward, just return zero.
return 0.0
@DeveloperAPI
def rnn_state_for(self, agent_id=_DUMMY_AGENT_ID):
"""Returns the last RNN state for the specified agent."""
if agent_id not in self._agent_to_rnn_state:
policy = self._policies[self.policy_for(agent_id)]
self._agent_to_rnn_state[agent_id] = policy.get_initial_state()
return self._agent_to_rnn_state[agent_id]
@DeveloperAPI
def last_pi_info_for(self, agent_id=_DUMMY_AGENT_ID):
"""Returns the last info object for the specified agent."""
return self._agent_to_last_pi_info[agent_id]
def _add_agent_rewards(self, reward_dict):
for agent_id, reward in reward_dict.items():
if reward is not None:
self.agent_rewards[agent_id,
self.policy_for(agent_id)] += reward
self.total_reward += reward
self._agent_reward_history[agent_id].append(reward)
def _set_rnn_state(self, agent_id, rnn_state):
self._agent_to_rnn_state[agent_id] = rnn_state
def _set_last_observation(self, agent_id, obs):
self._agent_to_last_obs[agent_id] = obs
def _set_last_raw_obs(self, agent_id, obs):
self._agent_to_last_raw_obs[agent_id] = obs
def _set_last_info(self, agent_id, info):
self._agent_to_last_info[agent_id] = info
def _set_last_action(self, agent_id, action):
if agent_id in self._agent_to_last_action:
self._agent_to_prev_action[agent_id] = \
self._agent_to_last_action[agent_id]
self._agent_to_last_action[agent_id] = action
def _set_last_pi_info(self, agent_id, pi_info):
self._agent_to_last_pi_info[agent_id] = pi_info
def _agent_index(self, agent_id):
if agent_id not in self._agent_to_index:
self._agent_to_index[agent_id] = self._next_agent_index
self._next_agent_index += 1
return self._agent_to_index[agent_id]
def _flatten_action(action):
# Concatenate tuple actions
if isinstance(action, list) or isinstance(action, tuple):
expanded = []
for a in action:
expanded.append(np.reshape(a, [-1]))
action = np.concatenate(expanded, axis=0).flatten()
return action
-128
View File
@@ -1,128 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
from ray.rllib.utils.annotations import DeveloperAPI
@DeveloperAPI
class EvaluatorInterface(object):
"""This is the interface between policy optimizers and policy evaluation.
See also: RolloutWorker
"""
@DeveloperAPI
def sample(self):
"""Returns a batch of experience sampled from this evaluator.
This method must be implemented by subclasses.
Returns:
SampleBatch|MultiAgentBatch: A columnar batch of experiences
(e.g., tensors), or a multi-agent batch.
Examples:
>>> print(ev.sample())
SampleBatch({"obs": [1, 2, 3], "action": [0, 1, 0], ...})
"""
raise NotImplementedError
@DeveloperAPI
def learn_on_batch(self, samples):
"""Update policies based on the given batch.
This is the equivalent to apply_gradients(compute_gradients(samples)),
but can be optimized to avoid pulling gradients into CPU memory.
Either this or the combination of compute/apply grads must be
implemented by subclasses.
Returns:
info: dictionary of extra metadata from compute_gradients().
Examples:
>>> batch = ev.sample()
>>> ev.learn_on_batch(samples)
"""
grads, info = self.compute_gradients(samples)
self.apply_gradients(grads)
return info
@DeveloperAPI
def compute_gradients(self, samples):
"""Returns a gradient computed w.r.t the specified samples.
Either this or learn_on_batch() must be implemented by subclasses.
Returns:
(grads, info): A list of gradients that can be applied on a
compatible evaluator. In the multi-agent case, returns a dict
of gradients keyed by policy ids. An info dictionary of
extra metadata is also returned.
Examples:
>>> batch = ev.sample()
>>> grads, info = ev2.compute_gradients(samples)
"""
raise NotImplementedError
@DeveloperAPI
def apply_gradients(self, grads):
"""Applies the given gradients to this evaluator's weights.
Either this or learn_on_batch() must be implemented by subclasses.
Examples:
>>> samples = ev1.sample()
>>> grads, info = ev2.compute_gradients(samples)
>>> ev1.apply_gradients(grads)
"""
raise NotImplementedError
@DeveloperAPI
def get_weights(self):
"""Returns the model weights of this Evaluator.
This method must be implemented by subclasses.
Returns:
object: weights that can be set on a compatible evaluator.
info: dictionary of extra metadata.
Examples:
>>> weights = ev1.get_weights()
"""
raise NotImplementedError
@DeveloperAPI
def set_weights(self, weights):
"""Sets the model weights of this Evaluator.
This method must be implemented by subclasses.
Examples:
>>> weights = ev1.get_weights()
>>> ev2.set_weights(weights)
"""
raise NotImplementedError
@DeveloperAPI
def get_host(self):
"""Returns the hostname of the process running this evaluator."""
return os.uname()[1]
@DeveloperAPI
def apply(self, func, *args):
"""Apply the given function to this evaluator instance."""
return func(self, *args)
-173
View File
@@ -1,173 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import logging
import numpy as np
import collections
import ray
from ray.rllib.evaluation.rollout_metrics import RolloutMetrics
from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID
from ray.rllib.offline.off_policy_estimator import OffPolicyEstimate
from ray.rllib.policy.policy import LEARNER_STATS_KEY
from ray.rllib.utils.annotations import DeveloperAPI
from ray.rllib.utils.memory import ray_get_and_free
logger = logging.getLogger(__name__)
@DeveloperAPI
def get_learner_stats(grad_info):
"""Return optimization stats reported from the policy.
Example:
>>> grad_info = evaluator.learn_on_batch(samples)
>>> print(get_stats(grad_info))
{"vf_loss": ..., "policy_loss": ...}
"""
if LEARNER_STATS_KEY in grad_info:
return grad_info[LEARNER_STATS_KEY]
multiagent_stats = {}
for k, v in grad_info.items():
if type(v) is dict:
if LEARNER_STATS_KEY in v:
multiagent_stats[k] = v[LEARNER_STATS_KEY]
return multiagent_stats
@DeveloperAPI
def collect_metrics(local_worker=None,
remote_workers=[],
to_be_collected=[],
timeout_seconds=180):
"""Gathers episode metrics from RolloutWorker instances."""
episodes, to_be_collected = collect_episodes(
local_worker,
remote_workers,
to_be_collected,
timeout_seconds=timeout_seconds)
metrics = summarize_episodes(episodes, episodes)
return metrics
@DeveloperAPI
def collect_episodes(local_worker=None,
remote_workers=[],
to_be_collected=[],
timeout_seconds=180):
"""Gathers new episodes metrics tuples from the given evaluators."""
if remote_workers:
pending = [
a.apply.remote(lambda ev: ev.get_metrics()) for a in remote_workers
] + to_be_collected
collected, to_be_collected = ray.wait(
pending, num_returns=len(pending), timeout=timeout_seconds * 1.0)
if pending and len(collected) == 0:
logger.warning(
"WARNING: collected no metrics in {} seconds".format(
timeout_seconds))
metric_lists = ray_get_and_free(collected)
else:
metric_lists = []
if local_worker:
metric_lists.append(local_worker.get_metrics())
episodes = []
for metrics in metric_lists:
episodes.extend(metrics)
return episodes, to_be_collected
@DeveloperAPI
def summarize_episodes(episodes, new_episodes):
"""Summarizes a set of episode metrics tuples.
Arguments:
episodes: smoothed set of episodes including historical ones
new_episodes: just the new episodes in this iteration
"""
episodes, estimates = _partition(episodes)
new_episodes, _ = _partition(new_episodes)
episode_rewards = []
episode_lengths = []
policy_rewards = collections.defaultdict(list)
custom_metrics = collections.defaultdict(list)
perf_stats = collections.defaultdict(list)
for episode in episodes:
episode_lengths.append(episode.episode_length)
episode_rewards.append(episode.episode_reward)
for k, v in episode.custom_metrics.items():
custom_metrics[k].append(v)
for k, v in episode.perf_stats.items():
perf_stats[k].append(v)
for (_, policy_id), reward in episode.agent_rewards.items():
if policy_id != DEFAULT_POLICY_ID:
policy_rewards[policy_id].append(reward)
if episode_rewards:
min_reward = min(episode_rewards)
max_reward = max(episode_rewards)
else:
min_reward = float("nan")
max_reward = float("nan")
avg_reward = np.mean(episode_rewards)
avg_length = np.mean(episode_lengths)
for policy_id, rewards in policy_rewards.copy().items():
policy_rewards[policy_id] = np.mean(rewards)
for k, v_list in custom_metrics.copy().items():
custom_metrics[k + "_mean"] = np.mean(v_list)
filt = [v for v in v_list if not np.isnan(v)]
if filt:
custom_metrics[k + "_min"] = np.min(filt)
custom_metrics[k + "_max"] = np.max(filt)
else:
custom_metrics[k + "_min"] = float("nan")
custom_metrics[k + "_max"] = float("nan")
del custom_metrics[k]
for k, v_list in perf_stats.copy().items():
perf_stats[k] = np.mean(v_list)
estimators = collections.defaultdict(lambda: collections.defaultdict(list))
for e in estimates:
acc = estimators[e.estimator_name]
for k, v in e.metrics.items():
acc[k].append(v)
for name, metrics in estimators.items():
for k, v_list in metrics.items():
metrics[k] = np.mean(v_list)
estimators[name] = dict(metrics)
return dict(
episode_reward_max=max_reward,
episode_reward_min=min_reward,
episode_reward_mean=avg_reward,
episode_len_mean=avg_length,
episodes_this_iter=len(new_episodes),
policy_reward_mean=dict(policy_rewards),
custom_metrics=dict(custom_metrics),
sampler_perf=dict(perf_stats),
off_policy_estimator=dict(estimators))
def _partition(episodes):
"""Divides metrics data into true rollouts vs off-policy estimates."""
rollouts, estimates = [], []
for e in episodes:
if isinstance(e, RolloutMetrics):
rollouts.append(e)
elif isinstance(e, OffPolicyEstimate):
estimates.append(e)
else:
raise ValueError("Unknown metric type: {}".format(e))
return rollouts, estimates
@@ -1,9 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ray.rllib.utils import renamed_class
from ray.rllib.evaluation import RolloutWorker
PolicyEvaluator = renamed_class(
RolloutWorker, old_name="rllib.evaluation.PolicyEvaluator")
@@ -1,8 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from ray.rllib.policy.policy import Policy
from ray.rllib.utils import renamed_class
PolicyGraph = renamed_class(Policy, old_name="PolicyGraph")
@@ -1,70 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import scipy.signal
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.utils.annotations import DeveloperAPI
def discount(x, gamma):
return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
class Postprocessing(object):
"""Constant definitions for postprocessing."""
ADVANTAGES = "advantages"
VALUE_TARGETS = "value_targets"
@DeveloperAPI
def compute_advantages(rollout, last_r, gamma=0.9, lambda_=1.0, use_gae=True):
"""Given a rollout, compute its value targets and the advantage.
Args:
rollout (SampleBatch): SampleBatch of a single trajectory
last_r (float): Value estimation for last observation
gamma (float): Discount factor.
lambda_ (float): Parameter for GAE
use_gae (bool): Using Generalized Advantage Estamation
Returns:
SampleBatch (SampleBatch): Object with experience from rollout and
processed rewards.
"""
traj = {}
trajsize = len(rollout[SampleBatch.ACTIONS])
for key in rollout:
traj[key] = np.stack(rollout[key])
if use_gae:
assert SampleBatch.VF_PREDS in rollout, "Values not found!"
vpred_t = np.concatenate(
[rollout[SampleBatch.VF_PREDS],
np.array([last_r])])
delta_t = (
traj[SampleBatch.REWARDS] + gamma * vpred_t[1:] - vpred_t[:-1])
# This formula for the advantage comes
# "Generalized Advantage Estimation": https://arxiv.org/abs/1506.02438
traj[Postprocessing.ADVANTAGES] = discount(delta_t, gamma * lambda_)
traj[Postprocessing.VALUE_TARGETS] = (
traj[Postprocessing.ADVANTAGES] +
traj[SampleBatch.VF_PREDS]).copy().astype(np.float32)
else:
rewards_plus_v = np.concatenate(
[rollout[SampleBatch.REWARDS],
np.array([last_r])])
traj[Postprocessing.ADVANTAGES] = discount(rewards_plus_v, gamma)[:-1]
# TODO(ekl): support using a critic without GAE
traj[Postprocessing.VALUE_TARGETS] = np.zeros_like(
traj[Postprocessing.ADVANTAGES])
traj[Postprocessing.ADVANTAGES] = traj[
Postprocessing.ADVANTAGES].copy().astype(np.float32)
assert all(val.shape[0] == trajsize for val in traj.values()), \
"Rollout stacked incorrectly!"
return SampleBatch(traj)
@@ -1,11 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import collections
# Define this in its own file, see #5125
RolloutMetrics = collections.namedtuple("RolloutMetrics", [
"episode_length", "episode_reward", "agent_rewards", "custom_metrics",
"perf_stats"
])
@@ -1,819 +0,0 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import random
import numpy as np
import gym
import logging
import pickle
import ray
from ray.rllib.env.atari_wrappers import wrap_deepmind, is_atari
from ray.rllib.env.base_env import BaseEnv
from ray.rllib.env.env_context import EnvContext
from ray.rllib.env.external_env import ExternalEnv
from ray.rllib.env.multi_agent_env import MultiAgentEnv
from ray.rllib.env.external_multi_agent_env import ExternalMultiAgentEnv
from ray.rllib.env.vector_env import VectorEnv
from ray.rllib.evaluation.interface import EvaluatorInterface
from ray.rllib.evaluation.sampler import AsyncSampler, SyncSampler
from ray.rllib.policy.sample_batch import MultiAgentBatch, DEFAULT_POLICY_ID
from ray.rllib.policy.policy import Policy
from ray.rllib.policy.tf_policy import TFPolicy
from ray.rllib.offline import NoopOutput, IOContext, OutputWriter, InputReader
from ray.rllib.offline.is_estimator import ImportanceSamplingEstimator
from ray.rllib.offline.wis_estimator import WeightedImportanceSamplingEstimator
from ray.rllib.models import ModelCatalog
from ray.rllib.models.preprocessors import NoPreprocessor
from ray.rllib.utils import merge_dicts
from ray.rllib.utils.annotations import override, DeveloperAPI
from ray.rllib.utils.debug import disable_log_once_globally, log_once, \
summarize, enable_periodic_logging
from ray.rllib.utils.filter import get_filter
from ray.rllib.utils.tf_run_builder import TFRunBuilder
from ray.rllib.utils import try_import_tf
tf = try_import_tf()
logger = logging.getLogger(__name__)
# Handle to the current rollout worker, which will be set to the most recently
# created RolloutWorker in this process. This can be helpful to access in
# custom env or policy classes for debugging or advanced use cases.
_global_worker = None
@DeveloperAPI
def get_global_worker():
"""Returns a handle to the active rollout worker in this process."""
global _global_worker
return _global_worker
@DeveloperAPI
class RolloutWorker(EvaluatorInterface):
"""Common experience collection class.
This class wraps a policy instance and an environment class to
collect experiences from the environment. You can create many replicas of
this class as Ray actors to scale RL training.
This class supports vectorized and multi-agent policy evaluation (e.g.,
VectorEnv, MultiAgentEnv, etc.)
Examples:
>>> # Create a rollout worker and using it to collect experiences.
>>> worker = RolloutWorker(
... env_creator=lambda _: gym.make("CartPole-v0"),
... policy=PGTFPolicy)
>>> print(worker.sample())
SampleBatch({
"obs": [[...]], "actions": [[...]], "rewards": [[...]],
"dones": [[...]], "new_obs": [[...]]})
>>> # Creating a multi-agent rollout worker
>>> worker = RolloutWorker(
... env_creator=lambda _: MultiAgentTrafficGrid(num_cars=25),
... policies={
... # Use an ensemble of two policies for car agents
... "car_policy1":
... (PGTFPolicy, Box(...), Discrete(...), {"gamma": 0.99}),
... "car_policy2":
... (PGTFPolicy, Box(...), Discrete(...), {"gamma": 0.95}),
... # Use a single shared policy for all traffic lights
... "traffic_light_policy":
... (PGTFPolicy, Box(...), Discrete(...), {}),
... },
... policy_mapping_fn=lambda agent_id:
... random.choice(["car_policy1", "car_policy2"])
... if agent_id.startswith("car_") else "traffic_light_policy")
>>> print(worker.sample())
MultiAgentBatch({
"car_policy1": SampleBatch(...),
"car_policy2": SampleBatch(...),
"traffic_light_policy": SampleBatch(...)})
"""
@DeveloperAPI
@classmethod
def as_remote(cls, num_cpus=None, num_gpus=None, resources=None):
return ray.remote(
num_cpus=num_cpus, num_gpus=num_gpus, resources=resources)(cls)
@DeveloperAPI
def __init__(self,
env_creator,
policy,
policy_mapping_fn=None,
policies_to_train=None,
tf_session_creator=None,
batch_steps=100,
batch_mode="truncate_episodes",
episode_horizon=None,
preprocessor_pref="deepmind",
sample_async=False,
compress_observations=False,
num_envs=1,
observation_filter="NoFilter",
clip_rewards=None,
clip_actions=True,
env_config=None,
model_config=None,
policy_config=None,
worker_index=0,
monitor_path=None,
log_dir=None,
log_level=None,
callbacks=None,
input_creator=lambda ioctx: ioctx.default_sampler_input(),
input_evaluation=frozenset([]),
output_creator=lambda ioctx: NoopOutput(),
remote_worker_envs=False,
remote_env_batch_wait_ms=0,
soft_horizon=False,
no_done_at_end=False,
seed=None,
_fake_sampler=False):
"""Initialize a rollout worker.
Arguments:
env_creator (func): Function that returns a gym.Env given an
EnvContext wrapped configuration.
policy (class|dict): Either a class implementing
Policy, or a dictionary of policy id strings to
(Policy, obs_space, action_space, config) tuples. If a
dict is specified, then we are in multi-agent mode and a
policy_mapping_fn should also be set.
policy_mapping_fn (func): A function that maps agent ids to
policy ids in multi-agent mode. This function will be called
each time a new agent appears in an episode, to bind that agent
to a policy for the duration of the episode.
policies_to_train (list): Optional whitelist of policies to train,
or None for all policies.
tf_session_creator (func): A function that returns a TF session.
This is optional and only useful with TFPolicy.
batch_steps (int): The target number of env transitions to include
in each sample batch returned from this worker.
batch_mode (str): One of the following batch modes:
"truncate_episodes": Each call to sample() will return a batch
of at most `batch_steps * num_envs` in size. The batch will
be exactly `batch_steps * num_envs` in size if
postprocessing does not change batch sizes. Episodes may be
truncated in order to meet this size requirement.
"complete_episodes": Each call to sample() will return a batch
of at least `batch_steps * num_envs` in size. Episodes will
not be truncated, but multiple episodes may be packed
within one batch to meet the batch size. Note that when
`num_envs > 1`, episode steps will be buffered until the
episode completes, and hence batches may contain
significant amounts of off-policy data.
episode_horizon (int): Whether to stop episodes at this horizon.
preprocessor_pref (str): Whether to prefer RLlib preprocessors
("rllib") or deepmind ("deepmind") when applicable.
sample_async (bool): Whether to compute samples asynchronously in
the background, which improves throughput but can cause samples
to be slightly off-policy.
compress_observations (bool): If true, compress the observations.
They can be decompressed with rllib/utils/compression.
num_envs (int): If more than one, will create multiple envs
and vectorize the computation of actions. This has no effect if
if the env already implements VectorEnv.
observation_filter (str): Name of observation filter to use.
clip_rewards (bool): Whether to clip rewards to [-1, 1] prior to
experience postprocessing. Setting to None means clip for Atari
only.
clip_actions (bool): Whether to clip action values to the range
specified by the policy action space.
env_config (dict): Config to pass to the env creator.
model_config (dict): Config to use when creating the policy model.
policy_config (dict): Config to pass to the policy. In the
multi-agent case, this config will be merged with the
per-policy configs specified by `policy`.
worker_index (int): For remote workers, this should be set to a
non-zero and unique value. This index is passed to created envs
through EnvContext so that envs can be configured per worker.
monitor_path (str): Write out episode stats and videos to this
directory if specified.
log_dir (str): Directory where logs can be placed.
log_level (str): Set the root log level on creation.
callbacks (dict): Dict of custom debug callbacks.
input_creator (func): Function that returns an InputReader object
for loading previous generated experiences.
input_evaluation (list): How to evaluate the policy performance.
This only makes sense to set when the input is reading offline
data. The possible values include:
- "is": the step-wise importance sampling estimator.
- "wis": the weighted step-wise is estimator.
- "simulation": run the environment in the background, but
use this data for evaluation only and never for learning.
output_creator (func): Function that returns an OutputWriter object
for saving generated experiences.
remote_worker_envs (bool): If using num_envs > 1, whether to create
those new envs in remote processes instead of in the current
process. This adds overheads, but can make sense if your envs
remote_env_batch_wait_ms (float): Timeout that remote workers
are waiting when polling environments. 0 (continue when at
least one env is ready) is a reasonable default, but optimal
value could be obtained by measuring your environment
step / reset and model inference perf.
soft_horizon (bool): Calculate rewards but don't reset the
environment when the horizon is hit.
no_done_at_end (bool): Ignore the done=True at the end of the
episode and instead record done=False.
seed (int): Set the seed of both np and tf to this value to
to ensure each remote worker has unique exploration behavior.
_fake_sampler (bool): Use a fake (inf speed) sampler for testing.
"""
global _global_worker
_global_worker = self
if log_level:
logging.getLogger("ray.rllib").setLevel(log_level)
if worker_index > 1:
disable_log_once_globally() # only need 1 worker to log
elif log_level == "DEBUG":
enable_periodic_logging()
env_context = EnvContext(env_config or {}, worker_index)
policy_config = policy_config or {}
self.policy_config = policy_config
self.callbacks = callbacks or {}
self.worker_index = worker_index
model_config = model_config or {}
policy_mapping_fn = (policy_mapping_fn
or (lambda agent_id: DEFAULT_POLICY_ID))
if not callable(policy_mapping_fn):
raise ValueError(
"Policy mapping function not callable. If you're using Tune, "
"make sure to escape the function with tune.function() "
"to prevent it from being evaluated as an expression.")
self.env_creator = env_creator
self.sample_batch_size = batch_steps * num_envs
self.batch_mode = batch_mode
self.compress_observations = compress_observations
self.preprocessing_enabled = True
self.last_batch = None
self._fake_sampler = _fake_sampler
self.env = _validate_env(env_creator(env_context))
if isinstance(self.env, MultiAgentEnv) or \
isinstance(self.env, BaseEnv):
def wrap(env):
return env # we can't auto-wrap these env types
elif is_atari(self.env) and \
not model_config.get("custom_preprocessor") and \
preprocessor_pref == "deepmind":
# Deepmind wrappers already handle all preprocessing
self.preprocessing_enabled = False
if clip_rewards is None:
clip_rewards = True
def wrap(env):
env = wrap_deepmind(
env,
dim=model_config.get("dim"),
framestack=model_config.get("framestack"))
if monitor_path:
env = gym.wrappers.Monitor(env, monitor_path, resume=True)
return env
else:
def wrap(env):
if monitor_path:
env = gym.wrappers.Monitor(env, monitor_path, resume=True)
return env
self.env = wrap(self.env)
def make_env(vector_index):
return wrap(
env_creator(
env_context.copy_with_overrides(
vector_index=vector_index, remote=remote_worker_envs)))
self.tf_sess = None
policy_dict = _validate_and_canonicalize(policy, self.env)
self.policies_to_train = policies_to_train or list(policy_dict.keys())
# set numpy and python seed
if seed is not None:
np.random.seed(seed)
random.seed(seed)
if not hasattr(self.env, "seed"):
raise ValueError("Env doesn't support env.seed(): {}".format(
self.env))
self.env.seed(seed)
try:
import torch
torch.manual_seed(seed)
except ImportError:
logger.info("Could not seed torch")
if _has_tensorflow_graph(policy_dict):
if (ray.is_initialized()
and ray.worker._mode() != ray.worker.LOCAL_MODE
and not ray.get_gpu_ids()):
logger.debug("Creating policy evaluation worker {}".format(
worker_index) +
" on CPU (please ignore any CUDA init errors)")
if not tf:
raise ImportError("Could not import tensorflow")
with tf.Graph().as_default():
if tf_session_creator:
self.tf_sess = tf_session_creator()
else:
self.tf_sess = tf.Session(
config=tf.ConfigProto(
gpu_options=tf.GPUOptions(allow_growth=True)))
with self.tf_sess.as_default():
# set graph-level seed
if seed is not None:
tf.set_random_seed(seed)
self.policy_map, self.preprocessors = \
self._build_policy_map(policy_dict, policy_config)
else:
self.policy_map, self.preprocessors = self._build_policy_map(
policy_dict, policy_config)
self.multiagent = set(self.policy_map.keys()) != {DEFAULT_POLICY_ID}
if self.multiagent:
if not ((isinstance(self.env, MultiAgentEnv)
or isinstance(self.env, ExternalMultiAgentEnv))
or isinstance(self.env, BaseEnv)):
raise ValueError(
"Have multiple policies {}, but the env ".format(
self.policy_map) +
"{} is not a subclass of BaseEnv, MultiAgentEnv or "
"ExternalMultiAgentEnv?".format(self.env))
self.filters = {
policy_id: get_filter(observation_filter,
policy.observation_space.shape)
for (policy_id, policy) in self.policy_map.items()
}
if self.worker_index == 0:
logger.info("Built filter map: {}".format(self.filters))
# Always use vector env for consistency even if num_envs = 1
self.async_env = BaseEnv.to_base_env(
self.env,
make_env=make_env,
num_envs=num_envs,
remote_envs=remote_worker_envs,
remote_env_batch_wait_ms=remote_env_batch_wait_ms)
self.num_envs = num_envs
if self.batch_mode == "truncate_episodes":
unroll_length = batch_steps
pack_episodes = True
elif self.batch_mode == "complete_episodes":
unroll_length = float("inf") # never cut episodes
pack_episodes = False # sampler will return 1 episode per poll
else:
raise ValueError("Unsupported batch mode: {}".format(
self.batch_mode))
self.io_context = IOContext(log_dir, policy_config, worker_index, self)
self.reward_estimators = []
for method in input_evaluation:
if method == "simulation":
logger.warning(
"Requested 'simulation' input evaluation method: "
"will discard all sampler outputs and keep only metrics.")
sample_async = True
elif method == "is":
ise = ImportanceSamplingEstimator.create(self.io_context)
self.reward_estimators.append(ise)
elif method == "wis":
wise = WeightedImportanceSamplingEstimator.create(
self.io_context)
self.reward_estimators.append(wise)
else:
raise ValueError(
"Unknown evaluation method: {}".format(method))
if sample_async:
self.sampler = AsyncSampler(
self.async_env,
self.policy_map,
policy_mapping_fn,
self.preprocessors,
self.filters,
clip_rewards,
unroll_length,
self.callbacks,
horizon=episode_horizon,
pack=pack_episodes,
tf_sess=self.tf_sess,
clip_actions=clip_actions,
blackhole_outputs="simulation" in input_evaluation,
soft_horizon=soft_horizon,
no_done_at_end=no_done_at_end)
self.sampler.start()
else:
self.sampler = SyncSampler(
self.async_env,
self.policy_map,
policy_mapping_fn,
self.preprocessors,
self.filters,
clip_rewards,
unroll_length,
self.callbacks,
horizon=episode_horizon,
pack=pack_episodes,
tf_sess=self.tf_sess,
clip_actions=clip_actions,
soft_horizon=soft_horizon,
no_done_at_end=no_done_at_end)
self.input_reader = input_creator(self.io_context)
assert isinstance(self.input_reader, InputReader), self.input_reader
self.output_writer = output_creator(self.io_context)
assert isinstance(self.output_writer, OutputWriter), self.output_writer
logger.debug(
"Created rollout worker with env {} ({}), policies {}".format(
self.async_env, self.env, self.policy_map))
@override(EvaluatorInterface)
def sample(self):
"""Evaluate the current policies and return a batch of experiences.
Return:
SampleBatch|MultiAgentBatch from evaluating the current policies.
"""
if self._fake_sampler and self.last_batch is not None:
return self.last_batch
if log_once("sample_start"):
logger.info("Generating sample batch of size {}".format(
self.sample_batch_size))
batches = [self.input_reader.next()]
steps_so_far = batches[0].count
# In truncate_episodes mode, never pull more than 1 batch per env.
# This avoids over-running the target batch size.
if self.batch_mode == "truncate_episodes":
max_batches = self.num_envs
else:
max_batches = float("inf")
while steps_so_far < self.sample_batch_size and len(
batches) < max_batches:
batch = self.input_reader.next()
steps_so_far += batch.count
batches.append(batch)
batch = batches[0].concat_samples(batches)
if self.callbacks.get("on_sample_end"):
self.callbacks["on_sample_end"]({"worker": self, "samples": batch})
# Always do writes prior to compression for consistency and to allow
# for better compression inside the writer.
self.output_writer.write(batch)
# Do off-policy estimation if needed
if self.reward_estimators:
for sub_batch in batch.split_by_episode():
for estimator in self.reward_estimators:
estimator.process(sub_batch)
if log_once("sample_end"):
logger.info("Completed sample batch:\n\n{}\n".format(
summarize(batch)))
if self.compress_observations == "bulk":
batch.compress(bulk=True)
elif self.compress_observations:
batch.compress()
if self._fake_sampler:
self.last_batch = batch
return batch
@DeveloperAPI
@ray.method(num_return_vals=2)
def sample_with_count(self):
"""Same as sample() but returns the count as a separate future."""
batch = self.sample()
return batch, batch.count
@override(EvaluatorInterface)
def get_weights(self, policies=None):
if policies is None:
policies = self.policy_map.keys()
return {
pid: policy.get_weights()
for pid, policy in self.policy_map.items() if pid in policies
}
@override(EvaluatorInterface)
def set_weights(self, weights):
for pid, w in weights.items():
self.policy_map[pid].set_weights(w)
@override(EvaluatorInterface)
def compute_gradients(self, samples):
if log_once("compute_gradients"):
logger.info("Compute gradients on:\n\n{}\n".format(
summarize(samples)))
if isinstance(samples, MultiAgentBatch):
grad_out, info_out = {}, {}
if self.tf_sess is not None:
builder = TFRunBuilder(self.tf_sess, "compute_gradients")
for pid, batch in samples.policy_batches.items():
if pid not in self.policies_to_train:
continue
grad_out[pid], info_out[pid] = (
self.policy_map[pid]._build_compute_gradients(
builder, batch))
grad_out = {k: builder.get(v) for k, v in grad_out.items()}
info_out = {k: builder.get(v) for k, v in info_out.items()}
else:
for pid, batch in samples.policy_batches.items():
if pid not in self.policies_to_train:
continue
grad_out[pid], info_out[pid] = (
self.policy_map[pid].compute_gradients(batch))
else:
grad_out, info_out = (
self.policy_map[DEFAULT_POLICY_ID].compute_gradients(samples))
info_out["batch_count"] = samples.count
if log_once("grad_out"):
logger.info("Compute grad info:\n\n{}\n".format(
summarize(info_out)))
return grad_out, info_out
@override(EvaluatorInterface)
def apply_gradients(self, grads):
if log_once("apply_gradients"):
logger.info("Apply gradients:\n\n{}\n".format(summarize(grads)))
if isinstance(grads, dict):
if self.tf_sess is not None:
builder = TFRunBuilder(self.tf_sess, "apply_gradients")
outputs = {
pid: self.policy_map[pid]._build_apply_gradients(
builder, grad)
for pid, grad in grads.items()
}
return {k: builder.get(v) for k, v in outputs.items()}
else:
return {
pid: self.policy_map[pid].apply_gradients(g)
for pid, g in grads.items()
}
else:
return self.policy_map[DEFAULT_POLICY_ID].apply_gradients(grads)
@override(EvaluatorInterface)
def learn_on_batch(self, samples):
if log_once("learn_on_batch"):
logger.info(
"Training on concatenated sample batches:\n\n{}\n".format(
summarize(samples)))
if isinstance(samples, MultiAgentBatch):
info_out = {}
to_fetch = {}
if self.tf_sess is not None:
builder = TFRunBuilder(self.tf_sess, "learn_on_batch")
else:
builder = None
for pid, batch in samples.policy_batches.items():
if pid not in self.policies_to_train:
continue
policy = self.policy_map[pid]
if builder and hasattr(policy, "_build_learn_on_batch"):
to_fetch[pid] = policy._build_learn_on_batch(
builder, batch)
else:
info_out[pid] = policy.learn_on_batch(batch)
info_out.update({k: builder.get(v) for k, v in to_fetch.items()})
else:
info_out = self.policy_map[DEFAULT_POLICY_ID].learn_on_batch(
samples)
if log_once("learn_out"):
logger.info("Training output:\n\n{}\n".format(summarize(info_out)))
return info_out
@DeveloperAPI
def get_metrics(self):
"""Returns a list of new RolloutMetric objects from evaluation."""
out = self.sampler.get_metrics()
for m in self.reward_estimators:
out.extend(m.get_metrics())
return out
@DeveloperAPI
def foreach_env(self, func):
"""Apply the given function to each underlying env instance."""
envs = self.async_env.get_unwrapped()
if not envs:
return [func(self.async_env)]
else:
return [func(e) for e in envs]
@DeveloperAPI
def get_policy(self, policy_id=DEFAULT_POLICY_ID):
"""Return policy for the specified id, or None.
Arguments:
policy_id (str): id of policy to return.
"""
return self.policy_map.get(policy_id)
@DeveloperAPI
def for_policy(self, func, policy_id=DEFAULT_POLICY_ID):
"""Apply the given function to the specified policy."""
return func(self.policy_map[policy_id])
@DeveloperAPI
def foreach_policy(self, func):
"""Apply the given function to each (policy, policy_id) tuple."""
return [func(policy, pid) for pid, policy in self.policy_map.items()]
@DeveloperAPI
def foreach_trainable_policy(self, func):
"""Apply the given function to each (policy, policy_id) tuple.
This only applies func to policies in `self.policies_to_train`."""
return [
func(policy, pid) for pid, policy in self.policy_map.items()
if pid in self.policies_to_train
]
@DeveloperAPI
def sync_filters(self, new_filters):
"""Changes self's filter to given and rebases any accumulated delta.
Args:
new_filters (dict): Filters with new state to update local copy.
"""
assert all(k in new_filters for k in self.filters)
for k in self.filters:
self.filters[k].sync(new_filters[k])
@DeveloperAPI
def get_filters(self, flush_after=False):
"""Returns a snapshot of filters.
Args:
flush_after (bool): Clears the filter buffer state.
Returns:
return_filters (dict): Dict for serializable filters
"""
return_filters = {}
for k, f in self.filters.items():
return_filters[k] = f.as_serializable()
if flush_after:
f.clear_buffer()
return return_filters
@DeveloperAPI
def save(self):
filters = self.get_filters(flush_after=True)
state = {
pid: self.policy_map[pid].get_state()
for pid in self.policy_map
}
return pickle.dumps({"filters": filters, "state": state})
@DeveloperAPI
def restore(self, objs):
objs = pickle.loads(objs)
self.sync_filters(objs["filters"])
for pid, state in objs["state"].items():
self.policy_map[pid].set_state(state)
@DeveloperAPI
def set_global_vars(self, global_vars):
self.foreach_policy(lambda p, _: p.on_global_var_update(global_vars))
@DeveloperAPI
def export_policy_model(self, export_dir, policy_id=DEFAULT_POLICY_ID):
self.policy_map[policy_id].export_model(export_dir)
@DeveloperAPI
def export_policy_checkpoint(self,
export_dir,
filename_prefix="model",
policy_id=DEFAULT_POLICY_ID):
self.policy_map[policy_id].export_checkpoint(export_dir,
filename_prefix)
@DeveloperAPI
def stop(self):
self.async_env.stop()
def _build_policy_map(self, policy_dict, policy_config):
policy_map = {}
preprocessors = {}
for name, (cls, obs_space, act_space,
conf) in sorted(policy_dict.items()):
logger.debug("Creating policy for {}".format(name))
merged_conf = merge_dicts(policy_config, conf)
if self.preprocessing_enabled:
preprocessor = ModelCatalog.get_preprocessor_for_space(
obs_space, merged_conf.get("model"))
preprocessors[name] = preprocessor
obs_space = preprocessor.observation_space
else:
preprocessors[name] = NoPreprocessor(obs_space)
if isinstance(obs_space, gym.spaces.Dict) or \
isinstance(obs_space, gym.spaces.Tuple):
raise ValueError(
"Found raw Tuple|Dict space as input to policy. "
"Please preprocess these observations with a "
"Tuple|DictFlatteningPreprocessor.")
if tf:
with tf.variable_scope(name):
policy_map[name] = cls(obs_space, act_space, merged_conf)
else:
policy_map[name] = cls(obs_space, act_space, merged_conf)
if self.worker_index == 0:
logger.info("Built policy map: {}".format(policy_map))
logger.info("Built preprocessor map: {}".format(preprocessors))
return policy_map, preprocessors
def __del__(self):
if hasattr(self, "sampler") and isinstance(self.sampler, AsyncSampler):
self.sampler.shutdown = True
def _validate_and_canonicalize(policy, env):
if isinstance(policy, dict):
_validate_multiagent_config(policy)
return policy
elif not issubclass(policy, Policy):
raise ValueError("policy must be a rllib.Policy class")
else:
if (isinstance(env, MultiAgentEnv)
and not hasattr(env, "observation_space")):
raise ValueError(
"MultiAgentEnv must have observation_space defined if run "
"in a single-agent configuration.")
return {
DEFAULT_POLICY_ID: (policy, env.observation_space,
env.action_space, {})
}
def _validate_multiagent_config(policy, allow_none_graph=False):
for k, v in policy.items():
if not isinstance(k, str):
raise ValueError("policy keys must be strs, got {}".format(
type(k)))
if not isinstance(v, (tuple, list)) or len(v) != 4:
raise ValueError(
"policy values must be tuples/lists of "
"(cls or None, obs_space, action_space, config), got {}".
format(v))
if allow_none_graph and v[0] is None:
pass
elif not issubclass(v[0], Policy):
raise ValueError("policy tuple value 0 must be a rllib.Policy "
"class or None, got {}".format(v[0]))
if not isinstance(v[1], gym.Space):
raise ValueError(
"policy tuple value 1 (observation_space) must be a "
"gym.Space, got {}".format(type(v[1])))
if not isinstance(v[2], gym.Space):
raise ValueError("policy tuple value 2 (action_space) must be a "
"gym.Space, got {}".format(type(v[2])))
if not isinstance(v[3], dict):
raise ValueError("policy tuple value 3 (config) must be a dict, "
"got {}".format(type(v[3])))
def _validate_env(env):
# allow this as a special case (assumed gym.Env)
if hasattr(env, "observation_space") and hasattr(env, "action_space"):
return env
allowed_types = [gym.Env, MultiAgentEnv, ExternalEnv, VectorEnv, BaseEnv]
if not any(isinstance(env, tpe) for tpe in allowed_types):
raise ValueError(
"Returned env should be an instance of gym.Env, MultiAgentEnv, "
"ExternalEnv, VectorEnv, or BaseEnv. The provided env creator "
"function returned {} ({}).".format(env, type(env)))
return env
def _has_tensorflow_graph(policy_dict):
for policy, _, _, _ in policy_dict.values():
if issubclass(policy, TFPolicy):
return True
return False

Some files were not shown because too many files have changed in this diff Show More