mirror of
https://github.com/wassname/ray.git
synced 2026-07-04 11:54:54 +08:00
[rllib] Try moving RLlib to top level dir (#5324)
This commit is contained in:
Symlink
+1
@@ -0,0 +1 @@
|
||||
../../rllib
|
||||
@@ -1,25 +0,0 @@
|
||||
RLlib: Scalable Reinforcement Learning
|
||||
======================================
|
||||
|
||||
RLlib is an open-source library for reinforcement learning that offers both high scalability and a unified API for a variety of applications.
|
||||
|
||||
For an overview of RLlib, see the [documentation](http://ray.readthedocs.io/en/latest/rllib.html).
|
||||
|
||||
If you've found RLlib useful for your research, you can cite the [paper](https://arxiv.org/abs/1712.09381) as follows:
|
||||
|
||||
```
|
||||
@inproceedings{liang2018rllib,
|
||||
Author = {Eric Liang and
|
||||
Richard Liaw and
|
||||
Robert Nishihara and
|
||||
Philipp Moritz and
|
||||
Roy Fox and
|
||||
Ken Goldberg and
|
||||
Joseph E. Gonzalez and
|
||||
Michael I. Jordan and
|
||||
Ion Stoica},
|
||||
Title = {{RLlib}: Abstractions for Distributed Reinforcement Learning},
|
||||
Booktitle = {International Conference on Machine Learning ({ICML})},
|
||||
Year = {2018}
|
||||
}
|
||||
```
|
||||
@@ -1,65 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import logging
|
||||
import sys
|
||||
|
||||
# Note: do not introduce unnecessary library dependencies here, e.g. gym.
|
||||
# This file is imported from the tune module in order to register RLlib agents.
|
||||
from ray.tune.registry import register_trainable
|
||||
|
||||
from ray.rllib.evaluation.policy_graph import PolicyGraph
|
||||
from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph
|
||||
from ray.rllib.evaluation.rollout_worker import RolloutWorker
|
||||
from ray.rllib.env.base_env import BaseEnv
|
||||
from ray.rllib.env.multi_agent_env import MultiAgentEnv
|
||||
from ray.rllib.env.vector_env import VectorEnv
|
||||
from ray.rllib.env.external_env import ExternalEnv
|
||||
from ray.rllib.policy.policy import Policy
|
||||
from ray.rllib.policy.tf_policy import TFPolicy
|
||||
from ray.rllib.policy.sample_batch import SampleBatch
|
||||
|
||||
|
||||
def _setup_logger():
    """Configure the ``ray.rllib`` logger.

    Attaches a new ``StreamHandler`` with a timestamped format to the
    ``ray.rllib`` logger and turns off propagation to ancestor loggers.
    When running on Python 2, a deprecation warning is logged as well.
    """
    logger = logging.getLogger("ray.rllib")
    handler = logging.StreamHandler()
    handler.setFormatter(
        logging.Formatter(
            "%(asctime)s\t%(levelname)s %(filename)s:%(lineno)s -- %(message)s"
        ))
    logger.addHandler(handler)
    logger.propagate = False

    if sys.version_info[0] < 3:
        # Logger.warn is a deprecated alias of Logger.warning.
        logger.warning(
            "RLlib Python 2 support is deprecated, and will be removed "
            "in a future release.")
|
||||
|
||||
|
||||
def _register_all():
    """Register every known trainer with Tune.

    Covers the built-in algorithms, the contributed algorithms, and three
    internal names (``__fake``, ``__sigmoid_fake_data``,
    ``__parameter_tuning``). Registry modules are imported lazily here.
    """
    from ray.rllib.agents.registry import ALGORITHMS
    from ray.rllib.contrib.registry import CONTRIBUTED_ALGORITHMS
    from ray.rllib.agents.registry import get_agent_class

    trainer_names = list(ALGORITHMS.keys())
    trainer_names += list(CONTRIBUTED_ALGORITHMS.keys())
    trainer_names += ["__fake", "__sigmoid_fake_data", "__parameter_tuning"]

    for trainer_name in trainer_names:
        register_trainable(trainer_name, get_agent_class(trainer_name))
|
||||
|
||||
|
||||
_setup_logger()
|
||||
_register_all()
|
||||
|
||||
# Public names exported by ``from ray.rllib import *``. Every entry must be
# bound in this module; "PolicyEvaluator" used to be listed here without
# being imported, which made the star-import raise AttributeError.
__all__ = [
    "Policy",
    "PolicyGraph",
    "TFPolicy",
    "TFPolicyGraph",
    "RolloutWorker",
    "SampleBatch",
    "BaseEnv",
    "MultiAgentEnv",
    "VectorEnv",
    "ExternalEnv",
]
|
||||
@@ -1,4 +0,0 @@
|
||||
from ray.rllib.agents.trainer import Trainer, with_common_config
|
||||
from ray.rllib.agents.agent import Agent
|
||||
|
||||
__all__ = ["Agent", "Trainer", "with_common_config"]
|
||||
@@ -1,10 +0,0 @@
|
||||
from ray.rllib.agents.a3c.a3c import A3CTrainer, DEFAULT_CONFIG
|
||||
from ray.rllib.agents.a3c.a2c import A2CTrainer
|
||||
from ray.rllib.utils import renamed_agent
|
||||
|
||||
A2CAgent = renamed_agent(A2CTrainer)
|
||||
A3CAgent = renamed_agent(A3CTrainer)
|
||||
|
||||
__all__ = [
|
||||
"A2CAgent", "A3CAgent", "A2CTrainer", "A3CTrainer", "DEFAULT_CONFIG"
|
||||
]
|
||||
@@ -1,25 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from ray.rllib.agents.a3c.a3c import DEFAULT_CONFIG as A3C_CONFIG, \
|
||||
validate_config, get_policy_class
|
||||
from ray.rllib.agents.a3c.a3c_tf_policy import A3CTFPolicy
|
||||
from ray.rllib.agents.trainer_template import build_trainer
|
||||
from ray.rllib.utils import merge_dicts
|
||||
|
||||
# A2C starts from the A3C defaults and overrides three settings: the rollout
# batch size, the minimum time per iteration, and asynchronous sampling
# (switched off).
A2C_DEFAULT_CONFIG = merge_dicts(
    A3C_CONFIG,
    {
        "sample_batch_size": 20,
        "min_iter_time_s": 10,
        "sample_async": False,
    },
)
|
||||
|
||||
# Trainer class for A2C, assembled from the generic trainer template. The
# policy-class selection and config validation hooks are the ones imported
# from the A3C module.
A2CTrainer = build_trainer(
    name="A2C",
    default_config=A2C_DEFAULT_CONFIG,
    default_policy=A3CTFPolicy,
    get_policy_class=get_policy_class,
    validate_config=validate_config)
|
||||
@@ -1,67 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from ray.rllib.agents.a3c.a3c_tf_policy import A3CTFPolicy
|
||||
from ray.rllib.agents.trainer import with_common_config
|
||||
from ray.rllib.agents.trainer_template import build_trainer
|
||||
from ray.rllib.optimizers import AsyncGradientsOptimizer
|
||||
|
||||
# yapf: disable
|
||||
# __sphinx_doc_begin__
|
||||
DEFAULT_CONFIG = with_common_config({
    # Size of rollout batch
    "sample_batch_size": 10,
    # Use PyTorch as backend - no LSTM support
    "use_pytorch": False,
    # GAE(lambda) parameter
    "lambda": 1.0,
    # Max global norm for each gradient calculated by worker
    "grad_clip": 40.0,
    # Learning rate
    "lr": 0.0001,
    # Learning rate schedule
    "lr_schedule": None,
    # Value Function Loss coefficient
    "vf_loss_coeff": 0.5,
    # Entropy coefficient
    "entropy_coeff": 0.01,
    # Min time per iteration
    "min_iter_time_s": 5,
    # Workers sample async. Note that this increases the effective
    # sample_batch_size by up to 5x due to async buffering of batches.
    "sample_async": True,
})
|
||||
# __sphinx_doc_end__
|
||||
# yapf: enable
|
||||
|
||||
|
||||
def get_policy_class(config):
    """Return the policy class for the backend selected in ``config``.

    The Torch policy module is imported only when ``config["use_pytorch"]``
    is truthy; otherwise the TF policy class is returned.
    """
    if not config["use_pytorch"]:
        return A3CTFPolicy

    from ray.rllib.agents.a3c.a3c_torch_policy import A3CTorchPolicy
    return A3CTorchPolicy
|
||||
|
||||
|
||||
def validate_config(config):
    """Reject trainer configs that A3C/A2C cannot run with.

    Args:
        config (dict): trainer config; reads ``entropy_coeff``,
            ``sample_async`` and ``use_pytorch``.

    Raises:
        DeprecationWarning: if ``entropy_coeff`` is negative.
        ValueError: if ``sample_async`` and ``use_pytorch`` are both set.
    """
    if config["entropy_coeff"] < 0:
        raise DeprecationWarning("entropy_coeff must be >= 0")
    if config["sample_async"] and config["use_pytorch"]:
        raise ValueError(
            "The sample_async option is not supported with use_pytorch: "
            "Multithreading can lead to crashes if used with pytorch.")
|
||||
|
||||
|
||||
def make_async_optimizer(workers, config):
    """Build the ``AsyncGradientsOptimizer`` for the given workers.

    The ``config["optimizer"]`` dict is forwarded as keyword arguments.
    """
    optimizer_kwargs = config["optimizer"]
    return AsyncGradientsOptimizer(workers, **optimizer_kwargs)
|
||||
|
||||
|
||||
# Trainer class for A3C, assembled from the generic trainer template. The
# default policy is the TF one; get_policy_class may swap in the Torch policy
# based on the config, and gradients are applied through the async optimizer.
A3CTrainer = build_trainer(
    name="A3C",
    default_config=DEFAULT_CONFIG,
    default_policy=A3CTFPolicy,
    get_policy_class=get_policy_class,
    validate_config=validate_config,
    make_policy_optimizer=make_async_optimizer)
|
||||
@@ -1,133 +0,0 @@
|
||||
"""Note: Keep in sync with changes to VTraceTFPolicy."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import ray
|
||||
from ray.rllib.policy.sample_batch import SampleBatch
|
||||
from ray.rllib.utils.explained_variance import explained_variance
|
||||
from ray.rllib.evaluation.postprocessing import compute_advantages, \
|
||||
Postprocessing
|
||||
from ray.rllib.policy.tf_policy_template import build_tf_policy
|
||||
from ray.rllib.policy.tf_policy import LearningRateSchedule
|
||||
from ray.rllib.utils import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
|
||||
|
||||
class A3CLoss(object):
    """Actor-critic loss terms built as TF ops.

    After construction the instance exposes ``pi_loss``, ``vf_loss``,
    ``entropy`` and the combined ``total_loss``.
    """

    def __init__(self,
                 action_dist,
                 actions,
                 advantages,
                 v_target,
                 vf,
                 vf_loss_coeff=0.5,
                 entropy_coeff=0.01):
        # Policy-gradient term: negated sum of advantage-weighted
        # log-probabilities of the given actions.
        action_logp = action_dist.logp(actions)
        self.pi_loss = -tf.reduce_sum(action_logp * advantages)

        # Value term: half the summed squared error against the targets.
        value_error = vf - v_target
        self.vf_loss = 0.5 * tf.reduce_sum(tf.square(value_error))

        # Summed entropy of the action distribution.
        self.entropy = tf.reduce_sum(action_dist.entropy())

        # Entropy enters with a negative sign, so higher entropy lowers
        # the total loss.
        self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff -
                           self.entropy * entropy_coeff)
|
||||
|
||||
|
||||
def actor_critic_loss(policy, batch_tensors):
    """Build the A3C loss for ``policy`` and return its total-loss tensor.

    The ``A3CLoss`` object is also stored on ``policy.loss``.
    """
    loss_obj = A3CLoss(
        policy.action_dist,
        batch_tensors[SampleBatch.ACTIONS],
        batch_tensors[Postprocessing.ADVANTAGES],
        batch_tensors[Postprocessing.VALUE_TARGETS],
        policy.vf,
        vf_loss_coeff=policy.config["vf_loss_coeff"],
        entropy_coeff=policy.config["entropy_coeff"])
    policy.loss = loss_obj
    return policy.loss.total_loss
|
||||
|
||||
|
||||
def postprocess_advantages(policy,
                           sample_batch,
                           other_agent_batches=None,
                           episode=None):
    """Run ``compute_advantages`` on a sampled trajectory fragment.

    When the last step of the fragment is terminal, the bootstrap value is
    0.0. Otherwise it is ``policy._value`` evaluated on the last next-obs,
    action, reward and the last recurrent state outputs.
    ``other_agent_batches`` and ``episode`` are accepted but unused.
    """
    if sample_batch[SampleBatch.DONES][-1]:
        last_r = 0.0
    else:
        # One entry per recurrent state input, each wrapped in a list of one.
        final_states = [
            [sample_batch["state_out_{}".format(idx)][-1]]
            for idx in range(len(policy.state_in))
        ]
        last_r = policy._value(sample_batch[SampleBatch.NEXT_OBS][-1],
                               sample_batch[SampleBatch.ACTIONS][-1],
                               sample_batch[SampleBatch.REWARDS][-1],
                               *final_states)

    gamma = policy.config["gamma"]
    lambda_ = policy.config["lambda"]
    return compute_advantages(sample_batch, last_r, gamma, lambda_)
|
||||
|
||||
|
||||
def add_value_function_fetch(policy):
    """Return the extra action fetches: ``policy.vf`` under ``VF_PREDS``."""
    extra_fetches = {}
    extra_fetches[SampleBatch.VF_PREDS] = policy.vf
    return extra_fetches
|
||||
|
||||
|
||||
class ValueNetworkMixin(object):
    """Mixin that exposes the model's value output and a one-step query."""

    def __init__(self):
        self.vf = self.model.value_function()

    def _value(self, ob, prev_action, prev_reward, *args):
        """Evaluate the value function for a single step.

        ``args`` are the recurrent state inputs; there must be exactly one
        per entry of ``self.state_in``. Returns the first element of the
        value output.
        """
        feed_dict = {
            self.get_placeholder(SampleBatch.CUR_OBS): [ob],
            self.get_placeholder(SampleBatch.PREV_ACTIONS): [prev_action],
            self.get_placeholder(SampleBatch.PREV_REWARDS): [prev_reward],
            self.seq_lens: [1]
        }
        assert len(args) == len(self.state_in), \
            (args, self.state_in)
        for state_ph, state_val in zip(self.state_in, args):
            feed_dict[state_ph] = state_val
        values = self.get_session().run(self.vf, feed_dict)
        return values[0]
|
||||
|
||||
|
||||
def stats(policy, batch_tensors):
    """Return the stats to report for the A3C TF policy.

    Includes the current learning rate (cast to float64), the loss terms
    stored on ``policy.loss``, and the global norm of ``policy.var_list``.
    ``batch_tensors`` is unused.
    """
    report = {}
    report["cur_lr"] = tf.cast(policy.cur_lr, tf.float64)
    report["policy_loss"] = policy.loss.pi_loss
    report["policy_entropy"] = policy.loss.entropy
    report["var_gnorm"] = tf.global_norm(list(policy.var_list))
    report["vf_loss"] = policy.loss.vf_loss
    return report
|
||||
|
||||
|
||||
def grad_stats(policy, grads):
    """Return gradient stats for the A3C TF policy.

    Reports the global norm of ``grads`` and the explained variance of the
    value output with respect to the value-targets placeholder.
    """
    report = {"grad_gnorm": tf.global_norm(grads)}
    value_targets = policy.get_placeholder(Postprocessing.VALUE_TARGETS)
    report["vf_explained_var"] = explained_variance(value_targets, policy.vf)
    return report
|
||||
|
||||
|
||||
def clip_gradients(policy, optimizer, loss):
    """Return (gradient, variable) pairs clipped by global norm.

    Gradients of ``loss`` are taken with respect to ``policy.var_list`` and
    clipped to ``policy.config["grad_clip"]``. ``optimizer`` is unused.
    """
    raw_grads = tf.gradients(loss, policy.var_list)
    clipped, _ = tf.clip_by_global_norm(raw_grads,
                                        policy.config["grad_clip"])
    return list(zip(clipped, policy.var_list))
|
||||
|
||||
|
||||
def setup_mixins(policy, obs_space, action_space, config):
    """Initialise the policy's mixins and collect its trainable variables.

    Runs the ``ValueNetworkMixin`` and ``LearningRateSchedule`` initialisers
    on ``policy``, then stores the trainable variables of the current
    variable scope on ``policy.var_list``. ``obs_space`` and
    ``action_space`` are unused.
    """
    ValueNetworkMixin.__init__(policy)
    LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])

    collection_key = tf.GraphKeys.TRAINABLE_VARIABLES
    scope_name = tf.get_variable_scope().name
    policy.var_list = tf.get_collection(collection_key, scope_name)
|
||||
|
||||
|
||||
# TF policy class for A3C, assembled from the TF policy template using the
# hooks defined in this module.
A3CTFPolicy = build_tf_policy(
    name="A3CTFPolicy",
    # Looked up lazily via the lambda, so a3c.DEFAULT_CONFIG is resolved only
    # when the callback runs (a3c.py itself imports this module).
    get_default_config=lambda: ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG,
    loss_fn=actor_critic_loss,
    stats_fn=stats,
    grad_stats_fn=grad_stats,
    gradients_fn=clip_gradients,
    postprocess_fn=postprocess_advantages,
    extra_action_fetches_fn=add_value_function_fetch,
    before_loss_init=setup_mixins,
    mixins=[ValueNetworkMixin, LearningRateSchedule])
|
||||
@@ -1,91 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
from torch import nn
|
||||
|
||||
import ray
|
||||
from ray.rllib.evaluation.postprocessing import compute_advantages, \
|
||||
Postprocessing
|
||||
from ray.rllib.policy.sample_batch import SampleBatch
|
||||
from ray.rllib.policy.torch_policy_template import build_torch_policy
|
||||
|
||||
|
||||
def actor_critic_loss(policy, batch_tensors):
    """Compute the A3C loss for the Torch policy.

    Stores ``entropy``, ``pi_err`` and ``value_err`` on ``policy`` and
    returns ``pi_err + vf_loss_coeff * value_err - entropy_coeff * entropy``.
    """
    model_input = {SampleBatch.CUR_OBS: batch_tensors[SampleBatch.CUR_OBS]}
    # TODO(ekl) seq lens shouldn't be None
    logits, _ = policy.model(model_input)
    values = policy.model.value_function()
    dist = policy.dist_class(logits)
    log_probs = dist.logp(batch_tensors[SampleBatch.ACTIONS])

    policy.entropy = dist.entropy().mean()
    advantages = batch_tensors[Postprocessing.ADVANTAGES]
    policy.pi_err = -advantages.dot(log_probs.reshape(-1))
    policy.value_err = F.mse_loss(
        values.reshape(-1), batch_tensors[Postprocessing.VALUE_TARGETS])

    loss_terms = [
        policy.pi_err,
        policy.config["vf_loss_coeff"] * policy.value_err,
        -policy.config["entropy_coeff"] * policy.entropy,
    ]
    return sum(loss_terms)
|
||||
|
||||
|
||||
def loss_and_entropy_stats(policy, batch_tensors):
    """Return the stored loss terms of ``policy`` as Python scalars.

    Reads ``policy.entropy``, ``policy.pi_err`` and ``policy.value_err`` and
    converts each with ``.item()``. ``batch_tensors`` is unused.
    """
    named_tensors = (
        ("policy_entropy", policy.entropy),
        ("policy_loss", policy.pi_err),
        ("vf_loss", policy.value_err),
    )
    return {key: tensor.item() for key, tensor in named_tensors}
|
||||
|
||||
|
||||
def add_advantages(policy,
                   sample_batch,
                   other_agent_batches=None,
                   episode=None):
    """Run ``compute_advantages`` on a sampled trajectory fragment.

    The bootstrap value is 0.0 when the fragment ends on a terminal step,
    otherwise ``policy._value`` of the last next-obs.
    ``other_agent_batches`` and ``episode`` are accepted but unused.
    """
    ended = sample_batch[SampleBatch.DONES][-1]
    last_r = (0.0 if ended else
              policy._value(sample_batch[SampleBatch.NEXT_OBS][-1]))
    gamma = policy.config["gamma"]
    lambda_ = policy.config["lambda"]
    return compute_advantages(sample_batch, last_r, gamma, lambda_)
|
||||
|
||||
|
||||
def model_value_predictions(policy, input_dict, state_batches, model):
    """Return the model's value output as a numpy array under ``VF_PREDS``.

    Only ``model`` is used; the other arguments are ignored.
    """
    value_preds = model.value_function().cpu().numpy()
    return {SampleBatch.VF_PREDS: value_preds}
|
||||
|
||||
|
||||
def apply_grad_clipping(policy):
    """Clip the model's gradients in place when ``grad_clip`` is set.

    Returns a dict holding the value returned by ``clip_grad_norm_`` under
    ``"grad_gnorm"``, or an empty dict when ``policy.config["grad_clip"]``
    is falsy.
    """
    if not policy.config["grad_clip"]:
        return {}
    grad_norm = nn.utils.clip_grad_norm_(policy.model.parameters(),
                                         policy.config["grad_clip"])
    return {"grad_gnorm": grad_norm}
|
||||
|
||||
|
||||
def torch_optimizer(policy, config):
    """Return an Adam optimizer over the policy model's parameters.

    The learning rate is taken from ``config["lr"]``.
    """
    model_params = policy.model.parameters()
    learning_rate = config["lr"]
    return torch.optim.Adam(model_params, lr=learning_rate)
|
||||
|
||||
|
||||
class ValueNetworkMixin(object):
    """Mixin adding a single-observation value query to a Torch policy."""

    def _value(self, obs):
        """Return the value output for one numpy observation.

        Holds ``self.lock`` while converting ``obs`` to a float tensor with
        a leading batch axis, running the model, and reading back
        ``value_function()`` as a squeezed numpy array.
        """
        with self.lock:
            obs_batch = torch.from_numpy(obs).float()
            obs_batch = obs_batch.unsqueeze(0).to(self.device)
            # The forward output is discarded; value_function() is read
            # right after the forward pass.
            self.model({"obs": obs_batch}, [], [1])
            value = self.model.value_function()
            return value.detach().cpu().numpy().squeeze()
|
||||
|
||||
|
||||
# Torch policy class for A3C, assembled from the Torch policy template using
# the hooks defined in this module.
A3CTorchPolicy = build_torch_policy(
    name="A3CTorchPolicy",
    # Resolved lazily through the lambda, i.e. only when the callback runs.
    get_default_config=lambda: ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG,
    loss_fn=actor_critic_loss,
    stats_fn=loss_and_entropy_stats,
    postprocess_fn=add_advantages,
    extra_action_out_fn=model_value_predictions,
    extra_grad_process_fn=apply_grad_clipping,
    optimizer_fn=torch_optimizer,
    mixins=[ValueNetworkMixin])
|
||||
@@ -1,8 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from ray.rllib.agents.trainer import Trainer
|
||||
from ray.rllib.utils import renamed_agent
|
||||
|
||||
Agent = renamed_agent(Trainer)
|
||||
@@ -1,6 +0,0 @@
|
||||
from ray.rllib.agents.ars.ars import (ARSTrainer, DEFAULT_CONFIG)
|
||||
from ray.rllib.utils import renamed_agent
|
||||
|
||||
ARSAgent = renamed_agent(ARSTrainer)
|
||||
|
||||
__all__ = ["ARSAgent", "ARSTrainer", "DEFAULT_CONFIG"]
|
||||
@@ -1,340 +0,0 @@
|
||||
# Code in this file is copied and adapted from
|
||||
# https://github.com/openai/evolution-strategies-starter and from
|
||||
# https://github.com/modestyachts/ARS
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from collections import namedtuple
|
||||
import logging
|
||||
import numpy as np
|
||||
import time
|
||||
|
||||
import ray
|
||||
from ray.rllib.agents import Trainer, with_common_config
|
||||
|
||||
from ray.rllib.agents.ars import optimizers
|
||||
from ray.rllib.agents.ars import policies
|
||||
from ray.rllib.agents.ars import utils
|
||||
from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID
|
||||
from ray.rllib.utils.annotations import override
|
||||
from ray.rllib.utils.memory import ray_get_and_free
|
||||
from ray.rllib.utils import FilterManager
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Value returned by Worker.do_rollouts. The noise_indices / noisy_* /
# sign_noisy_returns fields get one entry per sampled perturbation (the
# returns and lengths entries are [positive, negative] pairs); the eval_*
# fields get one entry per unperturbed evaluation rollout.
Result = namedtuple("Result", [
    "noise_indices", "noisy_returns", "sign_noisy_returns", "noisy_lengths",
    "eval_returns", "eval_lengths"
])
|
||||
|
||||
# yapf: disable
|
||||
# __sphinx_doc_begin__
|
||||
DEFAULT_CONFIG = with_common_config({
    "noise_stdev": 0.02,  # std deviation of parameter noise
    "num_rollouts": 32,  # number of perturbs to try
    "rollouts_used": 32,  # number of perturbs to keep in gradient estimate
    "num_workers": 2,  # number of Worker actors created by the trainer
    "sgd_stepsize": 0.01,  # sgd step-size
    "observation_filter": "MeanStdFilter",  # passed to GenericPolicy
    "noise_size": 250000000,  # element count of the shared noise array
    "eval_prob": 0.03,  # probability of evaluating the parameter rewards
    "report_length": 10,  # how many of the last rewards we average over
    "offset": 0,  # its absolute value is subtracted from every step reward
})
|
||||
# __sphinx_doc_end__
|
||||
# yapf: enable
|
||||
|
||||
|
||||
@ray.remote
def create_shared_noise(count):
    """Create a large array of noise to be shared by all workers.

    Draws ``count`` standard-normal samples from a ``RandomState`` seeded
    with 123 and returns them as float32.
    """
    rng = np.random.RandomState(123)
    return rng.randn(count).astype(np.float32)
|
||||
|
||||
|
||||
class SharedNoiseTable(object):
    """Slices fixed-length noise vectors out of one flat float32 array."""

    def __init__(self, noise):
        self.noise = noise
        assert self.noise.dtype == np.float32

    def get(self, i, dim):
        """Return the ``dim`` consecutive entries starting at index ``i``."""
        start, stop = i, i + dim
        return self.noise[start:stop]

    def sample_index(self, dim):
        """Draw a random start index that leaves room for ``dim`` entries."""
        upper_exclusive = len(self.noise) - dim + 1
        return np.random.randint(0, upper_exclusive)

    def get_delta(self, dim):
        """Return ``(start_index, noise_slice)`` for a random start index."""
        start = self.sample_index(dim)
        return start, self.get(start, dim)
|
||||
|
||||
|
||||
@ray.remote
class Worker(object):
    """Ray actor that runs rollouts of perturbed and unperturbed policies."""

    def __init__(self, config, env_creator, noise, min_task_runtime=0.2):
        self.min_task_runtime = min_task_runtime
        self.config = config
        self.noise = SharedNoiseTable(noise)

        self.env = env_creator(config["env_config"])
        # Imported here rather than at module import time.
        from ray.rllib import models
        self.preprocessor = models.ModelCatalog.get_preprocessor(self.env)

        self.sess = utils.make_session(single_threaded=True)
        self.policy = policies.GenericPolicy(
            self.sess, self.env.action_space, self.env.observation_space,
            self.preprocessor, config["observation_filter"], config["model"])

    @property
    def filters(self):
        """Mapping from the default policy id to the policy's filter."""
        return {DEFAULT_POLICY_ID: self.policy.get_filter()}

    def sync_filters(self, new_filters):
        """Sync each local filter with the matching entry of ``new_filters``."""
        for policy_id, local_filter in self.filters.items():
            local_filter.sync(new_filters[policy_id])

    def get_filters(self, flush_after=False):
        """Return serializable copies of the filters.

        When ``flush_after`` is true, each filter's buffer is cleared after
        it has been copied.
        """
        snapshot = {}
        for policy_id, local_filter in self.filters.items():
            snapshot[policy_id] = local_filter.as_serializable()
            if flush_after:
                local_filter.clear_buffer()
        return snapshot

    def rollout(self, timestep_limit, add_noise=False):
        """Run one episode with the current policy weights.

        Returns the ``(rewards, length)`` pair from ``policies.rollout``.
        """
        return policies.rollout(
            self.policy,
            self.env,
            timestep_limit=timestep_limit,
            add_noise=add_noise,
            offset=self.config["offset"])

    def do_rollouts(self, params, timestep_limit=None):
        """Evaluate ``params`` under one random perturbation.

        Loops until one perturbation has been evaluated in both directions.
        On each pass, with probability ``config["eval_prob"]`` an
        unperturbed evaluation episode is run instead. Returns a ``Result``.
        """
        # Set the network weights.
        self.policy.set_weights(params)

        noise_indices, returns, sign_returns, lengths = [], [], [], []
        eval_returns, eval_lengths = [], []

        # Perform some rollouts with noise.
        while not noise_indices:
            if np.random.uniform() < self.config["eval_prob"]:
                # Do an evaluation run with no perturbation.
                self.policy.set_weights(params)
                eval_rewards, eval_len = self.rollout(
                    timestep_limit, add_noise=False)
                eval_returns.append(eval_rewards.sum())
                eval_lengths.append(eval_len)
                continue

            # Do a regular run with parameter perturbations.
            num_params = self.policy.num_params
            noise_index = self.noise.sample_index(num_params)
            perturbation = self.config["noise_stdev"] * self.noise.get(
                noise_index, self.policy.num_params)

            # These two sampling steps could be done in parallel on
            # different actors letting us update twice as frequently.
            self.policy.set_weights(params + perturbation)
            rewards_pos, lengths_pos = self.rollout(timestep_limit)

            self.policy.set_weights(params - perturbation)
            rewards_neg, lengths_neg = self.rollout(timestep_limit)

            noise_indices.append(noise_index)
            returns.append([rewards_pos.sum(), rewards_neg.sum()])
            sign_returns.append(
                [np.sign(rewards_pos).sum(),
                 np.sign(rewards_neg).sum()])
            lengths.append([lengths_pos, lengths_neg])

        return Result(
            noise_indices=noise_indices,
            noisy_returns=returns,
            sign_noisy_returns=sign_returns,
            noisy_lengths=lengths,
            eval_returns=eval_returns,
            eval_lengths=eval_lengths)
|
||||
|
||||
|
||||
class ARSTrainer(Trainer):
    """Large-scale implementation of Augmented Random Search in Ray."""

    _name = "ARS"
    _default_config = DEFAULT_CONFIG

    @override(Trainer)
    def _init(self, config, env_creator):
        """Build the local policy, optimizer, noise table and worker actors."""
        # A local env is created to read its spaces and pick a preprocessor.
        env = env_creator(config["env_config"])
        from ray.rllib import models
        preprocessor = models.ModelCatalog.get_preprocessor(env)

        self.sess = utils.make_session(single_threaded=False)
        self.policy = policies.GenericPolicy(
            self.sess, env.action_space, env.observation_space, preprocessor,
            config["observation_filter"], config["model"])
        self.optimizer = optimizers.SGD(self.policy, config["sgd_stepsize"])

        self.rollouts_used = config["rollouts_used"]
        self.num_rollouts = config["num_rollouts"]
        self.report_length = config["report_length"]

        # Create the shared noise table.
        logger.info("Creating shared noise table.")
        noise_id = create_shared_noise.remote(config["noise_size"])
        self.noise = SharedNoiseTable(ray.get(noise_id))

        # Create the actors.
        logger.info("Creating actors.")
        # Each worker receives the object ID of the noise array, not a copy
        # made here.
        self.workers = [
            Worker.remote(config, env_creator, noise_id)
            for _ in range(config["num_workers"])
        ]

        self.episodes_so_far = 0
        self.reward_list = []
        self.tstart = time.time()

    @override(Trainer)
    def _train(self):
        """Run one ARS iteration and return the result dict.

        Collects perturbed rollouts from the workers, keeps the
        best-performing directions, takes one optimizer step on the local
        policy, and synchronizes the observation filters with the workers.
        """
        config = self.config

        theta = self.policy.get_weights()
        assert theta.dtype == np.float32

        # Put the current policy weights in the object store.
        theta_id = ray.put(theta)
        # Use the actors to do rollouts, note that we pass in the ID of the
        # policy weights.
        # NOTE(review): num_timesteps is not used below.
        results, num_episodes, num_timesteps = self._collect_results(
            theta_id, config["num_rollouts"])

        all_noise_indices = []
        all_training_returns = []
        all_training_lengths = []
        all_eval_returns = []
        all_eval_lengths = []

        # Loop over the results.
        for result in results:
            all_eval_returns += result.eval_returns
            all_eval_lengths += result.eval_lengths

            all_noise_indices += result.noise_indices
            all_training_returns += result.noisy_returns
            all_training_lengths += result.noisy_lengths

        assert len(all_eval_returns) == len(all_eval_lengths)
        assert (len(all_noise_indices) == len(all_training_returns) ==
                len(all_training_lengths))

        self.episodes_so_far += num_episodes

        # Assemble the results.
        eval_returns = np.array(all_eval_returns)
        eval_lengths = np.array(all_eval_lengths)
        noise_indices = np.array(all_noise_indices)
        noisy_returns = np.array(all_training_returns)
        noisy_lengths = np.array(all_training_lengths)

        # keep only the best returns
        # select top performing directions if rollouts_used < num_rollouts
        # A direction is ranked by the larger of its two returns.
        max_rewards = np.max(noisy_returns, axis=1)
        if self.rollouts_used > self.num_rollouts:
            self.rollouts_used = self.num_rollouts

        percentile = 100 * (1 - (self.rollouts_used / self.num_rollouts))
        idx = np.arange(max_rewards.size)[
            max_rewards >= np.percentile(max_rewards, percentile)]
        noise_idx = noise_indices[idx]
        noisy_returns = noisy_returns[idx, :]

        # Compute and take a step.
        # Each kept noise vector is weighted by (positive - negative) return.
        g, count = utils.batched_weighted_sum(
            noisy_returns[:, 0] - noisy_returns[:, 1],
            (self.noise.get(index, self.policy.num_params)
             for index in noise_idx),
            batch_size=min(500, noisy_returns[:, 0].size))
        g /= noise_idx.size
        # scale the returns by their standard deviation
        if not np.isclose(np.std(noisy_returns), 0.0):
            g /= np.std(noisy_returns)
        assert (g.shape == (self.policy.num_params, )
                and g.dtype == np.float32)
        # Compute the new weights theta.
        # optimizers.SGD steps by -stepsize * v, so passing -g moves the
        # weights along +g.
        theta, update_ratio = self.optimizer.update(-g)
        # Set the new weights in the local copy of the policy.
        self.policy.set_weights(theta)
        # update the reward list
        if len(all_eval_returns) > 0:
            self.reward_list.append(eval_returns.mean())

        # Now sync the filters
        FilterManager.synchronize({
            DEFAULT_POLICY_ID: self.policy.get_filter()
        }, self.workers)

        info = {
            # Sum of squares, i.e. the squared L2 norm (no square root).
            "weights_norm": np.square(theta).sum(),
            "weights_std": np.std(theta),
            "grad_norm": np.square(g).sum(),
            "update_ratio": update_ratio,
            "episodes_this_iter": noisy_lengths.size,
            "episodes_so_far": self.episodes_so_far,
        }
        # NOTE(review): if no evaluation rollout has been recorded yet,
        # reward_list and/or eval_lengths are empty and the means below
        # evaluate to NaN (numpy emits a RuntimeWarning).
        result = dict(
            episode_reward_mean=np.mean(
                self.reward_list[-self.report_length:]),
            episode_len_mean=eval_lengths.mean(),
            timesteps_this_iter=noisy_lengths.sum(),
            info=info)

        return result

    @override(Trainer)
    def _stop(self):
        """Terminate all worker actors."""
        # workaround for https://github.com/ray-project/ray/issues/1516
        for w in self.workers:
            w.__ray_terminate__.remote()

    @override(Trainer)
    def compute_action(self, observation):
        """Return the local policy's action for a single observation."""
        # update=True is passed through to the policy's compute call.
        return self.policy.compute(observation, update=True)[0]

    def _collect_results(self, theta_id, min_episodes):
        """Request rollouts from all workers until enough episodes are in.

        Returns ``(results, num_episodes, num_timesteps)``, where the counts
        cover the perturbed rollouts only (evaluation runs are not counted).
        """
        num_episodes, num_timesteps = 0, 0
        results = []
        while num_episodes < min_episodes:
            logger.debug(
                "Collected {} episodes {} timesteps so far this iter".format(
                    num_episodes, num_timesteps))
            rollout_ids = [
                worker.do_rollouts.remote(theta_id) for worker in self.workers
            ]
            # Get the results of the rollouts.
            for result in ray_get_and_free(rollout_ids):
                results.append(result)
                # Update the number of episodes and the number of timesteps
                # keeping in mind that result.noisy_lengths is a list of lists,
                # where the inner lists have length 2.
                num_episodes += sum(len(pair) for pair in result.noisy_lengths)
                num_timesteps += sum(
                    sum(pair) for pair in result.noisy_lengths)

        return results, num_episodes, num_timesteps

    def __getstate__(self):
        """Return the checkpoint state: weights, filter and episode count."""
        return {
            "weights": self.policy.get_weights(),
            "filter": self.policy.get_filter(),
            "episodes_so_far": self.episodes_so_far,
        }

    def __setstate__(self, state):
        """Restore the checkpoint state and push the filter to the workers."""
        self.episodes_so_far = state["episodes_so_far"]
        self.policy.set_weights(state["weights"])
        self.policy.set_filter(state["filter"])
        FilterManager.synchronize({
            DEFAULT_POLICY_ID: self.policy.get_filter()
        }, self.workers)
|
||||
@@ -1,57 +0,0 @@
|
||||
# Code in this file is copied and adapted from
|
||||
# https://github.com/openai/evolution-strategies-starter.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
class Optimizer(object):
    """Base class for update rules applied to a policy's flat weights.

    Subclasses implement ``_compute_step``; ``update`` adds the resulting
    step to the policy's current weights.
    """

    def __init__(self, policy):
        self.policy = policy
        self.dim = policy.num_params
        self.t = 0

    def update(self, globalg):
        """Return ``(new_weights, ratio)`` for the gradient ``globalg``.

        ``ratio`` is the norm of the step divided by the norm of the current
        weights. The policy itself is not modified.
        """
        self.t = self.t + 1
        delta = self._compute_step(globalg)
        weights = self.policy.get_weights()
        step_norm = np.linalg.norm(delta)
        weight_norm = np.linalg.norm(weights)
        return weights + delta, step_norm / weight_norm

    def _compute_step(self, globalg):
        """Return the step to add to the weights; subclasses override."""
        raise NotImplementedError
|
||||
|
||||
|
||||
class SGD(Optimizer):
    """Gradient descent with an exponential moving average of gradients."""

    def __init__(self, policy, stepsize, momentum=0.0):
        Optimizer.__init__(self, policy)
        self.v = np.zeros(self.dim, dtype=np.float32)
        self.stepsize = stepsize
        self.momentum = momentum

    def _compute_step(self, globalg):
        """Update the running average ``v`` and return ``-stepsize * v``."""
        decay = self.momentum
        self.v = decay * self.v + (1. - decay) * globalg
        return -self.stepsize * self.v
|
||||
|
||||
|
||||
class Adam(Optimizer):
    """Adam update rule with bias-corrected step size."""

    def __init__(self, policy, stepsize, beta1=0.9, beta2=0.999,
                 epsilon=1e-08):
        Optimizer.__init__(self, policy)
        self.stepsize = stepsize
        self.beta1, self.beta2 = beta1, beta2
        self.epsilon = epsilon
        # First and second moment estimates, one entry per parameter.
        self.m = np.zeros(self.dim, dtype=np.float32)
        self.v = np.zeros(self.dim, dtype=np.float32)

    def _compute_step(self, globalg):
        """Update the moment estimates and return the Adam step.

        Uses ``self.t`` (incremented by ``update`` before this call) for
        the bias correction.
        """
        b1, b2 = self.beta1, self.beta2
        a = self.stepsize * (np.sqrt(1 - b2**self.t) / (1 - b1**self.t))
        self.m = b1 * self.m + (1 - b1) * globalg
        self.v = b2 * self.v + (1 - b2) * (globalg * globalg)
        return -a * self.m / (np.sqrt(self.v) + self.epsilon)
|
||||
@@ -1,115 +0,0 @@
|
||||
# Code in this file is copied and adapted from
|
||||
# https://github.com/openai/evolution-strategies-starter.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import gym
|
||||
import numpy as np
|
||||
|
||||
import ray
|
||||
import ray.experimental.tf_utils
|
||||
from ray.rllib.evaluation.sampler import _unbatch_tuple_actions
|
||||
from ray.rllib.utils.filter import get_filter
|
||||
from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.utils import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
|
||||
|
||||
def rollout(policy, env, timestep_limit=None, add_noise=False, offset=0):
    """Do a rollout.

    Runs one episode of ``env`` with actions from ``policy.compute`` until
    the env reports done or the step limit is reached. ``add_noise`` is
    forwarded to ``policy.compute``.

    Parameters
    ----------
    policy: tf object
        policy from which to draw actions
    env: GymEnv
        environment from which to draw rewards, done, and next state
    timestep_limit: int, optional
        steps after which to end the rollout. The effective limit is the
        smaller of this and the env's ``spec.max_episode_steps``; a limit
        that is missing (None) on either side is ignored.
    add_noise: bool, optional
        indicates whether exploratory action noise should be added
    offset: int, optional
        value to subtract from the reward (its absolute value is used).
        For example, survival bonus from humanoid

    Returns
    -------
    rews: np.ndarray
        float32 array of the per-step rewards after the offset
    t: int
        number of steps taken
    """
    # The env may have no spec or no step limit; treat both as "no limit"
    # instead of failing on attribute access or on min(int, None).
    spec = getattr(env, "spec", None)
    env_timestep_limit = getattr(spec, "max_episode_steps", None)
    if timestep_limit is None:
        timestep_limit = env_timestep_limit
    elif env_timestep_limit is not None:
        timestep_limit = min(timestep_limit, env_timestep_limit)

    rews = []
    observation = env.reset()
    # A falsy limit (None or 0) falls back to a very large step cap.
    for _ in range(timestep_limit or 999999):
        ac = policy.compute(observation, add_noise=add_noise, update=True)[0]
        observation, rew, done, _ = env.step(ac)
        rews.append(rew - np.abs(offset))
        if done:
            break
    rews = np.array(rews, dtype=np.float32)
    return rews, len(rews)
|
||||
|
||||
|
||||
class GenericPolicy(object):
    """Policy wrapper exposing flat weights and filtered observations.

    Builds a deterministic-action model from the ModelCatalog on top of an
    input placeholder, and offers get/set access to the model weights as
    a single flat vector.
    """

    def __init__(self,
                 sess,
                 action_space,
                 obs_space,
                 preprocessor,
                 observation_filter,
                 model_config,
                 action_noise_std=0.0):
        """Build the policy graph in ``sess`` and initialize its variables.

        Arguments:
            sess: TensorFlow session used to run the policy.
            action_space: action space of the environment.
            obs_space: observation space of the environment.
            preprocessor: object providing ``shape`` and ``transform()``.
            observation_filter: filter spec handed to ``get_filter``.
            model_config (dict): model options for the ModelCatalog.
            action_noise_std (float): scale of the Gaussian noise that
                compute() may add to actions.
        """
        self.sess = sess
        self.action_space = action_space
        self.action_noise_std = action_noise_std
        self.preprocessor = preprocessor

        obs_shape = self.preprocessor.shape
        self.observation_filter = get_filter(observation_filter, obs_shape)
        self.inputs = tf.placeholder(tf.float32, [None] + list(obs_shape))

        # Policy network.
        dist_class, dist_dim = ModelCatalog.get_action_dist(
            action_space, model_config, dist_type="deterministic")
        model_inputs = {"obs": self.inputs}
        model = ModelCatalog.get_model(model_inputs, obs_space, action_space,
                                       dist_dim, model_config)
        self.sampler = dist_class(model.outputs).sample()

        self.variables = ray.experimental.tf_utils.TensorFlowVariables(
            model.outputs, self.sess)

        # Total number of scalar parameters across all tracked variables.
        self.num_params = sum(
            np.prod(variable.shape.as_list())
            for variable in self.variables.variables.values())
        self.sess.run(tf.global_variables_initializer())

    def compute(self, observation, add_noise=False, update=True):
        """Return the action(s) for a single raw observation.

        Arguments:
            observation: raw observation; it is preprocessed, given a
                leading batch axis and passed through the observation
                filter.
            add_noise (bool): add Gaussian noise scaled by
                ``action_noise_std``; only applied for Box action spaces.
            update (bool): forwarded to the observation filter.
        """
        preprocessed = self.preprocessor.transform(observation)
        filtered = self.observation_filter(preprocessed[None], update=update)
        sampled = self.sess.run(
            self.sampler, feed_dict={self.inputs: filtered})
        action = _unbatch_tuple_actions(sampled)
        noise_applies = add_noise and isinstance(self.action_space,
                                                 gym.spaces.Box)
        if noise_applies:
            action += np.random.randn(*action.shape) * self.action_noise_std
        return action

    def set_weights(self, x):
        """Load the flat weight vector ``x`` into the model variables."""
        self.variables.set_flat(x)

    def set_filter(self, obs_filter):
        """Replace the observation filter with ``obs_filter``."""
        self.observation_filter = obs_filter

    def get_filter(self):
        """Return the current observation filter."""
        return self.observation_filter

    def get_weights(self):
        """Return the model variables as one flat vector."""
        return self.variables.get_flat()
|
||||
@@ -1,63 +0,0 @@
|
||||
# Code in this file is copied and adapted from
|
||||
# https://github.com/openai/evolution-strategies-starter.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
from ray.rllib.utils import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
|
||||
|
||||
def compute_ranks(x):
    """Returns ranks in [0, len(x))

    Note: This is different from scipy.stats.rankdata, which returns ranks in
    [1, len(x)].

    Arguments:
        x (np.ndarray): 1-D array of values to rank.

    Returns:
        np.ndarray: integer array where entry i is the rank of x[i]
            (0 for the smallest value).
    """
    assert x.ndim == 1
    ranks = np.empty(len(x), dtype=int)
    # Scatter 0..n-1 into the positions given by the sorting permutation.
    ranks[x.argsort()] = np.arange(len(x))
    return ranks


def compute_centered_ranks(x):
    """Return the ranks of ``x`` rescaled to the interval [-0.5, 0.5].

    Ranks are computed over all elements of ``x`` (any shape); the result
    has the same shape as ``x`` and dtype float32.

    Arguments:
        x (np.ndarray): values to rank.

    Returns:
        np.ndarray: float32 array of centered ranks. An input with fewer
            than two elements yields all zeros, since there is no spread
            to normalize (dividing by ``x.size - 1`` would give NaN).
    """
    if x.size <= 1:
        return np.zeros(x.shape, dtype=np.float32)
    y = compute_ranks(x.ravel()).reshape(x.shape).astype(np.float32)
    y /= (x.size - 1)
    y -= 0.5
    return y
|
||||
|
||||
|
||||
def make_session(single_threaded):
    """Create a TensorFlow session.

    Arguments:
        single_threaded (bool): if truthy, restrict the session to one
            inter-op and one intra-op thread; otherwise use the default
            session configuration.

    Returns:
        tf.Session: the newly created session.
    """
    if single_threaded:
        thread_config = tf.ConfigProto(
            inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
        return tf.Session(config=thread_config)
    return tf.Session()
|
||||
|
||||
|
||||
def itergroups(items, group_size):
    """Yield consecutive tuples of up to ``group_size`` items.

    Every yielded tuple has exactly ``group_size`` elements, except
    possibly the last one, which holds the leftover items. Nothing is
    yielded for an empty input.

    Arguments:
        items (iterable): the items to group.
        group_size (int): maximum tuple length; must be >= 1.
    """
    assert group_size >= 1
    pending = []
    for item in items:
        pending.append(item)
        if len(pending) < group_size:
            continue
        yield tuple(pending)
        pending = []
    # Flush the final, partially filled group.
    if pending:
        yield tuple(pending)
|
||||
|
||||
|
||||
def batched_weighted_sum(weights, vecs, batch_size):
    """Compute sum_i weights[i] * vecs[i], processed in batches.

    ``weights`` and ``vecs`` are consumed in lockstep in groups of at
    most ``batch_size`` items, so only one batch of vectors is converted
    to a float32 array at a time.

    Arguments:
        weights (iterable): scalar weights.
        vecs (iterable): vectors, one per weight.
        batch_size (int): maximum number of items per batch.

    Returns:
        tuple: (weighted sum, number of items summed). The sum is the
            int 0 if there were no items.
    """
    running_total = 0
    count = 0
    weight_groups = itergroups(weights, batch_size)
    vec_groups = itergroups(vecs, batch_size)
    for w_chunk, v_chunk in zip(weight_groups, vec_groups):
        assert len(w_chunk) == len(v_chunk) <= batch_size
        w_arr = np.asarray(w_chunk, dtype=np.float32)
        v_arr = np.asarray(v_chunk, dtype=np.float32)
        running_total += np.dot(w_arr, v_arr)
        count += len(w_chunk)
    return running_total, count
|
||||
@@ -1 +0,0 @@
|
||||
Implementation of deep deterministic policy gradients (https://arxiv.org/abs/1509.02971), including an Ape-X variant.
|
||||
@@ -1,16 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from ray.rllib.agents.ddpg.apex import ApexDDPGTrainer
|
||||
from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, DEFAULT_CONFIG
|
||||
from ray.rllib.agents.ddpg.td3 import TD3Trainer
|
||||
from ray.rllib.utils import renamed_agent
|
||||
|
||||
# Old "*Agent" names, kept next to the "*Trainer" classes they wrap
# (presumably for backwards compatibility -- see renamed_agent).
ApexDDPGAgent = renamed_agent(ApexDDPGTrainer)
DDPGAgent = renamed_agent(DDPGTrainer)

# Public names of this package.
__all__ = [
    "DDPGAgent",
    "ApexDDPGAgent",
    "DDPGTrainer",
    "ApexDDPGTrainer",
    "TD3Trainer",
    "DEFAULT_CONFIG",
]
|
||||
@@ -1,37 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from ray.rllib.agents.dqn.apex import APEX_TRAINER_PROPERTIES
|
||||
from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, \
|
||||
DEFAULT_CONFIG as DDPG_CONFIG
|
||||
from ray.rllib.utils import merge_dicts
|
||||
|
||||
# Ape-X DDPG defaults: the plain DDPG config with the overrides below
# merged on top. The "optimizer" sub-dict is merged separately so that
# the DDPG optimizer options are extended rather than replaced.
APEX_DDPG_DEFAULT_CONFIG = merge_dicts(
    DDPG_CONFIG,  # see also the options in ddpg.py, which are also supported
    {
        "optimizer": merge_dicts(
            DDPG_CONFIG["optimizer"],
            {
                "max_weight_sync_delay": 400,
                "num_replay_buffer_shards": 4,
                "debug": False,
            },
        ),
        "n_step": 3,
        "num_gpus": 0,
        "num_workers": 32,
        "buffer_size": 2000000,
        "learning_starts": 50000,
        "train_batch_size": 512,
        "sample_batch_size": 50,
        "target_network_update_freq": 500000,
        "timesteps_per_iteration": 25000,
        "per_worker_exploration": True,
        "worker_side_prioritization": True,
        "min_iter_time_s": 30,
    },
)

# DDPG trainer using the Ape-X defaults above plus the trainer
# properties shared with Ape-X DQN.
ApexDDPGTrainer = DDPGTrainer.with_updates(
    name="APEX_DDPG",
    default_config=APEX_DDPG_DEFAULT_CONFIG,
    **APEX_TRAINER_PROPERTIES)
|
||||
@@ -1,222 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from ray.rllib.agents.trainer import with_common_config
|
||||
from ray.rllib.agents.dqn.dqn import GenericOffPolicyTrainer, \
|
||||
update_worker_explorations
|
||||
from ray.rllib.agents.ddpg.ddpg_policy import DDPGTFPolicy
|
||||
from ray.rllib.utils.schedules import ConstantSchedule, LinearSchedule
|
||||
|
||||
# yapf: disable
|
||||
# __sphinx_doc_begin__
|
||||
DEFAULT_CONFIG = with_common_config({
    # === Twin Delayed DDPG (TD3) and Soft Actor-Critic (SAC) tricks ===
    # TD3: https://spinningup.openai.com/en/latest/algorithms/td3.html
    # In addition to the settings below, you can use
    # "exploration_noise_type" and "exploration_gaussian_sigma" to get IID
    # Gaussian exploration noise instead of OU exploration noise.
    # Twin Q-net
    "twin_q": False,
    # Delayed policy update
    "policy_delay": 1,
    # Target policy smoothing
    # (this also replaces OU exploration noise with IID Gaussian exploration
    # noise, for now)
    "smooth_target_policy": False,
    # Gaussian stddev of the target action noise used for smoothing
    "target_noise": 0.2,
    # Target noise limit (bound)
    "target_noise_clip": 0.5,

    # === Evaluation ===
    # Evaluate with epsilon=0 every `evaluation_interval` training iterations.
    # The evaluation stats will be reported under the "evaluation" metric key.
    # Note that evaluation is currently not parallelized, and that for Ape-X
    # metrics are already only reported for the lowest epsilon workers.
    "evaluation_interval": None,
    # Number of episodes to run per evaluation period.
    "evaluation_num_episodes": 10,

    # === Model ===
    # Apply a state preprocessor with spec given by the "model" config option
    # (like other RL algorithms). This is mostly useful if you have a weird
    # observation shape, like an image. Auto-enabled if a custom model is set.
    "use_state_preprocessor": False,
    # Postprocess the policy network model output with these hidden layers. If
    # use_state_preprocessor is False, then these will be the *only* hidden
    # layers in the network.
    "actor_hiddens": [400, 300],
    # Hidden layers activation of the postprocessing stage of the policy
    # network
    "actor_hidden_activation": "relu",
    # Postprocess the critic network model output with these hidden layers;
    # again, if use_state_preprocessor is True, then the state will be
    # preprocessed by the model specified with the "model" config option first.
    "critic_hiddens": [400, 300],
    # Hidden layers activation of the postprocessing stage of the critic.
    "critic_hidden_activation": "relu",
    # N-step Q learning
    "n_step": 1,

    # === Exploration ===
    # Turns on annealing schedule for exploration noise. Exploration is
    # annealed from 1.0 to exploration_final_scale over
    # schedule_max_timesteps scaled by exploration_fraction. Original DDPG and
    # TD3 papers do not anneal noise, so this is False by default.
    "exploration_should_anneal": False,
    # Max num timesteps for annealing schedules.
    "schedule_max_timesteps": 100000,
    # Number of env steps to optimize for before returning
    "timesteps_per_iteration": 1000,
    # Fraction of entire training period over which the exploration rate is
    # annealed
    "exploration_fraction": 0.1,
    # Final scaling multiplier for action noise (initial is 1.0)
    "exploration_final_scale": 0.02,
    # Valid values: "ou" (time-correlated, like original DDPG paper),
    # "gaussian" (IID, like TD3 paper)
    "exploration_noise_type": "ou",
    # OU-noise scale; this can be used to scale down magnitude of OU noise
    # before adding to actions (requires "exploration_noise_type" to be "ou")
    "exploration_ou_noise_scale": 0.1,
    # Theta for OU
    "exploration_ou_theta": 0.15,
    # Sigma for OU
    "exploration_ou_sigma": 0.2,
    # Gaussian stddev of act noise for exploration (requires
    # "exploration_noise_type" to be "gaussian")
    "exploration_gaussian_sigma": 0.1,
    # If True parameter space noise will be used for exploration
    # See https://blog.openai.com/better-exploration-with-parameter-noise/
    "parameter_noise": False,
    # Until this many timesteps have elapsed, the agent's policy will be
    # ignored & it will instead take uniform random actions. Can be used in
    # conjunction with learning_starts (which controls when the first
    # optimization step happens) to decrease dependence of exploration &
    # optimization on initial policy parameters. Note that this will be
    # disabled when the action noise scale is set to 0 (e.g during evaluation).
    "pure_exploration_steps": 1000,
    # Extra configuration that disables exploration.
    # NOTE(review): "exploration_final_eps" is not one of the keys defined
    # in this dict (the noise-scale key here is "exploration_final_scale");
    # confirm that this override has the intended effect.
    "evaluation_config": {
        "exploration_fraction": 0,
        "exploration_final_eps": 0,
    },

    # === Replay buffer ===
    # Size of the replay buffer. Note that if async_updates is set, then
    # each worker will have a replay buffer of this size.
    "buffer_size": 50000,
    # If True prioritized replay buffer will be used.
    "prioritized_replay": True,
    # Alpha parameter for prioritized replay buffer.
    "prioritized_replay_alpha": 0.6,
    # Beta parameter for sampling from prioritized replay buffer.
    "prioritized_replay_beta": 0.4,
    # Fraction of entire training period over which the beta parameter is
    # annealed
    "beta_annealing_fraction": 0.2,
    # Final value of beta
    "final_prioritized_replay_beta": 0.4,
    # Epsilon to add to the TD errors when updating priorities.
    "prioritized_replay_eps": 1e-6,
    # Whether to LZ4 compress observations
    "compress_observations": False,

    # === Optimization ===
    # Learning rate for the critic (Q-function) optimizer.
    "critic_lr": 1e-3,
    # Learning rate for the actor (policy) optimizer.
    "actor_lr": 1e-3,
    # Update the target network every `target_network_update_freq` steps.
    "target_network_update_freq": 0,
    # Update the target by \tau * policy + (1-\tau) * target_policy
    "tau": 0.002,
    # If True, use huber loss instead of squared loss for critic network
    # Conventionally, no need to clip gradients if using a huber loss
    "use_huber": False,
    # Threshold of a huber loss
    "huber_threshold": 1.0,
    # Weights for L2 regularization
    "l2_reg": 1e-6,
    # If not None, clip gradients during optimization at this value
    "grad_norm_clipping": None,
    # How many steps of the model to sample before learning starts.
    "learning_starts": 1500,
    # Update the replay buffer with this many samples at once. Note that this
    # setting applies per-worker if num_workers > 1.
    "sample_batch_size": 1,
    # Size of a batch sampled from the replay buffer for training. Note that
    # if async_updates is set, then each worker returns gradients for a
    # batch of this size.
    "train_batch_size": 256,

    # === Parallelism ===
    # Number of workers for collecting samples with. This only makes sense
    # to increase if your environment is particularly slow to sample, or if
    # you're using the Async or Ape-X optimizers.
    "num_workers": 0,
    # Whether to use a distribution of epsilons across workers for exploration.
    "per_worker_exploration": False,
    # Whether to compute priorities on workers.
    "worker_side_prioritization": False,
    # Prevent iterations from going lower than this time span
    "min_iter_time_s": 1,
})
|
||||
# __sphinx_doc_end__
|
||||
# yapf: enable
|
||||
|
||||
|
||||
def make_exploration_schedule(config, worker_index):
    """Build the exploration noise-scale schedule for one worker.

    Modification of DQN's schedule to take into account
    `exploration_ou_noise_scale`.

    Arguments:
        config (dict): trainer config; reads "per_worker_exploration",
            "num_workers", "exploration_should_anneal",
            "exploration_fraction", "schedule_max_timesteps" and
            "exploration_final_scale".
        worker_index (int): index of the worker; a negative value selects
            the local evaluator.

    Returns:
        A ConstantSchedule or LinearSchedule for the noise scale.
    """
    if config["per_worker_exploration"]:
        assert config["num_workers"] > 1, "This requires multiple workers"
        if worker_index < 0:
            # local ev should have zero exploration so that eval rollouts
            # run properly
            return ConstantSchedule(0.0)
        # Exploration constants from the Ape-X paper
        max_index = float(config["num_workers"] - 1)
        exponent = 1 + worker_index / max_index * 7
        return ConstantSchedule(0.4**exponent)

    if config["exploration_should_anneal"]:
        anneal_steps = int(config["exploration_fraction"] *
                           config["schedule_max_timesteps"])
        return LinearSchedule(
            schedule_timesteps=anneal_steps,
            initial_p=1.0,
            final_p=config["exploration_final_scale"])

    # *always* add exploration noise
    return ConstantSchedule(1.0)
|
||||
|
||||
|
||||
def setup_ddpg_exploration(trainer):
    """Attach exploration schedules to ``trainer``.

    Sets ``trainer.exploration0`` (schedule built with worker index -1)
    and ``trainer.explorations`` (one schedule per index in
    ``range(num_workers)``).
    """
    trainer.exploration0 = make_exploration_schedule(trainer.config, -1)
    per_worker = []
    for worker_idx in range(trainer.config["num_workers"]):
        per_worker.append(
            make_exploration_schedule(trainer.config, worker_idx))
    trainer.explorations = per_worker
|
||||
|
||||
|
||||
def add_pure_exploration_phase(trainer):
    """Tell all workers whether they are in the pure exploration phase.

    The phase is active while fewer than "pure_exploration_steps"
    timesteps have been sampled. When that setting is falsy (0/None) the
    policies are left untouched. The per-worker exploration values are
    refreshed afterwards in either case.
    """
    global_timestep = trainer.optimizer.num_steps_sampled
    pure_expl_steps = trainer.config["pure_exploration_steps"]
    if pure_expl_steps:
        # tell workers whether they should do pure exploration
        only_explore = global_timestep < pure_expl_steps

        def set_phase(policy, _unused):
            return policy.set_pure_exploration_phase(only_explore)

        trainer.workers.local_worker().foreach_trainable_policy(set_phase)
        for remote_worker in trainer.workers.remote_workers():
            remote_worker.foreach_trainable_policy.remote(set_phase)
    update_worker_explorations(trainer)
|
||||
|
||||
|
||||
# DDPG trainer: the generic off-policy trainer specialized with the DDPG
# defaults and policy, plus hooks that create the exploration schedules
# before init and toggle the pure exploration phase before every train
# step.
DDPGTrainer = GenericOffPolicyTrainer.with_updates(
    name="DDPG",
    default_config=DEFAULT_CONFIG,
    default_policy=DDPGTFPolicy,
    before_init=setup_ddpg_exploration,
    before_train_step=add_pure_exploration_phase,
)
|
||||
@@ -1,246 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
|
||||
from ray.rllib.utils import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
|
||||
|
||||
class DDPGModel(TFModelV2):
    """Extension of standard TFModel for DDPG.

    Data flow:
        obs -> forward() -> model_out
        model_out -> get_policy_output() -> pi(s)
        model_out, actions -> get_q_values() -> Q(s, a)
        model_out, actions -> get_twin_q_values() -> Q_twin(s, a)

    Note that this class by itself is not a valid model unless you
    implement forward() in a subclass."""

    def __init__(self,
                 obs_space,
                 action_space,
                 num_outputs,
                 model_config,
                 name,
                 actor_hidden_activation="relu",
                 actor_hiddens=(400, 300),
                 critic_hidden_activation="relu",
                 critic_hiddens=(400, 300),
                 parameter_noise=False,
                 twin_q=False,
                 exploration_ou_sigma=0.2):
        """Initialize variables of this model.

        Extra model kwargs:
            actor_hidden_activation (str): activation for actor network
            actor_hiddens (list): hidden layers sizes for actor network
            critic_hidden_activation (str): activation for critic network
            critic_hiddens (list): hidden layers sizes for critic network
            parameter_noise (bool): use param noise exploration
            twin_q (bool): build twin Q networks
            exploration_ou_sigma (float): ou noise sigma for exploration

        Note that the core layers for forward() are not defined here, this
        only defines the layers for the output heads. Those layers for
        forward() should be defined in subclasses of DDPGModel.
        """

        super(DDPGModel, self).__init__(obs_space, action_space, num_outputs,
                                        model_config, name)
        self.exploration_ou_sigma = exploration_ou_sigma

        # np.prod rather than the alias np.product, which was deprecated
        # in NumPy 1.25 and removed in NumPy 2.0.
        self.action_dim = np.prod(action_space.shape)
        self.model_out = tf.keras.layers.Input(
            shape=(num_outputs, ), name="model_out")
        self.actions = tf.keras.layers.Input(
            shape=(self.action_dim, ), name="actions")

        def build_action_net(action_out):
            # Stack the actor hidden layers, then a linear output layer.
            activation = getattr(tf.nn, actor_hidden_activation)
            for i, hidden in enumerate(actor_hiddens):
                if parameter_noise:
                    # Layer-normalized layers are used together with
                    # parameter noise.
                    import tensorflow.contrib.layers as layers
                    action_out = layers.fully_connected(
                        action_out,
                        num_outputs=hidden,
                        activation_fn=activation,
                        normalizer_fn=layers.layer_norm)
                else:
                    action_out = tf.layers.dense(
                        action_out,
                        units=hidden,
                        activation=activation,
                        name="action_hidden_{}".format(i))
            return tf.layers.dense(
                action_out,
                units=self.action_dim,
                activation=None,
                name="action_out")

        action_scope = name + "/action_net"

        # TODO(ekl) use keras layers instead of variable scopes
        def build_action_net_scope(model_out):
            with tf.variable_scope(action_scope, reuse=tf.AUTO_REUSE):
                return build_action_net(model_out)

        pi_out = tf.keras.layers.Lambda(build_action_net_scope)(self.model_out)
        self.action_net = tf.keras.Model(self.model_out, pi_out)
        self.register_variables(self.action_net.variables)

        # Noise vars for P network except for layer normalization vars
        if parameter_noise:
            with tf.variable_scope(action_scope, reuse=tf.AUTO_REUSE):
                self._build_parameter_noise([
                    var for var in self.action_net.variables
                    if "LayerNorm" not in var.name
                ])

        def build_q_net(name, model_out, actions):
            # Critic: concat(model_out, actions) -> hidden layers -> 1 unit.
            q_out = tf.keras.layers.Concatenate(axis=1)([model_out, actions])
            activation = getattr(tf.nn, critic_hidden_activation)
            for i, n in enumerate(critic_hiddens):
                q_out = tf.keras.layers.Dense(
                    n,
                    name="{}_hidden_{}".format(name, i),
                    activation=activation)(q_out)
            q_out = tf.keras.layers.Dense(
                1, activation=None, name="{}_out".format(name))(q_out)
            return tf.keras.Model([model_out, actions], q_out)

        self.q_net = build_q_net("q", self.model_out, self.actions)
        self.register_variables(self.q_net.variables)

        if twin_q:
            self.twin_q_net = build_q_net("twin_q", self.model_out,
                                          self.actions)
            self.register_variables(self.twin_q_net.variables)
        else:
            self.twin_q_net = None

    def forward(self, input_dict, state, seq_lens):
        """This generates the model_out tensor input.

        You must implement this as documented in modelv2.py."""
        raise NotImplementedError

    def get_policy_output(self, model_out):
        """Return the (unscaled) output of the policy network.

        This returns the unscaled outputs of pi(s).

        Arguments:
            model_out (Tensor): obs embeddings from the model layers, of shape
                [BATCH_SIZE, num_outputs].

        Returns:
            tensor of shape [BATCH_SIZE, action_dim] with range [-inf, inf].
        """
        return self.action_net(model_out)

    def get_q_values(self, model_out, actions):
        """Return the Q estimates for the most recent forward pass.

        This implements Q(s, a).

        Arguments:
            model_out (Tensor): obs embeddings from the model layers, of shape
                [BATCH_SIZE, num_outputs].
            actions (Tensor): action values that correspond with the most
                recent batch of observations passed through forward(), of shape
                [BATCH_SIZE, action_dim].

        Returns:
            tensor of shape [BATCH_SIZE, 1] (the output of a 1-unit dense
            layer).
        """
        return self.q_net([model_out, actions])

    def get_twin_q_values(self, model_out, actions):
        """Same as get_q_values but using the twin Q net.

        This implements the twin Q(s, a). Only usable when the model was
        built with twin_q=True; otherwise ``twin_q_net`` is None.

        Arguments:
            model_out (Tensor): obs embeddings from the model layers, of shape
                [BATCH_SIZE, num_outputs].
            actions (Tensor): action values that correspond with the most
                recent batch of observations passed through forward(), of shape
                [BATCH_SIZE, action_dim].

        Returns:
            tensor of shape [BATCH_SIZE, 1] (the output of a 1-unit dense
            layer).
        """
        return self.twin_q_net([model_out, actions])

    def policy_variables(self):
        """Return the list of variables for the policy net."""

        return list(self.action_net.variables)

    def q_variables(self):
        """Return the list of variables for Q / twin Q nets."""

        return self.q_net.variables + (self.twin_q_net.variables
                                       if self.twin_q_net else [])

    def update_action_noise(self, session, distance_in_action_space,
                            exploration_ou_sigma, cur_noise_scale):
        """Update the model action noise settings.

        This is called internally by the DDPG policy.

        The parameter noise sigma is grown by 1% when the measured
        distance is below ``exploration_ou_sigma * cur_noise_scale`` and
        shrunk by 1% otherwise; the new value is then loaded into the
        sigma variable in ``session``."""

        self.pi_distance = distance_in_action_space
        if (distance_in_action_space < exploration_ou_sigma * cur_noise_scale):
            # multiplying the sampled OU noise by noise scale is
            # equivalent to multiplying the sigma of OU by noise scale
            self.parameter_noise_sigma_val *= 1.01
        else:
            self.parameter_noise_sigma_val /= 1.01
        self.parameter_noise_sigma.load(
            self.parameter_noise_sigma_val, session=session)

    def _build_parameter_noise(self, pnet_params):
        """Create the noise variables and the add/remove noise ops.

        Arguments:
            pnet_params (list): non-empty list of policy net variables
                that should receive parameter noise.

        Sets ``parameter_noise_sigma``, ``parameter_noise``,
        ``remove_noise_op``, ``add_noise_op`` and ``pi_distance``.
        """
        assert pnet_params
        self.parameter_noise_sigma_val = self.exploration_ou_sigma
        self.parameter_noise_sigma = tf.get_variable(
            initializer=tf.constant_initializer(
                self.parameter_noise_sigma_val),
            name="parameter_noise_sigma",
            shape=(),
            trainable=False,
            dtype=tf.float32)
        self.parameter_noise = []
        # No need to add any noise on LayerNorm parameters
        for var in pnet_params:
            noise_var = tf.get_variable(
                name=var.name.split(":")[0] + "_noise",
                shape=var.shape,
                initializer=tf.constant_initializer(.0),
                trainable=False)
            self.parameter_noise.append(noise_var)

        # Op that subtracts the currently stored noise from the variables.
        remove_noise_ops = [
            tf.assign_add(var, -var_noise)
            for var, var_noise in zip(pnet_params, self.parameter_noise)
        ]
        self.remove_noise_op = tf.group(*remove_noise_ops)

        # Op that resamples the noise and then adds it to the variables.
        generate_noise_ops = [
            tf.assign(
                var_noise,
                tf.random_normal(
                    shape=var_noise.shape,
                    stddev=self.parameter_noise_sigma))
            for var_noise in self.parameter_noise
        ]
        with tf.control_dependencies(generate_noise_ops):
            add_noise_ops = [
                tf.assign_add(var, var_noise)
                for var, var_noise in zip(pnet_params, self.parameter_noise)
            ]
            self.add_noise_op = tf.group(*add_noise_ops)
        self.pi_distance = None
|
||||
@@ -1,507 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from gym.spaces import Box
|
||||
import numpy as np
|
||||
import logging
|
||||
|
||||
import ray
|
||||
import ray.experimental.tf_utils
|
||||
from ray.rllib.agents.ddpg.ddpg_model import DDPGModel
|
||||
from ray.rllib.agents.ddpg.noop_model import NoopModel
|
||||
from ray.rllib.agents.dqn.dqn_policy import _postprocess_dqn, PRIO_WEIGHTS
|
||||
from ray.rllib.policy.sample_batch import SampleBatch
|
||||
from ray.rllib.policy.tf_policy_template import build_tf_policy
|
||||
from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.utils.annotations import override
|
||||
from ray.rllib.utils.error import UnsupportedSpaceException
|
||||
from ray.rllib.policy.policy import Policy
|
||||
from ray.rllib.policy.tf_policy import TFPolicy
|
||||
from ray.rllib.utils import try_import_tf
|
||||
from ray.rllib.utils.tf_ops import huber_loss, minimize_and_clip
|
||||
|
||||
tf = try_import_tf()
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def build_ddpg_model(policy, obs_space, action_space, config):
    """Create the DDPG model and its target copy for ``policy``.

    Sets ``policy.model`` and ``policy.target_model``. Note that
    ``config`` is modified in place: "use_state_preprocessor" is forced
    to True when a custom model is configured, and
    config["model"]["no_final_linear"] is set when the state preprocessor
    is used.

    Arguments:
        policy: the policy object to attach the models to.
        obs_space: observation space.
        action_space: action space; must be a Box with at most one
            dimension.
        config (dict): trainer config.

    Returns:
        The newly created ``policy.model``.

    Raises:
        UnsupportedSpaceException: for non-Box or multi-dimensional
            action spaces.
    """
    if config["model"]["custom_model"]:
        logger.warning(
            "Setting use_state_preprocessor=True since a custom model "
            "was specified.")
        config["use_state_preprocessor"] = True
    if not isinstance(action_space, Box):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DDPG.".format(action_space))
    if len(action_space.shape) > 1:
        raise UnsupportedSpaceException(
            "Action space has multiple dimensions "
            "{}. ".format(action_space.shape) +
            "Consider reshaping this into a single dimension, "
            "using a Tuple action space, or the multi-agent API.")

    if config["use_state_preprocessor"]:
        default_model = None  # catalog decides
        num_outputs = 256  # arbitrary
        config["model"]["no_final_linear"] = True
    else:
        default_model = NoopModel
        # np.prod rather than the alias np.product, which was deprecated
        # in NumPy 1.25 and removed in NumPy 2.0.
        num_outputs = int(np.prod(obs_space.shape))

    def make_model(model_name):
        # The model and its target copy differ only in their name.
        # NOTE(review): config["exploration_ou_sigma"] is not forwarded,
        # so DDPGModel uses its own default for that kwarg -- confirm
        # whether that is intended.
        return ModelCatalog.get_model_v2(
            obs_space,
            action_space,
            num_outputs,
            config["model"],
            framework="tf",
            model_interface=DDPGModel,
            default_model=default_model,
            name=model_name,
            actor_hidden_activation=config["actor_hidden_activation"],
            actor_hiddens=config["actor_hiddens"],
            critic_hidden_activation=config["critic_hidden_activation"],
            critic_hiddens=config["critic_hiddens"],
            parameter_noise=config["parameter_noise"],
            twin_q=config["twin_q"])

    policy.model = make_model("ddpg_model")
    policy.target_model = make_model("target_ddpg_model")

    return policy.model
|
||||
|
||||
|
||||
def postprocess_trajectory(policy,
                           sample_batch,
                           other_agent_batches=None,
                           episode=None):
    """Postprocess a sampled batch for DDPG.

    When parameter noise is enabled in the policy config, the policy's
    noise sigma is adjusted from the batch first. The batch is then
    handed to the DQN postprocessing function, whose result is returned.
    ``other_agent_batches`` and ``episode`` are accepted but not used.
    """
    uses_param_noise = policy.config["parameter_noise"]
    if uses_param_noise:
        policy.adjust_param_noise_sigma(sample_batch)
    return _postprocess_dqn(policy, sample_batch)
|
||||
|
||||
|
||||
def exploration_setting_inputs(policy):
    """Return the exploration-related inputs of ``policy`` as a dict.

    Keys are the policy's ``stochastic``, ``noise_scale`` and
    ``pure_exploration_phase`` attributes (presumably TF placeholders --
    confirm against the policy class); values are True, the current
    noise scale and the current pure exploration flag.
    """
    feed = {}
    feed[policy.stochastic] = True
    feed[policy.noise_scale] = policy.cur_noise_scale
    feed[policy.pure_exploration_phase] = policy.cur_pure_exploration_phase
    return feed
|
||||
|
||||
|
||||
def build_action_output(policy, model, input_dict, obs_space, action_space,
                        config):
    """Build the action output op of the DDPG policy.

    The policy network output is squashed with a sigmoid and rescaled to
    the bounds of ``action_space``. Depending on ``policy.stochastic``
    (and only if parameter noise is disabled) exploration is added:
    uniform random actions during the pure exploration phase, otherwise
    Gaussian or Ornstein-Uhlenbeck noise as selected by
    config["exploration_noise_type"].

    Also stores the resulting op in ``policy.output_actions``.

    Returns:
        tuple: (actions tensor, None).

    Raises:
        ValueError: if the configured noise type is neither "ou" nor
            "gaussian" (raised while the noisy-action branch is built).
    """
    model_inputs = {
        "obs": input_dict[SampleBatch.CUR_OBS],
        "is_training": policy._get_is_training_placeholder(),
    }
    model_out, _ = model(model_inputs, [], None)
    action_out = model.get_policy_output(model_out)

    # Use sigmoid to scale to [0,1], but also double magnitude of input to
    # emulate behaviour of tanh activation used in DDPG and TD3 papers.
    sigmoid_out = tf.nn.sigmoid(2 * action_out)
    # Rescale to actual env policy scale
    # (shape of sigmoid_out is [batch_size, dim_actions], so we add a
    # leading axis to the bounds to get the same dims)
    batched_range = (action_space.high - action_space.low)[None]
    batched_low = action_space.low[None]
    deterministic_actions = batched_range * sigmoid_out + batched_low

    noise_type = config["exploration_noise_type"]
    action_low = action_space.low
    action_high = action_space.high
    action_range = action_space.high - action_low

    def clip_to_bounds(raw_actions):
        # Clip actions elementwise to [action_low, action_high].
        return tf.clip_by_value(
            raw_actions,
            action_low * tf.ones_like(deterministic_actions),
            action_high * tf.ones_like(deterministic_actions))

    def compute_stochastic_actions():
        def make_noisy_actions():
            # shape of deterministic_actions is [None, dim_action]
            if noise_type == "gaussian":
                # add IID Gaussian noise for exploration, TD3-style
                normal_sample = policy.noise_scale * tf.random_normal(
                    tf.shape(deterministic_actions),
                    stddev=config["exploration_gaussian_sigma"])
                return clip_to_bounds(deterministic_actions + normal_sample)

            if noise_type == "ou":
                # add OU noise for exploration, DDPG-style
                zero_acts = action_low.size * [.0]
                exploration_sample = tf.get_variable(
                    name="ornstein_uhlenbeck",
                    dtype=tf.float32,
                    initializer=zero_acts,
                    trainable=False)
                normal_sample = tf.random_normal(
                    shape=[action_low.size], mean=0.0, stddev=1.0)
                ou_new = config["exploration_ou_theta"] \
                    * -exploration_sample \
                    + config["exploration_ou_sigma"] * normal_sample
                exploration_value = tf.assign_add(exploration_sample, ou_new)
                base_scale = config["exploration_ou_noise_scale"]
                noise = policy.noise_scale * base_scale \
                    * exploration_value * action_range
                return clip_to_bounds(deterministic_actions + noise)

            raise ValueError(
                "Unknown noise type '%s' (try 'ou' or 'gaussian')" %
                noise_type)

        def make_uniform_random_actions():
            # pure random exploration option
            unit_samples = tf.random_uniform(tf.shape(deterministic_actions))
            # rescale uniform random actions according to action range
            tf_range = tf.constant(action_range[None], dtype="float32")
            tf_low = tf.constant(action_low[None], dtype="float32")
            return unit_samples * tf_range + tf_low

        # need to condition on noise_scale > 0 because zeroing
        # noise_scale is how a worker signals no noise should be used
        # (this is ugly and should be fixed by adding an "eval_mode"
        # config flag or something)
        use_pure_exploration = tf.logical_and(policy.pure_exploration_phase,
                                              policy.noise_scale > 0)
        return tf.cond(
            use_pure_exploration,
            true_fn=make_uniform_random_actions,
            false_fn=make_noisy_actions)

    # Action noise is skipped entirely when parameter noise is enabled.
    enable_stochastic = tf.logical_and(policy.stochastic,
                                       not config["parameter_noise"])
    actions = tf.cond(enable_stochastic, compute_stochastic_actions,
                      lambda: deterministic_actions)
    policy.output_actions = actions
    return actions, None
|
||||
|
||||
|
||||
def actor_critic_loss(policy, batch_tensors):
    """Build the combined actor and critic loss for DDPG / TD3.

    Arguments:
        policy: the policy under construction. Its `model`, `target_model`,
            `config` and `action_space` are read, and `q_t`, `td_error`,
            `actor_loss` and `critic_loss` are stored on it for the stats
            and gradient functions.
        batch_tensors: mapping from SampleBatch column names (plus
            PRIO_WEIGHTS) to the tensors of the training batch.

    Returns:
        The sum of the actor loss and the critic loss. The two parts are
        optimized separately; they are only returned combined.
    """
    config = policy.config
    twin_q = config["twin_q"]

    model_out_t, _ = policy.model({
        "obs": batch_tensors[SampleBatch.CUR_OBS],
        "is_training": policy._get_is_training_placeholder(),
    }, [], None)

    model_out_tp1, _ = policy.model({
        "obs": batch_tensors[SampleBatch.NEXT_OBS],
        "is_training": policy._get_is_training_placeholder(),
    }, [], None)

    target_model_out_tp1, _ = policy.target_model({
        "obs": batch_tensors[SampleBatch.NEXT_OBS],
        "is_training": policy._get_is_training_placeholder(),
    }, [], None)

    policy_t = policy.model.get_policy_output(model_out_t)
    # NOTE(review): the next-state action is produced by the online model,
    # not by target_model; TD3 normally uses the target actor here --
    # confirm this is intended.
    policy_tp1 = policy.model.get_policy_output(model_out_tp1)

    if config["smooth_target_policy"]:
        # target policy smoothing: add clipped Gaussian noise to the
        # next-state action, then clip back into the action bounds
        target_noise_clip = config["target_noise_clip"]
        clipped_normal_sample = tf.clip_by_value(
            tf.random_normal(
                tf.shape(policy_tp1), stddev=config["target_noise"]),
            -target_noise_clip, target_noise_clip)
        policy_tp1_smoothed = tf.clip_by_value(
            policy_tp1 + clipped_normal_sample,
            policy.action_space.low * tf.ones_like(policy_tp1),
            policy.action_space.high * tf.ones_like(policy_tp1))
    else:
        policy_tp1_smoothed = policy_tp1

    # q network evaluation
    q_t = policy.model.get_q_values(model_out_t,
                                    batch_tensors[SampleBatch.ACTIONS])
    if twin_q:
        twin_q_t = policy.model.get_twin_q_values(
            model_out_t, batch_tensors[SampleBatch.ACTIONS])

    # Q-values for current policy (no noise) in given current state
    q_t_det_policy = policy.model.get_q_values(model_out_t, policy_t)

    # target q network evaluation
    q_tp1 = policy.target_model.get_q_values(target_model_out_tp1,
                                             policy_tp1_smoothed)
    if twin_q:
        twin_q_tp1 = policy.target_model.get_twin_q_values(
            target_model_out_tp1, policy_tp1_smoothed)

    q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1)
    if twin_q:
        twin_q_t_selected = tf.squeeze(twin_q_t, axis=len(q_t.shape) - 1)
        # use the smaller of the two target critics as the target value
        q_tp1 = tf.minimum(q_tp1, twin_q_tp1)

    q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1)
    # zero out the bootstrap value for terminal transitions
    q_tp1_best_masked = (1.0 - tf.cast(batch_tensors[SampleBatch.DONES],
                                       tf.float32)) * q_tp1_best

    # compute RHS of bellman equation
    q_t_selected_target = tf.stop_gradient(
        batch_tensors[SampleBatch.REWARDS] +
        config["gamma"]**config["n_step"] * q_tp1_best_masked)

    # compute the error (potentially clipped)
    if twin_q:
        q_td_error = q_t_selected - q_t_selected_target
        twin_td_error = twin_q_t_selected - q_t_selected_target
        # The summed error is only what gets reported / fetched as
        # "td_error". Each critic's loss term must be computed from that
        # critic's own TD error; previously the first term was fed the sum.
        td_error = q_td_error + twin_td_error
        if config["use_huber"]:
            errors = huber_loss(q_td_error, config["huber_threshold"]) \
                + huber_loss(twin_td_error, config["huber_threshold"])
        else:
            errors = 0.5 * tf.square(q_td_error) \
                + 0.5 * tf.square(twin_td_error)
    else:
        td_error = q_t_selected - q_t_selected_target
        if config["use_huber"]:
            errors = huber_loss(td_error, config["huber_threshold"])
        else:
            errors = 0.5 * tf.square(td_error)

    critic_loss = policy.model.custom_loss(
        tf.reduce_mean(batch_tensors[PRIO_WEIGHTS] * errors), batch_tensors)
    actor_loss = -tf.reduce_mean(q_t_det_policy)

    if config["l2_reg"] is not None:
        # L2 regularization on all non-bias variables
        for var in policy.model.policy_variables():
            if "bias" not in var.name:
                actor_loss += config["l2_reg"] * tf.nn.l2_loss(var)
        for var in policy.model.q_variables():
            if "bias" not in var.name:
                critic_loss += config["l2_reg"] * tf.nn.l2_loss(var)

    # save for stats function
    policy.q_t = q_t
    policy.td_error = td_error
    policy.actor_loss = actor_loss
    policy.critic_loss = critic_loss

    # in a custom apply op we handle the losses separately, but return them
    # combined in one loss for now
    return actor_loss + critic_loss
|
||||
|
||||
|
||||
def gradients(policy, optimizer, loss):
    """Compute actor and critic gradients with their own optimizers.

    The `optimizer` and `loss` arguments are not used; the per-network
    optimizers and losses stored on `policy` are used instead.

    Returns:
        The actor (grad, var) pairs followed by the critic (grad, var)
        pairs, with pairs whose gradient is None removed.
    """
    clip_val = policy.config["grad_norm_clipping"]
    if clip_val is None:
        actor_pairs = policy._actor_optimizer.compute_gradients(
            policy.actor_loss, var_list=policy.model.policy_variables())
        critic_pairs = policy._critic_optimizer.compute_gradients(
            policy.critic_loss, var_list=policy.model.q_variables())
    else:
        actor_pairs = minimize_and_clip(
            policy._actor_optimizer,
            policy.actor_loss,
            var_list=policy.model.policy_variables(),
            clip_val=clip_val)
        critic_pairs = minimize_and_clip(
            policy._critic_optimizer,
            policy.critic_loss,
            var_list=policy.model.q_variables(),
            clip_val=clip_val)

    # keep the filtered lists on the policy; the apply step reads them
    policy._actor_grads_and_vars = [
        (grad, var) for (grad, var) in actor_pairs if grad is not None
    ]
    policy._critic_grads_and_vars = [
        (grad, var) for (grad, var) in critic_pairs if grad is not None
    ]
    return policy._actor_grads_and_vars + policy._critic_grads_and_vars
|
||||
|
||||
|
||||
def apply_gradients(policy, optimizer, grads_and_vars):
    """Build the op that applies the stored actor and critic gradients.

    The critic is updated on every call. The actor is only updated when
    `global_step` is a multiple of config["policy_delay"]. The `optimizer`
    and `grads_and_vars` arguments are not used; the gradients saved on
    `policy` are applied instead.
    """
    is_actor_step = tf.equal(
        tf.mod(policy.global_step, policy.config["policy_delay"]), 0)

    def _apply_actor_grads():
        return policy._actor_optimizer.apply_gradients(
            policy._actor_grads_and_vars)

    def _skip_actor_update():
        return tf.no_op()

    actor_op = tf.cond(
        is_actor_step,
        true_fn=_apply_actor_grads,
        false_fn=_skip_actor_update)
    critic_op = policy._critic_optimizer.apply_gradients(
        policy._critic_grads_and_vars)

    # the grouped op is created under a dependency on the step increment
    step_increment = tf.assign_add(policy.global_step, 1)
    with tf.control_dependencies([step_increment]):
        return tf.group(actor_op, critic_op)
|
||||
|
||||
|
||||
def stats(policy, batch_tensors):
    """Return scalar summaries of the tensors saved by the loss function.

    `batch_tensors` is not used.
    """
    q_values = policy.q_t
    return {
        "td_error": tf.reduce_mean(policy.td_error),
        "actor_loss": tf.reduce_mean(policy.actor_loss),
        "critic_loss": tf.reduce_mean(policy.critic_loss),
        "mean_q": tf.reduce_mean(q_values),
        "max_q": tf.reduce_max(q_values),
        "min_q": tf.reduce_min(q_values),
    }
|
||||
|
||||
|
||||
class ExplorationStateMixin(object):
    """Holds the exploration settings and the placeholders that feed them."""

    def __init__(self, obs_space, action_space, config):
        # Python-side values, changed through the setters below
        self.cur_noise_scale = 1.0
        self.cur_pure_exploration_phase = False
        # scalar graph inputs
        self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
        self.noise_scale = tf.placeholder(tf.float32, (), name="noise_scale")
        self.pure_exploration_phase = tf.placeholder(
            tf.bool, (), name="pure_exploration_phase")

    def add_parameter_noise(self):
        """Run the model's add-noise op if parameter noise is enabled."""
        if not self.config["parameter_noise"]:
            return
        self.get_session().run(self.model.add_noise_op)

    def adjust_param_noise_sigma(self, sample_batch):
        """Adjust the sigma of the parameter space noise.

        Removes the noise, recomputes the actions for the batch
        observations without exploration, and passes the RMS distance to
        the recorded actions on to the model.
        """
        obs_column, action_column = sample_batch.columns(
            [SampleBatch.CUR_OBS, SampleBatch.ACTIONS])
        states = list(obs_column)
        noisy_actions = list(action_column)

        session = self.get_session()
        session.run(self.model.remove_noise_op)
        no_exploration_feed = {
            self.get_placeholder(SampleBatch.CUR_OBS): states,
            self.stochastic: False,
            self.noise_scale: .0,
            self.pure_exploration_phase: False,
        }
        clean_actions = session.run(
            self.output_actions, feed_dict=no_exploration_feed)

        squared_diff = np.square(clean_actions - noisy_actions)
        distance_in_action_space = np.sqrt(np.mean(squared_diff))
        self.model.update_action_noise(
            session, distance_in_action_space,
            self.config["exploration_ou_sigma"], self.cur_noise_scale)

    def set_epsilon(self, epsilon):
        """Set the current noise scale.

        set_epsilon is called by the optimizer to anneal exploration as
        necessary, and to turn it off during evaluation. The "epsilon" name
        is a carry-over from DQN, which uses epsilon-greedy exploration
        rather than adding action noise to the output of a policy network.
        """
        self.cur_noise_scale = epsilon

    def set_pure_exploration_phase(self, pure_exploration_phase):
        """Record whether the policy is in the pure exploration phase."""
        self.cur_pure_exploration_phase = pure_exploration_phase

    @override(Policy)
    def get_state(self):
        base_state = TFPolicy.get_state(self)
        return [
            base_state,
            self.cur_noise_scale,
            self.cur_pure_exploration_phase,
        ]

    @override(Policy)
    def set_state(self, state):
        # same layout as the list returned by get_state()
        TFPolicy.set_state(self, state[0])
        self.set_epsilon(state[1])
        self.set_pure_exploration_phase(state[2])
|
||||
|
||||
|
||||
class TargetNetworkMixin(object):
    """Adds a target-network update op and a method to run it."""

    def __init__(self, config):
        # update_target_fn will be called periodically to copy Q network to
        # target Q network
        self.tau_value = config.get("tau")
        self.tau = tf.placeholder(tf.float32, (), name="tau")
        update_target_expr = []
        model_vars = self.model.trainable_variables()
        target_model_vars = self.target_model.trainable_variables()
        assert len(model_vars) == len(target_model_vars), \
            (model_vars, target_model_vars)
        for var, var_target in zip(model_vars, target_model_vars):
            # target <- tau * online + (1 - tau) * target
            update_target_expr.append(
                var_target.assign(self.tau * var +
                                  (1.0 - self.tau) * var_target))
            logger.debug("Update target op {}".format(var_target))
        self.update_target_expr = tf.group(*update_target_expr)

        # Hard initial update
        self.update_target(tau=1.0)

    # support both hard and soft sync
    def update_target(self, tau=None):
        """Run the target update op.

        Arguments:
            tau (float): interpolation factor fed to the update op; 1.0
                copies the model variables into the target. When None, the
                configured "tau" value is used.

        Returns:
            The result of running the update op in the session.
        """
        # Compare against None explicitly so that an explicit tau=0.0 is
        # fed as given instead of being replaced by the configured value.
        if tau is None:
            tau = self.tau_value
        return self.get_session().run(
            self.update_target_expr, feed_dict={self.tau: tau})
|
||||
|
||||
|
||||
class ActorCriticOptimizerMixin(object):
    """Creates the global step and one Adam optimizer per network."""

    def __init__(self, config):
        # global step counts the number of update operations
        self.global_step = tf.train.get_or_create_global_step()

        # actor and critic are optimized independently, each with its own
        # learning rate
        actor_lr = config["actor_lr"]
        self._actor_optimizer = tf.train.AdamOptimizer(learning_rate=actor_lr)
        critic_lr = config["critic_lr"]
        self._critic_optimizer = tf.train.AdamOptimizer(
            learning_rate=critic_lr)
|
||||
|
||||
|
||||
class ComputeTDErrorMixin(object):
    """Adds a method that evaluates the TD error for a batch."""

    def compute_td_error(self, obs_t, act_t, rew_t, obs_tp1, done_mask,
                         importance_weights):
        """Evaluate the TD error tensor for the given transitions.

        Returns:
            The evaluated TD error, or zeros shaped like `rew_t` when the
            loss has not been initialized yet.
        """
        if not self.loss_initialized():
            return np.zeros_like(rew_t)

        placeholder = self.get_placeholder
        feed_dict = {
            placeholder(SampleBatch.CUR_OBS): [np.array(ob) for ob in obs_t],
            placeholder(SampleBatch.ACTIONS): act_t,
            placeholder(SampleBatch.REWARDS): rew_t,
            placeholder(SampleBatch.NEXT_OBS): [
                np.array(ob) for ob in obs_tp1
            ],
            placeholder(SampleBatch.DONES): done_mask,
            placeholder(PRIO_WEIGHTS): importance_weights,
        }
        return self.get_session().run(self.td_error, feed_dict=feed_dict)
|
||||
|
||||
|
||||
def setup_early_mixins(policy, obs_space, action_space, config):
    """Initialize the exploration and optimizer mixins on `policy`.

    Passed to build_tf_policy as the `before_init` hook.
    """
    # exploration placeholders first, then global step and optimizers
    ExplorationStateMixin.__init__(policy, obs_space, action_space, config)
    ActorCriticOptimizerMixin.__init__(policy, config)
|
||||
|
||||
|
||||
def setup_late_mixins(policy, obs_space, action_space, config):
    """Initialize the target network mixin on `policy`.

    Passed to build_tf_policy as the `after_init` hook. Only `policy` and
    `config` are used.
    """
    TargetNetworkMixin.__init__(policy, config)
|
||||
|
||||
|
||||
# DDPG policy class, assembled from the functions and mixins defined above.
DDPGTFPolicy = build_tf_policy(
    name="DDPGTFPolicy",
    # wrapped in a lambda so the ddpg module attribute is only looked up
    # when the default config is requested
    get_default_config=lambda: ray.rllib.agents.ddpg.ddpg.DEFAULT_CONFIG,
    make_model=build_ddpg_model,
    postprocess_fn=postprocess_trajectory,
    extra_action_feed_fn=exploration_setting_inputs,
    action_sampler_fn=build_action_output,
    loss_fn=actor_critic_loss,
    stats_fn=stats,
    gradients_fn=gradients,
    apply_gradients_fn=apply_gradients,
    extra_learn_fetches_fn=lambda policy: {"td_error": policy.td_error},
    mixins=[
        TargetNetworkMixin,
        ExplorationStateMixin,
        ActorCriticOptimizerMixin,
        ComputeTDErrorMixin,
    ],
    before_init=setup_early_mixins,
    after_init=setup_late_mixins,
    obs_include_prev_action_reward=False)
|
||||
@@ -1,20 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from ray.rllib.models import Model
|
||||
from ray.rllib.utils.annotations import override
|
||||
from ray.rllib.utils import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
|
||||
|
||||
class NoopModel(Model):
    """Trivial model that just returns the obs flattened.

    This is the model used if use_state_preprocessor=False."""

    @override(Model)
    def _build_layers_v2(self, input_dict, num_outputs, options):
        # reshape the observations to [-1, num_outputs]; the same tensor
        # is returned for both outputs
        flat_obs = tf.reshape(input_dict["obs"], [-1, num_outputs])
        return flat_obs, flat_obs
|
||||
@@ -1,57 +0,0 @@
|
||||
"""A more stable successor to DDPG.
|
||||
|
||||
By default, this uses a near-identical configuration to that reported in the
|
||||
TD3 paper.
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from ray.rllib.agents.ddpg.ddpg import DDPGTrainer, \
|
||||
DEFAULT_CONFIG as DDPG_CONFIG
|
||||
from ray.rllib.utils import merge_dicts
|
||||
|
||||
# TD3 defaults: the DDPG defaults with the overrides below merged on top.
TD3_DEFAULT_CONFIG = merge_dicts(
    DDPG_CONFIG,
    {
        # --- Largest changes: twin Q functions, delayed policy updates,
        # --- and target smoothing
        "twin_q": True,
        "policy_delay": 2,
        "smooth_target_policy": True,
        "target_noise": 0.2,
        "target_noise_clip": 0.5,

        # --- Other changes & things we want to keep fixed: IID Gaussian
        # --- exploration noise, larger actor learning rate, no l2
        # --- regularisation, no Huber loss, etc.
        "exploration_should_anneal": False,
        "exploration_noise_type": "gaussian",
        "exploration_gaussian_sigma": 0.1,
        "learning_starts": 10000,
        "pure_exploration_steps": 10000,
        "actor_hiddens": [400, 300],
        "critic_hiddens": [400, 300],
        "n_step": 1,
        "gamma": 0.99,
        "actor_lr": 1e-3,
        "critic_lr": 1e-3,
        "l2_reg": 0.0,
        "tau": 5e-3,
        "train_batch_size": 100,
        "use_huber": False,
        "target_network_update_freq": 0,
        "num_workers": 0,
        "num_gpus_per_worker": 0,
        "per_worker_exploration": False,
        "worker_side_prioritization": False,
        "buffer_size": 1000000,
        "prioritized_replay": False,
        "clip_rewards": False,
        "use_state_preprocessor": False,
    },
)

# TD3 is the DDPG trainer with a different name and default config.
TD3Trainer = DDPGTrainer.with_updates(
    name="TD3", default_config=TD3_DEFAULT_CONFIG)
|
||||
@@ -1 +0,0 @@
|
||||
Code in this package is adapted from https://github.com/openai/baselines/tree/master/baselines/deepq.
|
||||
@@ -1,15 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from ray.rllib.agents.dqn.apex import ApexTrainer
|
||||
from ray.rllib.agents.dqn.dqn import DQNTrainer, SimpleQTrainer, DEFAULT_CONFIG
|
||||
from ray.rllib.utils import renamed_agent
|
||||
|
||||
# Aliases for the trainers under their previous "Agent" names.
DQNAgent = renamed_agent(DQNTrainer)
ApexAgent = renamed_agent(ApexTrainer)

__all__ = [
    "DQNAgent",
    "ApexAgent",
    "ApexTrainer",
    "DQNTrainer",
    "DEFAULT_CONFIG",
    "SimpleQTrainer",
]
|
||||
@@ -1,84 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from ray.rllib.agents.dqn.dqn import DQNTrainer, DEFAULT_CONFIG as DQN_CONFIG
|
||||
from ray.rllib.optimizers import AsyncReplayOptimizer
|
||||
from ray.rllib.utils import merge_dicts
|
||||
|
||||
# yapf: disable
|
||||
# __sphinx_doc_begin__
|
||||
APEX_DEFAULT_CONFIG = merge_dicts(
    DQN_CONFIG,  # see also the options in dqn.py, which are also supported
    {
        # Ape-X specific optimizer settings, merged over the DQN ones
        "optimizer": merge_dicts(
            DQN_CONFIG["optimizer"], {
                "max_weight_sync_delay": 400,
                "num_replay_buffer_shards": 4,
                "debug": False
            }),
        "n_step": 3,
        "num_gpus": 1,
        "num_workers": 32,
        "buffer_size": 2000000,
        "learning_starts": 50000,
        "train_batch_size": 512,
        "sample_batch_size": 50,
        "target_network_update_freq": 500000,
        "timesteps_per_iteration": 25000,
        "per_worker_exploration": True,
        "worker_side_prioritization": True,
        "min_iter_time_s": 30,
    },
)
|
||||
# __sphinx_doc_end__
|
||||
# yapf: enable
|
||||
|
||||
|
||||
def defer_make_workers(trainer, env_creator, policy, config):
    """Create the worker set with zero remote workers.

    Hack to work around https://github.com/ray-project/ray/issues/2541.
    The remote workers will be created later, after the optimizer is
    created.
    """
    num_remote_workers = 0
    return trainer._make_workers(env_creator, policy, config,
                                 num_remote_workers)
|
||||
|
||||
|
||||
def make_async_optimizer(workers, config):
    """Create the AsyncReplayOptimizer, then add the remote workers.

    Expects a worker set without remote workers. They are added only
    after the optimizer has been constructed and are then handed to it.
    """
    assert len(workers.remote_workers()) == 0

    # start from the "optimizer" sub-config and copy over the prioritized
    # replay options that are set at the top level of the config
    replay_keys = (
        "prioritized_replay",
        "prioritized_replay_alpha",
        "prioritized_replay_beta",
        "prioritized_replay_eps",
    )
    extra_config = config["optimizer"].copy()
    extra_config.update(
        {key: config[key] for key in replay_keys if key in config})

    opt = AsyncReplayOptimizer(
        workers,
        learning_starts=config["learning_starts"],
        buffer_size=config["buffer_size"],
        train_batch_size=config["train_batch_size"],
        sample_batch_size=config["sample_batch_size"],
        **extra_config)

    workers.add_workers(config["num_workers"])
    opt._set_workers(workers.remote_workers())
    return opt
|
||||
|
||||
|
||||
def update_target_based_on_num_steps_trained(trainer, fetches):
    """Update the target networks once enough steps have been trained.

    Ape-X updates based on num steps trained, not sampled. `fetches` is
    not used.
    """
    steps_trained = trainer.optimizer.num_steps_trained
    steps_since_update = (
        steps_trained - trainer.state["last_target_update_ts"])
    if not steps_since_update > trainer.config["target_network_update_freq"]:
        return

    trainer.workers.local_worker().foreach_trainable_policy(
        lambda p, _: p.update_target())
    trainer.state["last_target_update_ts"] = (
        trainer.optimizer.num_steps_trained)
    trainer.state["num_target_updates"] += 1
|
||||
|
||||
|
||||
# Hooks that Ape-X overrides on the DQN trainer.
APEX_TRAINER_PROPERTIES = {
    "make_workers": defer_make_workers,
    "make_policy_optimizer": make_async_optimizer,
    "after_optimizer_step": update_target_based_on_num_steps_trained,
}

# Ape-X is the DQN trainer with its own default config and the hooks above.
ApexTrainer = DQNTrainer.with_updates(
    name="APEX",
    default_config=APEX_DEFAULT_CONFIG,
    **APEX_TRAINER_PROPERTIES)
|
||||
@@ -1,261 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
|
||||
from ray.rllib.utils import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
|
||||
|
||||
class DistributionalQModel(TFModelV2):
    """Extension of standard TFModel to provide distributional Q values.

    It also supports options for noisy nets and parameter space noise.

    Data flow:
        obs -> forward() -> model_out
        model_out -> get_q_value_distributions() -> Q(s, a) atoms
        model_out -> get_state_value() -> V(s)

    Note that this class by itself is not a valid model unless you
    implement forward() in a subclass."""

    def __init__(self,
                 obs_space,
                 action_space,
                 num_outputs,
                 model_config,
                 name,
                 q_hiddens=(256, ),
                 dueling=False,
                 num_atoms=1,
                 use_noisy=False,
                 v_min=-10.0,
                 v_max=10.0,
                 sigma0=0.5,
                 parameter_noise=False):
        """Initialize variables of this model.

        Extra model kwargs:
            q_hiddens (list): defines size of hidden layers for the q head.
                These will be used to postprocess the model output for the
                purposes of computing Q values.
            dueling (bool): whether to build the state value head for DDQN
            num_atoms (int): if >1, enables distributional DQN
            use_noisy (bool): use noisy nets
            v_min (float): min value support for distributional DQN
            v_max (float): max value support for distributional DQN
            sigma0 (float): initial value of noisy nets
            parameter_noise (bool): enable layer norm for param noise

        Note that the core layers for forward() are not defined here, this
        only defines the layers for the Q head. Those layers for forward()
        should be defined in subclasses of DistributionalQModel.
        """

        super(DistributionalQModel, self).__init__(
            obs_space, action_space, num_outputs, model_config, name)

        # setup the Q head output (i.e., model for get_q_values)
        self.model_out = tf.keras.layers.Input(
            shape=(num_outputs, ), name="model_out")

        def build_action_value(model_out):
            # With no q_hiddens the loop does not run and the model output
            # is used as is. This enables custom models to be used for
            # parametric action DQN.
            action_out = model_out
            for i, hidden_size in enumerate(q_hiddens):
                if use_noisy:
                    action_out = self._noisy_layer(
                        "hidden_%d" % i, action_out, hidden_size, sigma0)
                elif parameter_noise:
                    import tensorflow.contrib.layers as layers
                    action_out = layers.fully_connected(
                        action_out,
                        num_outputs=hidden_size,
                        activation_fn=tf.nn.relu,
                        normalizer_fn=layers.layer_norm)
                else:
                    action_out = tf.layers.dense(
                        action_out,
                        units=hidden_size,
                        activation=tf.nn.relu,
                        name="hidden_%d" % i)

            if use_noisy:
                action_scores = self._noisy_layer(
                    "output",
                    action_out,
                    self.action_space.n * num_atoms,
                    sigma0,
                    non_linear=False)
            elif q_hiddens:
                action_scores = tf.layers.dense(
                    action_out,
                    units=self.action_space.n * num_atoms,
                    activation=None)
            else:
                action_scores = model_out

            if num_atoms > 1:
                # Distributional Q-learning uses a discrete support z
                # to represent the action value distribution
                z = tf.range(num_atoms, dtype=tf.float32)
                z = v_min + z * (v_max - v_min) / float(num_atoms - 1)
                support_logits_per_action = tf.reshape(
                    tensor=action_scores,
                    shape=(-1, self.action_space.n, num_atoms))
                support_prob_per_action = tf.nn.softmax(
                    logits=support_logits_per_action)
                # expected value over the support for every action
                action_scores = tf.reduce_sum(
                    input_tensor=z * support_prob_per_action, axis=-1)
                logits = support_logits_per_action
                dist = support_prob_per_action
                return [
                    action_scores, z, support_logits_per_action, logits, dist
                ]
            else:
                # non-distributional case: logits and dist are all ones
                logits = tf.expand_dims(tf.ones_like(action_scores), -1)
                dist = tf.expand_dims(tf.ones_like(action_scores), -1)
                return [action_scores, logits, dist]

        def build_state_score(model_out):
            state_out = model_out
            for i, hidden_size in enumerate(q_hiddens):
                if use_noisy:
                    state_out = self._noisy_layer("dueling_hidden_%d" % i,
                                                  state_out, hidden_size,
                                                  sigma0)
                elif parameter_noise:
                    state_out = tf.contrib.layers.fully_connected(
                        state_out,
                        num_outputs=hidden_size,
                        activation_fn=tf.nn.relu,
                        normalizer_fn=tf.contrib.layers.layer_norm)
                else:
                    state_out = tf.layers.dense(
                        state_out, units=hidden_size, activation=tf.nn.relu)
            if use_noisy:
                state_score = self._noisy_layer(
                    "dueling_output",
                    state_out,
                    num_atoms,
                    sigma0,
                    non_linear=False)
            else:
                state_score = tf.layers.dense(
                    state_out, units=num_atoms, activation=None)
            return state_score

        def build_action_value_in_scope(model_out):
            with tf.variable_scope(
                    name + "/action_value", reuse=tf.AUTO_REUSE):
                return build_action_value(model_out)

        def build_state_score_in_scope(model_out):
            with tf.variable_scope(name + "/state_value", reuse=tf.AUTO_REUSE):
                return build_state_score(model_out)

        q_out = tf.keras.layers.Lambda(build_action_value_in_scope)(
            self.model_out)
        self.q_value_head = tf.keras.Model(self.model_out, q_out)
        self.register_variables(self.q_value_head.variables)

        # the state value head is only created for dueling
        if dueling:
            state_out = tf.keras.layers.Lambda(build_state_score_in_scope)(
                self.model_out)
            self.state_value_head = tf.keras.Model(self.model_out, state_out)
            self.register_variables(self.state_value_head.variables)

    def forward(self, input_dict, state, seq_lens):
        """This generates the model_out tensor input.

        You must implement this as documented in modelv2.py."""
        raise NotImplementedError

    def get_q_value_distributions(self, model_out):
        """Returns distributional values for Q(s, a) given a state embedding.

        Override this in your custom model to customize the Q output head.

        Arguments:
            model_out (Tensor): embedding from the model layers

        Returns:
            (action_scores, logits, dist) if num_atoms == 1, otherwise
            (action_scores, z, support_logits_per_action, logits, dist)
        """

        return self.q_value_head(model_out)

    def get_state_value(self, model_out):
        """Returns the state value prediction for the given state embedding.

        The state value head only exists when the model was constructed
        with dueling=True.
        """

        return self.state_value_head(model_out)

    def _noisy_layer(self,
                     prefix,
                     action_in,
                     out_size,
                     sigma0,
                     non_linear=True):
        r"""
        a common dense layer: y = w^{T}x + b
        a noisy layer: y = (w + \epsilon_w*\sigma_w)^{T}x +
            (b+\epsilon_b*\sigma_b)
        where \epsilon are random variables sampled from factorized normal
        distributions and \sigma are trainable variables which are expected to
        vanish along the training procedure
        """
        import tensorflow.contrib.layers as layers

        in_size = int(action_in.shape[1])

        # factorized noise: one vector per input and output dimension,
        # combined by an outer product into the weight noise
        epsilon_in = tf.random_normal(shape=[in_size])
        epsilon_out = tf.random_normal(shape=[out_size])
        epsilon_in = self._f_epsilon(epsilon_in)
        epsilon_out = self._f_epsilon(epsilon_out)
        epsilon_w = tf.matmul(
            a=tf.expand_dims(epsilon_in, -1), b=tf.expand_dims(epsilon_out, 0))
        epsilon_b = epsilon_out
        sigma_w = tf.get_variable(
            name=prefix + "_sigma_w",
            shape=[in_size, out_size],
            dtype=tf.float32,
            initializer=tf.random_uniform_initializer(
                minval=-1.0 / np.sqrt(float(in_size)),
                maxval=1.0 / np.sqrt(float(in_size))))
        # TF noise generation can be unreliable on GPU
        # If generating the noise on the CPU,
        # lowering sigma0 to 0.1 may be helpful
        sigma_b = tf.get_variable(
            name=prefix + "_sigma_b",
            shape=[out_size],
            dtype=tf.float32,  # 0.5~GPU, 0.1~CPU
            initializer=tf.constant_initializer(
                sigma0 / np.sqrt(float(in_size))))

        w = tf.get_variable(
            name=prefix + "_fc_w",
            shape=[in_size, out_size],
            dtype=tf.float32,
            initializer=layers.xavier_initializer())
        b = tf.get_variable(
            name=prefix + "_fc_b",
            shape=[out_size],
            dtype=tf.float32,
            initializer=tf.zeros_initializer())

        action_activation = tf.nn.xw_plus_b(action_in, w + sigma_w * epsilon_w,
                                            b + sigma_b * epsilon_b)

        if not non_linear:
            return action_activation
        return tf.nn.relu(action_activation)

    def _f_epsilon(self, x):
        # signed square root: sign(x) * sqrt(|x|)
        return tf.sign(x) * tf.sqrt(tf.abs(x))
|
||||
@@ -1,300 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import logging
|
||||
|
||||
from ray import tune
|
||||
from ray.rllib.agents.trainer import with_common_config
|
||||
from ray.rllib.agents.trainer_template import build_trainer
|
||||
from ray.rllib.agents.dqn.dqn_policy import DQNTFPolicy
|
||||
from ray.rllib.agents.dqn.simple_q_policy import SimpleQPolicy
|
||||
from ray.rllib.optimizers import SyncReplayOptimizer
|
||||
from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID
|
||||
from ray.rllib.utils.schedules import ConstantSchedule, LinearSchedule
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# yapf: disable
|
||||
# __sphinx_doc_begin__
|
||||
DEFAULT_CONFIG = with_common_config({
|
||||
# === Model ===
|
||||
# Number of atoms for representing the distribution of return. When
|
||||
# this is greater than 1, distributional Q-learning is used.
|
||||
# the discrete supports are bounded by v_min and v_max
|
||||
"num_atoms": 1,
|
||||
"v_min": -10.0,
|
||||
"v_max": 10.0,
|
||||
# Whether to use noisy network
|
||||
"noisy": False,
|
||||
# control the initial value of noisy nets
|
||||
"sigma0": 0.5,
|
||||
# Whether to use dueling dqn
|
||||
"dueling": True,
|
||||
# Whether to use double dqn
|
||||
"double_q": True,
|
||||
# Postprocess model outputs with these hidden layers to compute the
|
||||
# state and action values. See also the model config in catalog.py.
|
||||
"hiddens": [256],
|
||||
# N-step Q learning
|
||||
"n_step": 1,
|
||||
|
||||
# === Exploration ===
|
||||
# Max num timesteps for annealing schedules. Exploration is annealed from
|
||||
# 1.0 to exploration_final_eps over this number of timesteps scaled by
|
||||
# exploration_fraction
|
||||
"schedule_max_timesteps": 100000,
|
||||
# Minimum env steps to optimize for per train call. This value does
|
||||
# not affect learning, only the length of iterations.
|
||||
"timesteps_per_iteration": 1000,
|
||||
# Fraction of entire training period over which the exploration rate is
|
||||
# annealed
|
||||
"exploration_fraction": 0.1,
|
||||
# Final value of random action probability
|
||||
"exploration_final_eps": 0.02,
|
||||
# Update the target network every `target_network_update_freq` steps.
|
||||
"target_network_update_freq": 500,
|
||||
# Use softmax for sampling actions. Required for off policy estimation.
|
||||
"soft_q": False,
|
||||
# Softmax temperature. Q values are divided by this value prior to softmax.
|
||||
# Softmax approaches argmax as the temperature drops to zero.
|
||||
"softmax_temp": 1.0,
|
||||
# If True parameter space noise will be used for exploration
|
||||
# See https://blog.openai.com/better-exploration-with-parameter-noise/
|
||||
"parameter_noise": False,
|
||||
# Extra configuration that disables exploration.
|
||||
"evaluation_config": {
|
||||
"exploration_fraction": 0,
|
||||
"exploration_final_eps": 0,
|
||||
},
|
||||
|
||||
# === Replay buffer ===
|
||||
# Size of the replay buffer. Note that if async_updates is set, then
|
||||
# each worker will have a replay buffer of this size.
|
||||
"buffer_size": 50000,
|
||||
# If True prioritized replay buffer will be used.
|
||||
"prioritized_replay": True,
|
||||
# Alpha parameter for prioritized replay buffer.
|
||||
"prioritized_replay_alpha": 0.6,
|
||||
# Beta parameter for sampling from prioritized replay buffer.
|
||||
"prioritized_replay_beta": 0.4,
|
||||
# Fraction of entire training period over which the beta parameter is
|
||||
# annealed
|
||||
"beta_annealing_fraction": 0.2,
|
||||
# Final value of beta
|
||||
"final_prioritized_replay_beta": 0.4,
|
||||
# Epsilon to add to the TD errors when updating priorities.
|
||||
"prioritized_replay_eps": 1e-6,
|
||||
# Whether to LZ4 compress observations
|
||||
"compress_observations": True,
|
||||
|
||||
# === Optimization ===
|
||||
# Learning rate for adam optimizer
|
||||
"lr": 5e-4,
|
||||
# Learning rate schedule
|
||||
"lr_schedule": None,
|
||||
# Adam epsilon hyper parameter
|
||||
"adam_epsilon": 1e-8,
|
||||
# If not None, clip gradients during optimization at this value
|
||||
"grad_norm_clipping": 40,
|
||||
# How many steps of the model to sample before learning starts.
|
||||
"learning_starts": 1000,
|
||||
# Update the replay buffer with this many samples at once. Note that
|
||||
# this setting applies per-worker if num_workers > 1.
|
||||
"sample_batch_size": 4,
|
||||
# Size of a batch sampled from the replay buffer for training. Note that
|
||||
# if async_updates is set, then each worker returns gradients for a
|
||||
# batch of this size.
|
||||
"train_batch_size": 32,
|
||||
|
||||
# === Parallelism ===
|
||||
# Number of workers for collecting samples with. This only makes sense
|
||||
# to increase if your environment is particularly slow to sample, or if
|
||||
# you're using the Async or Ape-X optimizers.
|
||||
"num_workers": 0,
|
||||
# Whether to use a distribution of epsilons across workers for exploration.
|
||||
"per_worker_exploration": False,
|
||||
# Whether to compute priorities on workers.
|
||||
"worker_side_prioritization": False,
|
||||
# Prevent iterations from going lower than this time span
|
||||
"min_iter_time_s": 1,
|
||||
})
|
||||
# __sphinx_doc_end__
|
||||
# yapf: enable
|
||||
|
||||
|
||||
def make_optimizer(workers, config):
    """Create the SyncReplayOptimizer for a trainer.

    Arguments:
        workers: worker set handed to the optimizer as its first argument.
        config (dict): trainer config; the replay-related entries are passed
            by name and everything under config["optimizer"] is forwarded
            as additional keyword arguments.

    Returns:
        the constructed SyncReplayOptimizer.
    """
    cfg = config
    return SyncReplayOptimizer(
        workers,
        learning_starts=cfg["learning_starts"],
        buffer_size=cfg["buffer_size"],
        prioritized_replay=cfg["prioritized_replay"],
        prioritized_replay_alpha=cfg["prioritized_replay_alpha"],
        prioritized_replay_beta=cfg["prioritized_replay_beta"],
        schedule_max_timesteps=cfg["schedule_max_timesteps"],
        beta_annealing_fraction=cfg["beta_annealing_fraction"],
        final_prioritized_replay_beta=cfg["final_prioritized_replay_beta"],
        prioritized_replay_eps=cfg["prioritized_replay_eps"],
        train_batch_size=cfg["train_batch_size"],
        sample_batch_size=cfg["sample_batch_size"],
        **cfg["optimizer"])
|
||||
|
||||
|
||||
def check_config_and_setup_param_noise(config):
    """Update the config based on settings.

    Rewrites sample_batch_size to take into account n_step truncation, and also
    adds the necessary callbacks to support parameter space noise exploration.

    Arguments:
        config (dict): trainer config; it is modified in place.

    Raises:
        ValueError: if parameter_noise is enabled while batch_mode is not
            "complete_episodes", or while "noisy" is also enabled.
    """

    # Update effective batch size to include n-step
    adjusted_batch_size = max(config["sample_batch_size"],
                              config.get("n_step", 1))
    config["sample_batch_size"] = adjusted_batch_size

    if config.get("parameter_noise", False):
        if config["batch_mode"] != "complete_episodes":
            raise ValueError("Exploration with parameter space noise requires "
                             "batch_mode to be complete_episodes.")
        if config.get("noisy", False):
            raise ValueError(
                "Exploration with parameter space noise and noisy network "
                "cannot be used at the same time.")

        # A user-supplied callback (if any) is kept and chained after the
        # built-in one, so enabling parameter noise does not drop it.
        if config["callbacks"]["on_episode_start"]:
            start_callback = config["callbacks"]["on_episode_start"]
        else:
            start_callback = None

        def on_episode_start(info):
            # as a callback function to sample and pose parameter space
            # noise on the parameters of network
            policies = info["policy"]
            for pi in policies.values():
                pi.add_parameter_noise()
            if start_callback:
                start_callback(info)

        config["callbacks"]["on_episode_start"] = tune.function(
            on_episode_start)

        if config["callbacks"]["on_episode_end"]:
            end_callback = config["callbacks"]["on_episode_end"]
        else:
            end_callback = None

        def on_episode_end(info):
            # as a callback function to monitor the distance
            # between noisy policy and original policy
            policies = info["policy"]
            episode = info["episode"]
            policy = policies[DEFAULT_POLICY_ID]
            # The DQN policy's trajectory postprocessing records the distance
            # as `pi_distance` on the policy object itself, so look there
            # first; the model attribute is only kept as a fallback.
            pi_distance = getattr(policy, "pi_distance", None)
            if pi_distance is None:
                pi_distance = getattr(
                    getattr(policy, "model", None), "pi_distance", None)
            # The value stays None until a trajectory has been postprocessed;
            # skip the metric in that case instead of reporting None.
            if pi_distance is not None:
                episode.custom_metrics["policy_distance"] = pi_distance
            if end_callback:
                end_callback(info)

        config["callbacks"]["on_episode_end"] = tune.function(on_episode_end)
|
||||
|
||||
|
||||
def get_initial_state(config):
    """Return the initial trainer state: both target-update counters at zero.

    The config argument is accepted but not read.
    """
    return dict(last_target_update_ts=0, num_target_updates=0)
|
||||
|
||||
|
||||
def make_exploration_schedule(config, worker_index):
    """Build the epsilon schedule for one worker.

    Arguments:
        config (dict): trainer config.
        worker_index (int): index of the worker; a negative value selects
            the zero-exploration schedule used for the local evaluator.

    Returns:
        a LinearSchedule annealing from 1.0 to exploration_final_eps when
        per_worker_exploration is off, otherwise a ConstantSchedule.
    """
    if not config["per_worker_exploration"]:
        anneal_steps = int(
            config["exploration_fraction"] * config["schedule_max_timesteps"])
        return LinearSchedule(
            schedule_timesteps=anneal_steps,
            initial_p=1.0,
            final_p=config["exploration_final_eps"])

    # A different fixed `eps` per worker only makes sense with several workers.
    assert config["num_workers"] > 1, \
        "This requires multiple workers"

    if worker_index < 0:
        # local ev should have zero exploration so that eval rollouts
        # run properly
        return ConstantSchedule(0.0)

    # Exploration constants from the Ape-X paper
    exponent = 1 + worker_index / float(config["num_workers"] - 1) * 7
    return ConstantSchedule(0.4**exponent)
|
||||
|
||||
|
||||
def setup_exploration(trainer):
    """Attach exploration schedules to the trainer.

    Sets trainer.exploration0 (built with worker index -1) and
    trainer.explorations (one schedule per index in range(num_workers)).
    """
    cfg = trainer.config
    trainer.exploration0 = make_exploration_schedule(cfg, -1)
    per_worker = []
    for worker_idx in range(cfg["num_workers"]):
        per_worker.append(make_exploration_schedule(cfg, worker_idx))
    trainer.explorations = per_worker
|
||||
|
||||
|
||||
def update_worker_explorations(trainer):
    """Push the current epsilon values to the local and remote workers.

    Each schedule is evaluated at the optimizer's num_steps_sampled. The
    values are stored on trainer.cur_exp_vals (local worker first) and the
    timestep on trainer.train_start_timestep.
    """
    now = trainer.optimizer.num_steps_sampled

    local_eps = trainer.exploration0.value(now)
    current_values = [local_eps]
    trainer.workers.local_worker().foreach_trainable_policy(
        lambda p, _: p.set_epsilon(local_eps))

    for idx, remote_worker in enumerate(trainer.workers.remote_workers()):
        exp_val = trainer.explorations[idx].value(now)
        remote_worker.foreach_trainable_policy.remote(
            lambda p, _: p.set_epsilon(exp_val))
        current_values.append(exp_val)

    trainer.train_start_timestep = now
    trainer.cur_exp_vals = current_values
|
||||
|
||||
|
||||
def add_trainer_metrics(trainer, result):
    """Add iteration timesteps and exploration/optimizer info to `result`.

    Arguments:
        trainer: trainer whose optimizer, state and cur_exp_vals are read.
        result (dict): training result; updated in place with
            "timesteps_this_iter" and "info".
    """
    sampled = trainer.optimizer.num_steps_sampled
    steps_this_iter = sampled - trainer.train_start_timestep

    exploration_info = {
        "min_exploration": min(trainer.cur_exp_vals),
        "max_exploration": max(trainer.cur_exp_vals),
        "num_target_updates": trainer.state["num_target_updates"],
    }
    # Optimizer stats are merged on top of the exploration entries.
    info = dict(exploration_info, **trainer.optimizer.stats())

    result.update(timesteps_this_iter=steps_this_iter, info=info)
|
||||
|
||||
|
||||
def update_target_if_needed(trainer, fetches):
    """Update the target network once enough new steps have been sampled.

    The update runs when more than config["target_network_update_freq"]
    steps were sampled since the last update; trainer.state is then
    refreshed. `fetches` is accepted but not read.
    """
    now = trainer.optimizer.num_steps_sampled
    steps_since_update = now - trainer.state["last_target_update_ts"]
    if steps_since_update <= trainer.config["target_network_update_freq"]:
        return

    def _update(policy, _policy_id):
        return policy.update_target()

    trainer.workers.local_worker().foreach_trainable_policy(_update)
    trainer.state["last_target_update_ts"] = now
    trainer.state["num_target_updates"] += 1
|
||||
|
||||
|
||||
def collect_metrics(trainer):
    """Collect metrics, restricted to a subset of workers if configured.

    With per_worker_exploration enabled only the trailing slice
    `[-len(workers) // 3:]` of the remote workers is used; otherwise
    trainer.collect_metrics() is called with its defaults.
    """
    if not trainer.config["per_worker_exploration"]:
        return trainer.collect_metrics()

    # Only collect metrics from the third of workers with lowest eps
    remotes = trainer.workers.remote_workers()
    tail_start = -len(remotes) // 3
    return trainer.collect_metrics(selected_workers=remotes[tail_start:])
|
||||
|
||||
|
||||
def disable_exploration(trainer):
    """Set epsilon to 0 on every policy of the local evaluation worker."""

    def _zero_epsilon(policy, _policy_id):
        return policy.set_epsilon(0)

    eval_worker = trainer.evaluation_workers.local_worker()
    eval_worker.foreach_policy(_zero_epsilon)
|
||||
|
||||
|
||||
# Trainer assembled with build_trainer from the hook functions defined above
# in this module. No default policy is set here; DQNTrainer below supplies
# one through with_updates().
GenericOffPolicyTrainer = build_trainer(
    name="GenericOffPolicyAlgorithm",
    default_policy=None,
    default_config=DEFAULT_CONFIG,
    validate_config=check_config_and_setup_param_noise,
    get_initial_state=get_initial_state,
    make_policy_optimizer=make_optimizer,
    before_init=setup_exploration,
    before_train_step=update_worker_explorations,
    after_optimizer_step=update_target_if_needed,
    after_train_result=add_trainer_metrics,
    collect_metrics_fn=collect_metrics,
    before_evaluate_fn=disable_exploration)
|
||||
|
||||
# DQN: the generic off-policy trainer with DQNTFPolicy as its default policy.
DQNTrainer = GenericOffPolicyTrainer.with_updates(
    name="DQN", default_policy=DQNTFPolicy, default_config=DEFAULT_CONFIG)

# Simple Q: derived from DQNTrainer, only the default policy is swapped.
SimpleQTrainer = DQNTrainer.with_updates(default_policy=SimpleQPolicy)
|
||||
@@ -1,504 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from gym.spaces import Discrete
|
||||
import numpy as np
|
||||
from scipy.stats import entropy
|
||||
|
||||
import ray
|
||||
from ray.rllib.agents.dqn.distributional_q_model import DistributionalQModel
|
||||
from ray.rllib.agents.dqn.simple_q_policy import ExplorationStateMixin, \
|
||||
TargetNetworkMixin
|
||||
from ray.rllib.policy.sample_batch import SampleBatch
|
||||
from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.models.tf.tf_action_dist import Categorical
|
||||
from ray.rllib.utils.error import UnsupportedSpaceException
|
||||
from ray.rllib.policy.tf_policy import LearningRateSchedule
|
||||
from ray.rllib.policy.tf_policy_template import build_tf_policy
|
||||
from ray.rllib.utils.tf_ops import huber_loss, reduce_mean_ignore_inf, \
|
||||
minimize_and_clip
|
||||
from ray.rllib.utils import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
|
||||
Q_SCOPE = "q_func"
|
||||
Q_TARGET_SCOPE = "target_q_func"
|
||||
|
||||
# Importance sampling weights for prioritized replay
|
||||
PRIO_WEIGHTS = "weights"
|
||||
|
||||
|
||||
class QLoss(object):
    """Builds the Q-learning loss tensors.

    After construction the instance exposes:
        td_error: per-sample error (cross entropy when num_atoms > 1,
            otherwise the difference between Q(s, a) and its target).
        loss: mean of the importance-weighted per-sample loss.
        stats: dict of summary tensors.
    """

    def __init__(self,
                 q_t_selected,
                 q_logits_t_selected,
                 q_tp1_best,
                 q_dist_tp1_best,
                 importance_weights,
                 rewards,
                 done_mask,
                 gamma=0.99,
                 n_step=1,
                 num_atoms=1,
                 v_min=-10.0,
                 v_max=10.0):

        discount = gamma**n_step

        if num_atoms <= 1:
            # Plain (non-distributional) Q-learning with a Huber loss.
            bootstrap = (1.0 - done_mask) * q_tp1_best

            # right-hand side of the Bellman equation
            target = rewards + discount * bootstrap

            # the target is treated as a constant for gradient purposes
            self.td_error = q_t_selected - tf.stop_gradient(target)
            self.loss = tf.reduce_mean(
                importance_weights * huber_loss(self.td_error))
            self.stats = {
                "mean_q": tf.reduce_mean(q_t_selected),
                "min_q": tf.reduce_min(q_t_selected),
                "max_q": tf.reduce_max(q_t_selected),
                "mean_td_error": tf.reduce_mean(self.td_error),
            }
            return

        # Distributional Q-learning which corresponds to an entropy loss

        # Atom values, evenly spaced from v_min to v_max.
        support = tf.range(num_atoms, dtype=tf.float32)
        support = v_min + support * (v_max - v_min) / float(num_atoms - 1)

        # (batch_size, 1) * (1, num_atoms) = (batch_size, num_atoms)
        shifted = tf.expand_dims(rewards, -1) + discount * tf.expand_dims(
            1.0 - done_mask, -1) * tf.expand_dims(support, 0)
        shifted = tf.clip_by_value(shifted, v_min, v_max)

        # Fractional atom index of every shifted support point.
        atom_spacing = (v_max - v_min) / float(num_atoms - 1)
        pos = (shifted - v_min) / atom_spacing
        lower = tf.floor(pos)
        upper = tf.ceil(pos)

        # When pos is an integer, lower == upper and both (upper - pos) and
        # (pos - lower) are 0, which would drop that probability mass; this
        # indicator adds it back on the lower side.
        on_atom = tf.to_float(tf.less(upper - lower, 0.5))

        lower_one_hot = tf.one_hot(
            tf.cast(lower, dtype=tf.int32),
            num_atoms)  # (batch_size, num_atoms, num_atoms)
        upper_one_hot = tf.one_hot(
            tf.cast(upper, dtype=tf.int32),
            num_atoms)  # (batch_size, num_atoms, num_atoms)

        lower_mass = q_dist_tp1_best * (upper - pos + on_atom)
        upper_mass = q_dist_tp1_best * (pos - lower)
        lower_mass = tf.reduce_sum(
            lower_one_hot * tf.expand_dims(lower_mass, -1), axis=1)
        upper_mass = tf.reduce_sum(
            upper_one_hot * tf.expand_dims(upper_mass, -1), axis=1)
        projected = lower_mass + upper_mass

        # Rainbow paper claims that using this cross entropy loss for
        # priority is robust and insensitive to `prioritized_replay_alpha`
        self.td_error = tf.nn.softmax_cross_entropy_with_logits(
            labels=projected, logits=q_logits_t_selected)
        self.loss = tf.reduce_mean(self.td_error * importance_weights)
        self.stats = {
            # TODO: better Q stats for dist dqn
            "mean_td_error": tf.reduce_mean(self.td_error),
        }
|
||||
|
||||
|
||||
class QValuePolicy(object):
    """Turns Q values into sampled actions.

    After construction `action` holds the action tensor and `action_prob`
    the probability of the sampled action (only set in softmax mode,
    otherwise None).
    """

    def __init__(self, q_values, observations, num_actions, stochastic, eps,
                 softmax, softmax_temp):
        if softmax:
            # Sample from a categorical over the temperature-scaled Q values.
            dist = Categorical(q_values / softmax_temp)
            self.action = dist.sample()
            self.action_prob = dist.sampled_action_prob()
        else:
            greedy_actions = tf.argmax(q_values, axis=1)
            num_rows = tf.shape(observations)[0]

            # Special case masked out actions (q_value ~= -inf) so that we
            # don't even consider them for exploration.
            uniform_valid_logits = tf.where(
                tf.equal(q_values, tf.float32.min),
                tf.ones_like(q_values) * tf.float32.min,
                tf.ones_like(q_values))
            explore_actions = tf.squeeze(
                tf.multinomial(uniform_valid_logits, 1), axis=1)

            # Epsilon-greedy: per row, pick the random action with prob. eps.
            pick_random = tf.random_uniform(
                tf.stack([num_rows]), minval=0, maxval=1,
                dtype=tf.float32) < eps
            eps_greedy_actions = tf.where(pick_random, explore_actions,
                                          greedy_actions)
            self.action = tf.cond(stochastic, lambda: eps_greedy_actions,
                                  lambda: greedy_actions)
            self.action_prob = None
|
||||
|
||||
|
||||
class ComputeTDErrorMixin(object):
    """Mixin that evaluates the policy's TD error tensor on a given batch."""

    def compute_td_error(self, obs_t, act_t, rew_t, obs_tp1, done_mask,
                         importance_weights):
        """Run `self.q_loss.td_error` in the policy session.

        Returns zeros shaped like `rew_t` while the loss has not been
        initialized yet.
        """
        if not self.loss_initialized():
            return np.zeros_like(rew_t)

        placeholder = self.get_placeholder
        feed = {
            placeholder(SampleBatch.CUR_OBS): [np.array(ob) for ob in obs_t],
            placeholder(SampleBatch.ACTIONS): act_t,
            placeholder(SampleBatch.REWARDS): rew_t,
            placeholder(SampleBatch.NEXT_OBS): [
                np.array(ob) for ob in obs_tp1
            ],
            placeholder(SampleBatch.DONES): done_mask,
            placeholder(PRIO_WEIGHTS): importance_weights,
        }
        return self.get_session().run(self.q_loss.td_error, feed_dict=feed)
|
||||
|
||||
|
||||
def postprocess_trajectory(policy,
                           sample_batch,
                           other_agent_batches=None,
                           episode=None):
    """Postprocess a sampled batch for DQN.

    With parameter noise enabled, first measures how far the noisy action
    distribution is from the noise-free one on this batch, stores it on
    policy.pi_distance and scales the noise sigma up or down by 1.01.
    The batch is then passed through _postprocess_dqn.
    """
    if policy.config["parameter_noise"]:
        session = policy.get_session()
        # adjust the sigma of parameter space noise
        states = [list(col) for col in sample_batch.columns(["obs"])][0]
        feed = {policy.cur_observations: states}

        # Evaluate with noise, strip the noise, then evaluate again.
        noisy_probs = session.run(policy.action_probs, feed_dict=feed)
        session.run(policy.remove_noise_op)
        clean_probs = session.run(policy.action_probs, feed_dict=feed)

        distance = np.mean(entropy(clean_probs.T, noisy_probs.T))
        policy.pi_distance = distance

        threshold = -np.log(1 - policy.cur_epsilon +
                            policy.cur_epsilon / policy.num_actions)
        if distance < threshold:
            policy.parameter_noise_sigma_val *= 1.01
        else:
            policy.parameter_noise_sigma_val /= 1.01
        policy.parameter_noise_sigma.load(
            policy.parameter_noise_sigma_val, session=policy.get_session())

    return _postprocess_dqn(policy, sample_batch)
|
||||
|
||||
|
||||
def build_q_model(policy, obs_space, action_space, config):
    """Create the Q model and the target Q model for the policy.

    Both models are built through ModelCatalog.get_model_v2 with the same
    settings and differ only in their name (Q_SCOPE / Q_TARGET_SCOPE). They
    are stored as policy.q_model and policy.target_q_model.

    Returns:
        policy.q_model

    Raises:
        UnsupportedSpaceException: if the action space is not Discrete.
    """
    if not isinstance(action_space, Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    model_config = config["model"]
    if config["hiddens"]:
        # try to infer the last layer size, otherwise fall back to 256
        num_outputs = ([256] + model_config["fcnet_hiddens"])[-1]
        model_config["no_final_linear"] = True
    else:
        num_outputs = action_space.n

    # Keyword arguments shared by the online and the target model.
    q_model_kwargs = dict(
        num_atoms=config["num_atoms"],
        q_hiddens=config["hiddens"],
        dueling=config["dueling"],
        use_noisy=config["noisy"],
        v_min=config["v_min"],
        v_max=config["v_max"],
        sigma0=config["sigma0"],
        parameter_noise=config["parameter_noise"])

    policy.q_model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        num_outputs,
        model_config,
        framework="tf",
        model_interface=DistributionalQModel,
        name=Q_SCOPE,
        **q_model_kwargs)

    policy.target_q_model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        num_outputs,
        model_config,
        framework="tf",
        model_interface=DistributionalQModel,
        name=Q_TARGET_SCOPE,
        **q_model_kwargs)

    return policy.q_model
|
||||
|
||||
|
||||
def build_q_networks(policy, q_model, input_dict, obs_space, action_space,
                     config):
    """Build the action-sampling part of the graph.

    Stores q_values, q_func_vars, output_actions and action_prob on the
    policy (plus the parameter-noise ops and action_probs when
    config["parameter_noise"] is set).

    Returns:
        (policy.output_actions, policy.action_prob)
    """
    observations = input_dict[SampleBatch.CUR_OBS]

    # Action Q network
    q_values, _q_logits, _q_dist = _compute_q_values(
        policy, q_model, observations, obs_space, action_space)
    policy.q_values = q_values
    policy.q_func_vars = q_model.variables()

    # Noise vars for Q network except for layer normalization vars
    if config["parameter_noise"]:
        noisy_vars = [
            v for v in policy.q_func_vars if "LayerNorm" not in v.name
        ]
        _build_parameter_noise(policy, noisy_vars)
        policy.action_probs = tf.nn.softmax(policy.q_values)

    # Action outputs
    chooser = QValuePolicy(q_values, observations, action_space.n,
                           policy.stochastic, policy.eps, config["soft_q"],
                           config["softmax_temp"])
    policy.output_actions = chooser.action
    policy.action_prob = chooser.action_prob

    return policy.output_actions, policy.action_prob
|
||||
|
||||
|
||||
def _build_parameter_noise(policy, pnet_params):
    """Create the variables and ops for parameter space noise.

    Arguments:
        policy: policy that receives parameter_noise_sigma(_val),
            parameter_noise, remove_noise_op, add_noise_op and pi_distance.
        pnet_params (list): variables that noise is added to; one
            non-trainable "<name>_noise" variable is created for each.
    """
    policy.parameter_noise_sigma_val = 1.0
    policy.parameter_noise_sigma = tf.get_variable(
        initializer=tf.constant_initializer(policy.parameter_noise_sigma_val),
        name="parameter_noise_sigma",
        shape=(),
        trainable=False,
        dtype=tf.float32)

    # One zero-initialized noise buffer per perturbed variable.
    policy.parameter_noise = [
        tf.get_variable(
            name=param.name.split(":")[0] + "_noise",
            shape=param.shape,
            initializer=tf.constant_initializer(.0),
            trainable=False) for param in pnet_params
    ]
    param_noise_pairs = list(zip(pnet_params, policy.parameter_noise))

    # Removing noise subtracts the stored buffers from the variables.
    undo_ops = [
        tf.assign_add(param, -noise) for param, noise in param_noise_pairs
    ]
    policy.remove_noise_op = tf.group(*undo_ops)

    # Resample every buffer from N(0, sigma) ...
    resample_ops = [
        tf.assign(
            noise,
            tf.random_normal(
                shape=noise.shape, stddev=policy.parameter_noise_sigma))
        for noise in policy.parameter_noise
    ]
    # ... and only add the buffers once they have been resampled.
    with tf.control_dependencies(resample_ops):
        apply_ops = [
            tf.assign_add(param, noise) for param, noise in param_noise_pairs
        ]
        policy.add_noise_op = tf.group(*apply_ops)
    policy.pi_distance = None
|
||||
|
||||
|
||||
def build_q_losses(policy, batch_tensors):
    """Build the QLoss for the policy and return its scalar loss tensor.

    Stores the loss object on policy.q_loss and the target model variables
    on policy.target_q_func_vars.
    """
    config = policy.config
    obs_space = policy.observation_space
    act_space = policy.action_space
    next_obs = batch_tensors[SampleBatch.NEXT_OBS]

    # online network on the current observations
    q_t, q_logits_t, _q_dist_t = _compute_q_values(
        policy, policy.q_model, batch_tensors[SampleBatch.CUR_OBS],
        obs_space, act_space)

    # target network on the next observations
    q_tp1, _q_logits_tp1, q_dist_tp1 = _compute_q_values(
        policy, policy.target_q_model, next_obs, obs_space, act_space)
    policy.target_q_func_vars = policy.target_q_model.variables()

    # q scores for actions which we know were selected in the given state.
    taken_one_hot = tf.one_hot(
        tf.cast(batch_tensors[SampleBatch.ACTIONS], tf.int32), act_space.n)
    q_t_selected = tf.reduce_sum(q_t * taken_one_hot, 1)
    q_logits_t_selected = tf.reduce_sum(
        q_logits_t * tf.expand_dims(taken_one_hot, -1), 1)

    # compute estimate of best possible value starting from state at t + 1
    if config["double_q"]:
        # Double Q: the online network picks the action, the target
        # network evaluates it.
        q_tp1_online, _logits_online, _dist_online = _compute_q_values(
            policy, policy.q_model, next_obs, obs_space, act_space)
        best_action = tf.argmax(q_tp1_online, 1)
        best_one_hot = tf.one_hot(best_action, act_space.n)
    else:
        best_one_hot = tf.one_hot(tf.argmax(q_tp1, 1), act_space.n)

    q_tp1_best = tf.reduce_sum(q_tp1 * best_one_hot, 1)
    q_dist_tp1_best = tf.reduce_sum(
        q_dist_tp1 * tf.expand_dims(best_one_hot, -1), 1)

    policy.q_loss = QLoss(
        q_t_selected, q_logits_t_selected, q_tp1_best, q_dist_tp1_best,
        batch_tensors[PRIO_WEIGHTS], batch_tensors[SampleBatch.REWARDS],
        tf.cast(batch_tensors[SampleBatch.DONES], tf.float32),
        config["gamma"], config["n_step"], config["num_atoms"],
        config["v_min"], config["v_max"])

    return policy.q_loss.loss
|
||||
|
||||
|
||||
def adam_optimizer(policy, config):
    """Return an Adam optimizer using policy.cur_lr and config["adam_epsilon"]."""
    optimizer = tf.train.AdamOptimizer(
        learning_rate=policy.cur_lr, epsilon=config["adam_epsilon"])
    return optimizer
|
||||
|
||||
|
||||
def clip_gradients(policy, optimizer, loss):
    """Compute (gradient, variable) pairs for the Q function variables.

    Gradients go through minimize_and_clip when
    config["grad_norm_clipping"] is set; pairs whose gradient is None are
    dropped from the result.
    """
    clip_val = policy.config["grad_norm_clipping"]
    if clip_val is None:
        pairs = optimizer.compute_gradients(
            loss, var_list=policy.q_func_vars)
    else:
        pairs = minimize_and_clip(
            optimizer,
            loss,
            var_list=policy.q_func_vars,
            clip_val=policy.config["grad_norm_clipping"])
    return [(grad, var) for (grad, var) in pairs if grad is not None]
|
||||
|
||||
|
||||
def exploration_setting_inputs(policy):
    """Return the extra feed for action computation.

    Maps policy.stochastic to True and policy.eps to policy.cur_epsilon.
    """
    feed = {}
    feed[policy.stochastic] = True
    feed[policy.eps] = policy.cur_epsilon
    return feed
|
||||
|
||||
|
||||
def build_q_stats(policy, batch_tensors):
    """Return the stats dict: the current learning rate plus the QLoss stats.

    `batch_tensors` is accepted but not read.
    """
    lr_stats = {
        "cur_lr": tf.cast(policy.cur_lr, tf.float64),
    }
    return dict(lr_stats, **policy.q_loss.stats)
|
||||
|
||||
|
||||
def setup_early_mixins(policy, obs_space, action_space, config):
    """Initialize the learning-rate and exploration-state mixins."""
    lr, lr_schedule = config["lr"], config["lr_schedule"]
    LearningRateSchedule.__init__(policy, lr, lr_schedule)
    ExplorationStateMixin.__init__(policy, obs_space, action_space, config)
|
||||
|
||||
|
||||
def setup_late_mixins(policy, obs_space, action_space, config):
    """Initialize the target-network mixin."""
    init_target_mixin = TargetNetworkMixin.__init__
    init_target_mixin(policy, obs_space, action_space, config)
|
||||
|
||||
|
||||
def _compute_q_values(policy, model, obs, obs_space, action_space):
    """Run `model` on `obs` and derive Q values from its outputs.

    Applies the dueling combination when config["dueling"] is set and uses
    the distributional outputs when config["num_atoms"] > 1.

    Returns:
        (value, logits, dist)
    """
    config = policy.config
    distributional = config["num_atoms"] > 1

    model_out, _state = model({
        "obs": obs,
        "is_training": policy._get_is_training_placeholder(),
    }, [], None)

    # The model returns two extra tensors in the distributional case.
    if distributional:
        (action_scores, z, support_logits_per_action, logits,
         dist) = model.get_q_value_distributions(model_out)
    else:
        (action_scores, logits,
         dist) = model.get_q_value_distributions(model_out)

    if not config["dueling"]:
        return action_scores, logits, dist

    state_score = model.get_state_value(model_out)

    if not distributional:
        # Center the action scores and add the state value.
        scores_mean = reduce_mean_ignore_inf(action_scores, 1)
        scores_centered = action_scores - tf.expand_dims(scores_mean, 1)
        return state_score + scores_centered, logits, dist

    # Distributional dueling: center the per-action support logits, add the
    # state score, then take the expectation over the support z.
    logits_mean = tf.reduce_mean(support_logits_per_action, 1)
    logits_centered = (
        support_logits_per_action - tf.expand_dims(logits_mean, 1))
    dueling_logits = tf.expand_dims(state_score, 1) + logits_centered
    dueling_probs = tf.nn.softmax(logits=dueling_logits)
    value = tf.reduce_sum(input_tensor=z * dueling_probs, axis=-1)
    return value, dueling_logits, dueling_probs
|
||||
|
||||
|
||||
def _adjust_nstep(n_step, gamma, obs, actions, rewards, new_obs, dones):
|
||||
"""Rewrites the given trajectory fragments to encode n-step rewards.
|
||||
|
||||
reward[i] = (
|
||||
reward[i] * gamma**0 +
|
||||
reward[i+1] * gamma**1 +
|
||||
... +
|
||||
reward[i+n_step-1] * gamma**(n_step-1))
|
||||
|
||||
The ith new_obs is also adjusted to point to the (i+n_step-1)'th new obs.
|
||||
|
||||
At the end of the trajectory, n is truncated to fit in the traj length.
|
||||
"""
|
||||
|
||||
assert not any(dones[:-1]), "Unexpected done in middle of trajectory"
|
||||
|
||||
traj_length = len(rewards)
|
||||
for i in range(traj_length):
|
||||
for j in range(1, n_step):
|
||||
if i + j < traj_length:
|
||||
new_obs[i] = new_obs[i + j]
|
||||
dones[i] = dones[i + j]
|
||||
rewards[i] += gamma**j * rewards[i + j]
|
||||
|
||||
|
||||
def _postprocess_dqn(policy, batch):
    """Apply n-step rewriting and priority weights to a sample batch.

    Returns:
        the same batch object, modified in place.
    """
    cfg = policy.config

    # N-step Q adjustments
    n_step = cfg["n_step"]
    if n_step > 1:
        _adjust_nstep(n_step, cfg["gamma"], batch[SampleBatch.CUR_OBS],
                      batch[SampleBatch.ACTIONS], batch[SampleBatch.REWARDS],
                      batch[SampleBatch.NEXT_OBS], batch[SampleBatch.DONES])

    # Default every sample to a weight of one.
    if PRIO_WEIGHTS not in batch:
        batch[PRIO_WEIGHTS] = np.ones_like(batch[SampleBatch.REWARDS])

    # Prioritize on the worker side
    if batch.count > 0 and cfg["worker_side_prioritization"]:
        td_errors = policy.compute_td_error(
            batch[SampleBatch.CUR_OBS], batch[SampleBatch.ACTIONS],
            batch[SampleBatch.REWARDS], batch[SampleBatch.NEXT_OBS],
            batch[SampleBatch.DONES], batch[PRIO_WEIGHTS])
        # The epsilon is added on top of the absolute TD error.
        batch.data[PRIO_WEIGHTS] = (
            np.abs(td_errors) + cfg["prioritized_replay_eps"])

    return batch
|
||||
|
||||
|
||||
# DQN policy class assembled with build_tf_policy from the functions and
# mixins defined above in this module.
DQNTFPolicy = build_tf_policy(
    name="DQNTFPolicy",
    # Wrapped in a lambda so the attribute lookup on ray.rllib.agents.dqn.dqn
    # happens only when the function is called, not when this module loads
    # (presumably to avoid a circular import - TODO confirm).
    get_default_config=lambda: ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG,
    make_model=build_q_model,
    action_sampler_fn=build_q_networks,
    loss_fn=build_q_losses,
    stats_fn=build_q_stats,
    postprocess_fn=postprocess_trajectory,
    optimizer_fn=adam_optimizer,
    gradients_fn=clip_gradients,
    extra_action_feed_fn=exploration_setting_inputs,
    # Extra fetches expose the Q values and the per-sample TD error tensors.
    extra_action_fetches_fn=lambda policy: {"q_values": policy.q_values},
    extra_learn_fetches_fn=lambda policy: {"td_error": policy.q_loss.td_error},
    before_init=setup_early_mixins,
    after_init=setup_late_mixins,
    obs_include_prev_action_reward=False,
    mixins=[
        ExplorationStateMixin,
        TargetNetworkMixin,
        ComputeTDErrorMixin,
        LearningRateSchedule,
    ])
|
||||
@@ -1,82 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from ray.rllib.models.modelv2 import ModelV2
|
||||
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
|
||||
from ray.rllib.utils.annotations import override
|
||||
from ray.rllib.utils import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
|
||||
|
||||
class SimpleQModel(TFModelV2):
    """TFModelV2 extension that adds a Q-value head on top of the model output.

    Data flow:
        obs -> forward() -> model_out
        model_out -> get_q_values() -> Q(s, a)

    This class alone is not a usable model: forward() has to be supplied by
    a subclass."""

    def __init__(self,
                 obs_space,
                 action_space,
                 num_outputs,
                 model_config,
                 name,
                 q_hiddens=(256, )):
        """Build the Q head; the layers used by forward() are not created here.

        Extra model kwargs:
            q_hiddens (list): sizes of the hidden layers of the Q head, which
                post-process the model output to produce Q values. When
                empty, the model output itself is used as the Q output.

        The core layers for forward() belong in subclasses of SimpleQModel.
        """

        super(SimpleQModel, self).__init__(obs_space, action_space,
                                           num_outputs, model_config, name)

        # Symbolic input that stands in for the output of forward().
        self.model_out = tf.keras.layers.Input(
            shape=(num_outputs, ), name="model_out")

        # Stack the hidden layers and the final linear layer only when hidden
        # sizes were requested; otherwise the head is the identity.
        head_output = self.model_out
        if q_hiddens:
            for layer_idx, width in enumerate(q_hiddens):
                head_output = tf.keras.layers.Dense(
                    width,
                    name="q_hidden_{}".format(layer_idx),
                    activation=tf.nn.relu)(head_output)
            head_output = tf.keras.layers.Dense(
                action_space.n, activation=None, name="q_out")(head_output)

        self.q_value_head = tf.keras.Model(self.model_out, head_output)
        self.register_variables(self.q_value_head.variables)

    @override(ModelV2)
    def forward(self, input_dict, state, seq_lens):
        """Produce the model_out tensor that feeds the Q head.

        Must be implemented by subclasses as documented in modelv2.py."""
        raise NotImplementedError

    def get_q_values(self, model_out):
        """Map a state feature tensor to Q(s, a).

        Override this in a custom model to customize the Q output head.

        Arguments:
            model_out (Tensor): embedding from the model layers

        Returns:
            action scores Q(s, a) for each action, shape [None, action_space.n]
        """

        q_head = self.q_value_head
        return q_head(model_out)
|
||||
@@ -1,214 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
"""Basic example of a DQN policy without any optimizations."""
|
||||
|
||||
from gym.spaces import Discrete
|
||||
import logging
|
||||
|
||||
import ray
|
||||
from ray.rllib.agents.dqn.simple_q_model import SimpleQModel
|
||||
from ray.rllib.policy.policy import Policy
|
||||
from ray.rllib.policy.sample_batch import SampleBatch
|
||||
from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.utils.annotations import override
|
||||
from ray.rllib.utils.error import UnsupportedSpaceException
|
||||
from ray.rllib.policy.tf_policy import TFPolicy
|
||||
from ray.rllib.policy.tf_policy_template import build_tf_policy
|
||||
from ray.rllib.utils import try_import_tf
|
||||
from ray.rllib.utils.tf_ops import huber_loss
|
||||
|
||||
tf = try_import_tf()
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
Q_SCOPE = "q_func"
|
||||
Q_TARGET_SCOPE = "target_q_func"
|
||||
|
||||
|
||||
class ExplorationStateMixin(object):
    """Mixin that keeps the epsilon-greedy exploration state of a TF policy.

    It owns the current epsilon value plus the `stochastic` and `eps`
    placeholders that the action sampler reads, and includes epsilon in the
    policy state returned by get_state() / consumed by set_state().
    """

    def __init__(self, obs_space, action_space, config):
        """Create the exploration placeholders; epsilon starts at 1.0.

        The arguments are accepted for signature compatibility with the
        other mixins and are not used here.
        """
        self.cur_epsilon = 1.0
        self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
        self.eps = tf.placeholder(tf.float32, (), name="eps")

    def add_parameter_noise(self):
        """Run the policy's `add_noise_op` if `parameter_noise` is enabled.

        NOTE(review): `add_noise_op` is not defined in this file; it must be
        provided by the concrete policy when parameter noise is turned on.
        """
        if self.config["parameter_noise"]:
            # Use the session accessor, as TargetNetworkMixin.update_target
            # does, instead of reaching for a `sess` attribute directly.
            self.get_session().run(self.add_noise_op)

    def set_epsilon(self, epsilon):
        """Set the epsilon fed to the `eps` placeholder on later actions."""
        self.cur_epsilon = epsilon

    @override(Policy)
    def get_state(self):
        """Return `[TFPolicy state, current epsilon]`."""
        return [TFPolicy.get_state(self), self.cur_epsilon]

    @override(Policy)
    def set_state(self, state):
        """Restore a state produced by get_state()."""
        TFPolicy.set_state(self, state[0])
        self.set_epsilon(state[1])
|
||||
|
||||
|
||||
class TargetNetworkMixin(object):
    """Mixin that builds and runs the op copying Q weights to the target net."""

    def __init__(self, obs_space, action_space, config):
        """Create `update_target_expr`, a grouped assign over all variables.

        Requires `q_func_vars` and `target_q_func_vars` to already be set on
        the policy, and to have equal length.
        """
        online_vars = self.q_func_vars
        target_vars = self.target_q_func_vars
        assert len(online_vars) == len(target_vars), \
            (online_vars, target_vars)

        # One assign op per (online, target) variable pair; running the
        # grouped op copies the Q network into the target Q network.
        assign_ops = []
        for src, dst in zip(online_vars, target_vars):
            assign_ops.append(dst.assign(src))
            logger.debug("Update target op {}".format(dst))
        self.update_target_expr = tf.group(*assign_ops)

    def update_target(self):
        """Execute the copy op in the policy's session and return its result."""
        session = self.get_session()
        return session.run(self.update_target_expr)
|
||||
|
||||
|
||||
def build_q_models(policy, obs_space, action_space, config):
    """Create the Q model and the target Q model and attach them to `policy`.

    Only Discrete action spaces are supported. When `config["hiddens"]` is
    non-empty the base model emits 256 features (and
    `config["model"]["no_final_linear"]` is set to True); otherwise it emits
    one output per action.

    Returns:
        the Q model (also stored as `policy.q_model`); the target model is
        stored as `policy.target_q_model`.

    Raises:
        UnsupportedSpaceException: if `action_space` is not Discrete.
    """
    if not isinstance(action_space, Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    if config["hiddens"]:
        num_outputs = 256
        config["model"]["no_final_linear"] = True
    else:
        num_outputs = action_space.n

    def _make_model(scope_name):
        # Both networks share every setting except their scope name.
        return ModelCatalog.get_model_v2(
            obs_space,
            action_space,
            num_outputs,
            config["model"],
            framework="tf",
            name=scope_name,
            model_interface=SimpleQModel,
            q_hiddens=config["hiddens"])

    policy.q_model = _make_model(Q_SCOPE)
    policy.target_q_model = _make_model(Q_TARGET_SCOPE)

    return policy.q_model
|
||||
|
||||
|
||||
def build_action_sampler(policy, q_model, input_dict, obs_space, action_space,
                         config):
    """Build the epsilon-greedy action op for the simple Q policy.

    Stores the Q-value tensor as `policy.q_values` and the model variables
    as `policy.q_func_vars`.

    Returns:
        (action, action_prob): the action tensor and None (no action
        probability is computed).
    """

    # Action Q network
    q_values = _compute_q_values(policy, q_model,
                                 input_dict[SampleBatch.CUR_OBS], obs_space,
                                 action_space)
    policy.q_values = q_values
    policy.q_func_vars = q_model.variables()

    # Action outputs
    deterministic_actions = tf.argmax(q_values, axis=1)
    batch_size = tf.shape(input_dict[SampleBatch.CUR_OBS])[0]

    # Special case masked out actions (q_value ~= -inf) so that we don't
    # even consider them for exploration.
    # Logits are tf.float32.min for masked actions and 1 for all others, so
    # sampling is uniform over the unmasked actions.
    random_valid_action_logits = tf.where(
        tf.equal(q_values, tf.float32.min),
        tf.ones_like(q_values) * tf.float32.min, tf.ones_like(q_values))
    random_actions = tf.squeeze(
        tf.multinomial(random_valid_action_logits, 1), axis=1)

    # Per batch row: take the random action when a uniform draw is below eps.
    chose_random = tf.random_uniform(
        tf.stack([batch_size]), minval=0, maxval=1,
        dtype=tf.float32) < policy.eps
    stochastic_actions = tf.where(chose_random, random_actions,
                                  deterministic_actions)
    # The `stochastic` placeholder switches exploration on or off as a whole.
    action = tf.cond(policy.stochastic, lambda: stochastic_actions,
                     lambda: deterministic_actions)
    action_prob = None

    return action, action_prob
|
||||
|
||||
|
||||
def build_q_losses(policy, batch_tensors):
    """Build the one-step Q-learning loss (mean Huber loss of the TD error).

    Side effects: stores the target model variables as
    `policy.target_q_func_vars` and the TD error tensor as `policy.td_error`.

    Returns:
        the scalar loss tensor.
    """
    # q network evaluation
    q_t = _compute_q_values(policy, policy.q_model,
                            batch_tensors[SampleBatch.CUR_OBS],
                            policy.observation_space, policy.action_space)

    # target q network evaluation
    q_tp1 = _compute_q_values(policy, policy.target_q_model,
                              batch_tensors[SampleBatch.NEXT_OBS],
                              policy.observation_space, policy.action_space)
    policy.target_q_func_vars = policy.target_q_model.variables()

    # q scores for actions which we know were selected in the given state.
    one_hot_selection = tf.one_hot(
        tf.cast(batch_tensors[SampleBatch.ACTIONS], tf.int32),
        policy.action_space.n)
    q_t_selected = tf.reduce_sum(q_t * one_hot_selection, 1)

    # compute estimate of best possible value starting from state at t + 1
    dones = tf.cast(batch_tensors[SampleBatch.DONES], tf.float32)
    q_tp1_best_one_hot_selection = tf.one_hot(
        tf.argmax(q_tp1, 1), policy.action_space.n)
    q_tp1_best = tf.reduce_sum(q_tp1 * q_tp1_best_one_hot_selection, 1)
    # Terminal transitions contribute no bootstrapped value.
    q_tp1_best_masked = (1.0 - dones) * q_tp1_best

    # compute RHS of bellman equation
    q_t_selected_target = (batch_tensors[SampleBatch.REWARDS] +
                           policy.config["gamma"] * q_tp1_best_masked)

    # compute the error (potentially clipped)
    # stop_gradient keeps the target side out of the gradient computation.
    td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
    loss = tf.reduce_mean(huber_loss(td_error))

    # save TD error as an attribute for outside access
    policy.td_error = td_error

    return loss
|
||||
|
||||
|
||||
def _compute_q_values(policy, model, obs, obs_space, action_space):
|
||||
input_dict = {
|
||||
"obs": obs,
|
||||
"is_training": policy._get_is_training_placeholder(),
|
||||
}
|
||||
model_out, _ = model(input_dict, [], None)
|
||||
return model.get_q_values(model_out)
|
||||
|
||||
|
||||
def exploration_setting_inputs(policy):
    """Return the feed dict entries that drive exploration at action time.

    Feeds True for the `stochastic` placeholder and the policy's current
    epsilon for the `eps` placeholder.
    """
    feed = {policy.stochastic: True}
    feed[policy.eps] = policy.cur_epsilon
    return feed
|
||||
|
||||
|
||||
def setup_early_mixins(policy, obs_space, action_space, config):
    """Initialize ExplorationStateMixin on `policy`.

    Used as the `before_init` hook of SimpleQPolicy."""
    ExplorationStateMixin.__init__(policy, obs_space, action_space, config)
|
||||
|
||||
|
||||
def setup_late_mixins(policy, obs_space, action_space, config):
    """Initialize TargetNetworkMixin on `policy`.

    Used as the `after_init` hook of SimpleQPolicy; the mixin's __init__
    reads `q_func_vars` and `target_q_func_vars`, which are set while the
    action sampler and the loss are built."""
    TargetNetworkMixin.__init__(policy, obs_space, action_space, config)
|
||||
|
||||
|
||||
# Basic Q-learning policy class assembled by build_tf_policy from the
# builder functions and mixins defined in this file.
SimpleQPolicy = build_tf_policy(
    name="SimpleQPolicy",
    # The lambda defers the DEFAULT_CONFIG attribute lookup until the
    # template actually calls it.
    get_default_config=lambda: ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG,
    make_model=build_q_models,
    action_sampler_fn=build_action_sampler,
    loss_fn=build_q_losses,
    extra_action_feed_fn=exploration_setting_inputs,
    # `q_values` is set by build_action_sampler, `td_error` by build_q_losses.
    extra_action_fetches_fn=lambda policy: {"q_values": policy.q_values},
    extra_learn_fetches_fn=lambda policy: {"td_error": policy.td_error},
    before_init=setup_early_mixins,
    after_init=setup_late_mixins,
    obs_include_prev_action_reward=False,
    mixins=[
        ExplorationStateMixin,
        TargetNetworkMixin,
    ])
|
||||
@@ -1,6 +0,0 @@
|
||||
from ray.rllib.agents.es.es import (ESTrainer, DEFAULT_CONFIG)
|
||||
from ray.rllib.utils import renamed_agent
|
||||
|
||||
# ESTrainer wrapped by renamed_agent and exposed under the "Agent" name
# (presumably a backward-compatible alias for the old class name; confirm
# against ray.rllib.utils.renamed_agent).
ESAgent = renamed_agent(ESTrainer)

__all__ = ["ESAgent", "ESTrainer", "DEFAULT_CONFIG"]
|
||||
@@ -1,337 +0,0 @@
|
||||
# Code in this file is copied and adapted from
|
||||
# https://github.com/openai/evolution-strategies-starter.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from collections import namedtuple
|
||||
import logging
|
||||
import numpy as np
|
||||
import time
|
||||
|
||||
import ray
|
||||
from ray.rllib.agents import Trainer, with_common_config
|
||||
|
||||
from ray.rllib.agents.es import optimizers
|
||||
from ray.rllib.agents.es import policies
|
||||
from ray.rllib.agents.es import utils
|
||||
from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID
|
||||
from ray.rllib.utils.annotations import override
|
||||
from ray.rllib.utils.memory import ray_get_and_free
|
||||
from ray.rllib.utils import FilterManager
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Per-task output of Worker.do_rollouts. The noisy_* fields hold one
# [positive, negative] pair per sampled noise index; the eval_* fields hold
# one entry per unperturbed evaluation rollout.
Result = namedtuple("Result", [
    "noise_indices", "noisy_returns", "sign_noisy_returns", "noisy_lengths",
    "eval_returns", "eval_lengths"
])
|
||||
|
||||
# yapf: disable
|
||||
# __sphinx_doc_begin__
|
||||
DEFAULT_CONFIG = with_common_config({
    # Coefficient of the `l2_coeff * theta` term added to the gradient
    # estimate in ESTrainer._train.
    "l2_coeff": 0.005,
    # Scale applied to a shared-noise slice to form a parameter perturbation.
    "noise_stdev": 0.02,
    # Minimum number of noisy episodes to collect per training iteration.
    "episodes_per_batch": 1000,
    # Minimum number of noisy-rollout timesteps to collect per iteration.
    "train_batch_size": 10000,
    # Probability that a worker runs an unperturbed evaluation rollout
    # instead of a perturbed pair.
    "eval_prob": 0.003,
    # How returns are processed; only "centered_rank" is implemented.
    "return_proc_mode": "centered_rank",
    # Number of Worker actors created by the trainer.
    "num_workers": 10,
    # Step size handed to the Adam optimizer.
    "stepsize": 0.01,
    # Observation filter name handed to GenericPolicy.
    "observation_filter": "MeanStdFilter",
    # Number of float32 entries in the shared noise table.
    "noise_size": 250000000,
    # Number of most recent evaluation-reward means averaged into the
    # reported episode_reward_mean.
    "report_length": 10,
})
|
||||
# __sphinx_doc_end__
|
||||
# yapf: enable
|
||||
|
||||
|
||||
@ray.remote
def create_shared_noise(count):
    """Build the large float32 noise array that all workers share.

    A fixed seed (123) is used, so the table is identical on every call
    with the same `count`.
    """
    rng = np.random.RandomState(123)
    return rng.randn(count).astype(np.float32)
|
||||
|
||||
|
||||
class SharedNoiseTable(object):
    """Read-only view over a flat float32 noise array."""

    def __init__(self, noise):
        """Wrap `noise`, which must be a float32 array."""
        assert noise.dtype == np.float32
        self.noise = noise

    def get(self, i, dim):
        """Return the slice of `dim` noise values starting at index `i`."""
        stop = i + dim
        return self.noise[i:stop]

    def sample_index(self, dim):
        """Draw a random start index from which a full `dim` slice fits."""
        upper_exclusive = len(self.noise) - dim + 1
        return np.random.randint(0, upper_exclusive)
|
||||
|
||||
|
||||
@ray.remote
class Worker(object):
    """Ray actor that evaluates perturbed copies of the policy in its own env.

    Each worker owns an environment, a TF session, a GenericPolicy and a view
    of the shared noise table.
    """

    def __init__(self,
                 config,
                 policy_params,
                 env_creator,
                 noise,
                 min_task_runtime=0.2):
        """Create the env, preprocessor, session and policy for this worker.

        Arguments:
            config (dict): trainer config.
            policy_params (dict): extra keyword arguments for GenericPolicy.
            env_creator: callable taking `config["env_config"]` and
                returning an environment.
            noise: float32 noise array wrapped in a SharedNoiseTable.
            min_task_runtime (float): minimum wall-clock seconds that one
                do_rollouts call keeps sampling.
        """
        self.min_task_runtime = min_task_runtime
        self.config = config
        self.policy_params = policy_params
        self.noise = SharedNoiseTable(noise)

        self.env = env_creator(config["env_config"])
        from ray.rllib import models
        self.preprocessor = models.ModelCatalog.get_preprocessor(
            self.env, config["model"])

        self.sess = utils.make_session(single_threaded=True)
        self.policy = policies.GenericPolicy(
            self.sess, self.env.action_space, self.env.observation_space,
            self.preprocessor, config["observation_filter"], config["model"],
            **policy_params)

    @property
    def filters(self):
        """Dict mapping DEFAULT_POLICY_ID to the policy's observation filter.

        A new dict is built on every access."""
        return {DEFAULT_POLICY_ID: self.policy.get_filter()}

    def sync_filters(self, new_filters):
        """Sync each local filter with the same-keyed entry of `new_filters`."""
        for k in self.filters:
            self.filters[k].sync(new_filters[k])

    def get_filters(self, flush_after=False):
        """Return serializable copies of the filters.

        When `flush_after` is True, each filter's buffer is cleared after it
        has been copied."""
        return_filters = {}
        for k, f in self.filters.items():
            return_filters[k] = f.as_serializable()
            if flush_after:
                f.clear_buffer()
        return return_filters

    def rollout(self, timestep_limit, add_noise=True):
        """Run one episode with the current policy weights.

        Returns:
            (rewards, length) as produced by policies.rollout.
        """
        rollout_rewards, rollout_length = policies.rollout(
            self.policy,
            self.env,
            timestep_limit=timestep_limit,
            add_noise=add_noise)
        return rollout_rewards, rollout_length

    def do_rollouts(self, params, timestep_limit=None):
        """Sample rollouts around `params` until the minimum runtime elapses.

        At least one perturbed pair is always collected. Each loop iteration
        is either an unperturbed evaluation rollout (with probability
        `config["eval_prob"]`) or a pair of rollouts at `params + noise` and
        `params - noise`.

        Returns:
            Result: the collected noise indices, returns and lengths.
        """
        # Set the network weights.
        self.policy.set_weights(params)

        noise_indices, returns, sign_returns, lengths = [], [], [], []
        eval_returns, eval_lengths = [], []

        # Perform some rollouts with noise.
        task_tstart = time.time()
        while (len(noise_indices) == 0
               or time.time() - task_tstart < self.min_task_runtime):

            if np.random.uniform() < self.config["eval_prob"]:
                # Do an evaluation run with no perturbation.
                self.policy.set_weights(params)
                rewards, length = self.rollout(timestep_limit, add_noise=False)
                eval_returns.append(rewards.sum())
                eval_lengths.append(length)
            else:
                # Do a regular run with parameter perturbations.
                noise_index = self.noise.sample_index(self.policy.num_params)

                perturbation = self.config["noise_stdev"] * self.noise.get(
                    noise_index, self.policy.num_params)

                # These two sampling steps could be done in parallel on
                # different actors letting us update twice as frequently.
                self.policy.set_weights(params + perturbation)
                rewards_pos, lengths_pos = self.rollout(timestep_limit)

                self.policy.set_weights(params - perturbation)
                rewards_neg, lengths_neg = self.rollout(timestep_limit)

                noise_indices.append(noise_index)
                returns.append([rewards_pos.sum(), rewards_neg.sum()])
                sign_returns.append(
                    [np.sign(rewards_pos).sum(),
                     np.sign(rewards_neg).sum()])
                lengths.append([lengths_pos, lengths_neg])

        return Result(
            noise_indices=noise_indices,
            noisy_returns=returns,
            sign_noisy_returns=sign_returns,
            noisy_lengths=lengths,
            eval_returns=eval_returns,
            eval_lengths=eval_lengths)
|
||||
|
||||
|
||||
class ESTrainer(Trainer):
    """Large-scale implementation of Evolution Strategies in Ray."""

    _name = "ES"
    _default_config = DEFAULT_CONFIG

    @override(Trainer)
    def _init(self, config, env_creator):
        """Build the local policy, optimizer, noise table and Worker actors."""
        policy_params = {"action_noise_std": 0.01}

        env = env_creator(config["env_config"])
        from ray.rllib import models
        # NOTE(review): Worker passes config["model"] to get_preprocessor,
        # this call does not; confirm the difference is intentional.
        preprocessor = models.ModelCatalog.get_preprocessor(env)

        self.sess = utils.make_session(single_threaded=False)
        self.policy = policies.GenericPolicy(
            self.sess, env.action_space, env.observation_space, preprocessor,
            config["observation_filter"], config["model"], **policy_params)
        self.optimizer = optimizers.Adam(self.policy, config["stepsize"])
        self.report_length = config["report_length"]

        # Create the shared noise table.
        logger.info("Creating shared noise table.")
        noise_id = create_shared_noise.remote(config["noise_size"])
        self.noise = SharedNoiseTable(ray.get(noise_id))

        # Create the actors.
        # The workers receive the object ID of the noise table, not the
        # array fetched above.
        logger.info("Creating actors.")
        self._workers = [
            Worker.remote(config, policy_params, env_creator, noise_id)
            for _ in range(config["num_workers"])
        ]

        self.episodes_so_far = 0
        self.reward_list = []
        self.tstart = time.time()

    @override(Trainer)
    def _train(self):
        """Run one ES iteration: collect rollouts, estimate gradient, step.

        Returns:
            dict with episode_reward_mean, episode_len_mean,
            timesteps_this_iter and an info sub-dict.
        """
        config = self.config

        theta = self.policy.get_weights()
        assert theta.dtype == np.float32

        # Put the current policy weights in the object store.
        theta_id = ray.put(theta)
        # Use the actors to do rollouts, note that we pass in the ID of the
        # policy weights.
        results, num_episodes, num_timesteps = self._collect_results(
            theta_id, config["episodes_per_batch"], config["train_batch_size"])

        all_noise_indices = []
        all_training_returns = []
        all_training_lengths = []
        all_eval_returns = []
        all_eval_lengths = []

        # Loop over the results.
        for result in results:
            all_eval_returns += result.eval_returns
            all_eval_lengths += result.eval_lengths

            all_noise_indices += result.noise_indices
            all_training_returns += result.noisy_returns
            all_training_lengths += result.noisy_lengths

        assert len(all_eval_returns) == len(all_eval_lengths)
        assert (len(all_noise_indices) == len(all_training_returns) ==
                len(all_training_lengths))

        self.episodes_so_far += num_episodes

        # Assemble the results.
        eval_returns = np.array(all_eval_returns)
        eval_lengths = np.array(all_eval_lengths)
        noise_indices = np.array(all_noise_indices)
        noisy_returns = np.array(all_training_returns)
        noisy_lengths = np.array(all_training_lengths)

        # Process the returns.
        if config["return_proc_mode"] == "centered_rank":
            proc_noisy_returns = utils.compute_centered_ranks(noisy_returns)
        else:
            raise NotImplementedError(config["return_proc_mode"])

        # Compute and take a step.
        # Each noise vector is weighted by the difference between its
        # positive- and negative-perturbation processed returns.
        g, count = utils.batched_weighted_sum(
            proc_noisy_returns[:, 0] - proc_noisy_returns[:, 1],
            (self.noise.get(index, self.policy.num_params)
             for index in noise_indices),
            batch_size=500)
        g /= noisy_returns.size
        assert (g.shape == (self.policy.num_params, ) and g.dtype == np.float32
                and count == len(noise_indices))
        # Compute the new weights theta.
        theta, update_ratio = self.optimizer.update(-g +
                                                    config["l2_coeff"] * theta)
        # Set the new weights in the local copy of the policy.
        self.policy.set_weights(theta)
        # Store the rewards
        if len(all_eval_returns) > 0:
            self.reward_list.append(np.mean(eval_returns))

        # Now sync the filters
        FilterManager.synchronize({
            DEFAULT_POLICY_ID: self.policy.get_filter()
        }, self._workers)

        info = {
            # Both "norm" entries are sums of squares (squared L2 norms).
            "weights_norm": np.square(theta).sum(),
            "grad_norm": np.square(g).sum(),
            "update_ratio": update_ratio,
            "episodes_this_iter": noisy_lengths.size,
            "episodes_so_far": self.episodes_so_far,
        }

        # NOTE(review): if no evaluation rollout has happened yet,
        # reward_list and/or eval_lengths are empty and these means are NaN.
        reward_mean = np.mean(self.reward_list[-self.report_length:])
        result = dict(
            episode_reward_mean=reward_mean,
            episode_len_mean=eval_lengths.mean(),
            timesteps_this_iter=noisy_lengths.sum(),
            info=info)

        return result

    @override(Trainer)
    def compute_action(self, observation):
        """Compute an action for `observation` without updating the filter."""
        return self.policy.compute(observation, update=False)[0]

    @override(Trainer)
    def _stop(self):
        """Terminate all Worker actors."""
        # workaround for https://github.com/ray-project/ray/issues/1516
        for w in self._workers:
            w.__ray_terminate__.remote()

    def _collect_results(self, theta_id, min_episodes, min_timesteps):
        """Launch rollout rounds on all workers until both minimums are met.

        Returns:
            (results, num_episodes, num_timesteps), where the counts cover
            noisy rollouts only (evaluation rollouts are not counted).
        """
        num_episodes, num_timesteps = 0, 0
        results = []
        while num_episodes < min_episodes or num_timesteps < min_timesteps:
            logger.info(
                "Collected {} episodes {} timesteps so far this iter".format(
                    num_episodes, num_timesteps))
            rollout_ids = [
                worker.do_rollouts.remote(theta_id) for worker in self._workers
            ]
            # Get the results of the rollouts.
            for result in ray_get_and_free(rollout_ids):
                results.append(result)
                # Update the number of episodes and the number of timesteps
                # keeping in mind that result.noisy_lengths is a list of lists,
                # where the inner lists have length 2.
                num_episodes += sum(len(pair) for pair in result.noisy_lengths)
                num_timesteps += sum(
                    sum(pair) for pair in result.noisy_lengths)

        return results, num_episodes, num_timesteps

    def __getstate__(self):
        """Return the checkpoint state: weights, filter and episode count."""
        return {
            "weights": self.policy.get_weights(),
            "filter": self.policy.get_filter(),
            "episodes_so_far": self.episodes_so_far,
        }

    def __setstate__(self, state):
        """Restore a __getstate__ dict and push the filter to the workers."""
        self.episodes_so_far = state["episodes_so_far"]
        self.policy.set_weights(state["weights"])
        self.policy.set_filter(state["filter"])
        FilterManager.synchronize({
            DEFAULT_POLICY_ID: self.policy.get_filter()
        }, self._workers)
|
||||
@@ -1,56 +0,0 @@
|
||||
# Code in this file is copied and adapted from
|
||||
# https://github.com/openai/evolution-strategies-starter.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
class Optimizer(object):
    """Base class for the flat-parameter optimizers used by ES.

    Subclasses implement _compute_step(); update() applies the step to the
    weights currently held by the policy `pi`.
    """

    def __init__(self, pi):
        """Remember the policy, its parameter count and reset the step count."""
        self.pi = pi
        self.dim = pi.num_params
        self.t = 0

    def update(self, globalg):
        """Advance one step.

        Returns:
            (new_weights, ratio), where ratio is the norm of the step
            divided by the norm of the current weights.
        """
        # The step counter is bumped first so _compute_step sees the new t.
        self.t += 1
        delta = self._compute_step(globalg)
        weights = self.pi.get_weights()
        step_norm = np.linalg.norm(delta)
        weight_norm = np.linalg.norm(weights)
        return weights + delta, step_norm / weight_norm

    def _compute_step(self, globalg):
        """Return the parameter delta for gradient `globalg` (abstract)."""
        raise NotImplementedError
|
||||
|
||||
|
||||
class SGD(Optimizer):
    """Gradient descent with an exponential moving average of the gradient."""

    def __init__(self, pi, stepsize, momentum=0.9):
        """Start with a zero float32 velocity of the policy's parameter size."""
        Optimizer.__init__(self, pi)
        self.v = np.zeros(self.dim, dtype=np.float32)
        self.stepsize = stepsize
        self.momentum = momentum

    def _compute_step(self, globalg):
        """Update the velocity and return the step `-stepsize * v`."""
        decay = self.momentum
        self.v = decay * self.v + (1. - decay) * globalg
        return -self.stepsize * self.v
|
||||
|
||||
|
||||
class Adam(Optimizer):
    """Adam optimizer over a flat parameter vector."""

    def __init__(self, pi, stepsize, beta1=0.9, beta2=0.999, epsilon=1e-08):
        """Store the hyperparameters and zero both float32 moment vectors."""
        Optimizer.__init__(self, pi)
        self.stepsize = stepsize
        self.beta1, self.beta2 = beta1, beta2
        self.epsilon = epsilon
        self.m = np.zeros(self.dim, dtype=np.float32)
        self.v = np.zeros(self.dim, dtype=np.float32)

    def _compute_step(self, globalg):
        """Update the moment estimates and return the Adam step.

        Relies on `self.t` having been incremented by update() beforehand.
        """
        # Bias-corrected step size for the current step count.
        second_correction = np.sqrt(1 - self.beta2**self.t)
        first_correction = 1 - self.beta1**self.t
        a = self.stepsize * (second_correction / first_correction)

        self.m = self.beta1 * self.m + (1 - self.beta1) * globalg
        self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg)

        denominator = np.sqrt(self.v) + self.epsilon
        return -a * self.m / denominator
|
||||
@@ -1,93 +0,0 @@
|
||||
# Code in this file is copied and adapted from
|
||||
# https://github.com/openai/evolution-strategies-starter.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import gym
|
||||
import numpy as np
|
||||
|
||||
import ray
|
||||
import ray.experimental.tf_utils
|
||||
from ray.rllib.evaluation.sampler import _unbatch_tuple_actions
|
||||
from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.utils.filter import get_filter
|
||||
from ray.rllib.utils import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
|
||||
|
||||
def rollout(policy, env, timestep_limit=None, add_noise=False):
    """Run a single episode of `policy` in `env`.

    Arguments:
        policy: object whose `compute(observation, add_noise=...)` returns a
            sequence with the action as its first element.
        env: environment exposing `spec.max_episode_steps`, `reset()` and
            `step(action)`.
        timestep_limit (int): optional cap on the number of steps. The
            effective cap is the smaller of this value and the env's own
            `max_episode_steps`; whichever of the two is None is ignored.
        add_noise (bool): forwarded to `policy.compute`. If True the policy
            may add noise to its actions; otherwise no action noise is added.

    Returns:
        (rews, t): float32 array of the per-step rewards and the number of
        steps taken.
    """
    env_timestep_limit = env.spec.max_episode_steps
    if timestep_limit is None:
        timestep_limit = env_timestep_limit
    elif env_timestep_limit is not None:
        # Only compare when both limits are set: min(int, None) raises
        # TypeError on Python 3 and silently returns None on Python 2.
        timestep_limit = min(timestep_limit, env_timestep_limit)
    rews = []
    t = 0
    observation = env.reset()
    # A missing (or zero) limit falls back to an effectively unbounded cap.
    for _ in range(timestep_limit or 999999):
        ac = policy.compute(observation, add_noise=add_noise)[0]
        observation, rew, done, _ = env.step(ac)
        rews.append(rew)
        t += 1
        if done:
            break
    rews = np.array(rews, dtype=np.float32)
    return rews, t
|
||||
|
||||
|
||||
class GenericPolicy(object):
    """Deterministic TF policy whose weights are read and set as a flat vector."""

    def __init__(self, sess, action_space, obs_space, preprocessor,
                 observation_filter, model_options, action_noise_std):
        """Build the policy network and its flat-variable accessor in `sess`."""
        self.sess = sess
        self.action_space = action_space
        self.action_noise_std = action_noise_std
        self.preprocessor = preprocessor
        self.observation_filter = get_filter(observation_filter,
                                             self.preprocessor.shape)
        input_shape = [None] + list(self.preprocessor.shape)
        self.inputs = tf.placeholder(tf.float32, input_shape)

        # Policy network.
        dist_class, dist_dim = ModelCatalog.get_action_dist(
            self.action_space, model_options, dist_type="deterministic")
        model = ModelCatalog.get_model({
            "obs": self.inputs
        }, obs_space, action_space, dist_dim, model_options)
        action_dist = dist_class(model.outputs)
        self.sampler = action_dist.sample()

        self.variables = ray.experimental.tf_utils.TensorFlowVariables(
            model.outputs, self.sess)

        # Total number of scalar parameters across all tracked variables.
        self.num_params = sum(
            np.prod(variable.shape.as_list())
            for variable in self.variables.variables.values())
        self.sess.run(tf.global_variables_initializer())

    def compute(self, observation, add_noise=False, update=True):
        """Compute the action for one observation.

        The observation is preprocessed and filtered (the filter is updated
        only when `update` is True). Gaussian noise scaled by
        `action_noise_std` is added only when `add_noise` is True and the
        action space is a Box.
        """
        observation = self.preprocessor.transform(observation)
        observation = self.observation_filter(observation[None], update=update)
        feed = {self.inputs: observation}
        action = self.sess.run(self.sampler, feed_dict=feed)
        action = _unbatch_tuple_actions(action)
        noisy = add_noise and isinstance(self.action_space, gym.spaces.Box)
        if noisy:
            action += np.random.randn(*action.shape) * self.action_noise_std
        return action

    def set_weights(self, x):
        """Load the flat weight vector `x` into the network variables."""
        self.variables.set_flat(x)

    def get_weights(self):
        """Return the network variables as one flat vector."""
        return self.variables.get_flat()

    def get_filter(self):
        """Return the observation filter."""
        return self.observation_filter

    def set_filter(self, observation_filter):
        """Replace the observation filter."""
        self.observation_filter = observation_filter
|
||||
@@ -1,63 +0,0 @@
|
||||
# Code in this file is copied and adapted from
|
||||
# https://github.com/openai/evolution-strategies-starter.
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
from ray.rllib.utils import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
|
||||
|
||||
def compute_ranks(x):
    """Return the rank of every element of 1-D `x`, in [0, len(x)).

    Note: unlike scipy.stats.rankdata, which ranks in [1, len(x)], the
    smallest element here gets rank 0.
    """
    assert x.ndim == 1
    order = x.argsort()
    ranks = np.empty(len(x), dtype=int)
    # The element at sorted position k receives rank k.
    ranks[order] = np.arange(len(x))
    return ranks
|
||||
|
||||
|
||||
def compute_centered_ranks(x):
    """Map the entries of `x` to their ranks, rescaled to [-0.5, 0.5].

    Ranks are computed over the flattened array and reshaped back to
    `x.shape`; the result is float32. The smallest entry maps to -0.5 and
    the largest to 0.5.

    A single-element input returns 0.0 (the centre of the range) instead of
    the NaN that dividing by `x.size - 1 == 0` would produce.
    """
    if x.size == 1:
        return np.zeros(x.shape, dtype=np.float32)
    y = compute_ranks(x.ravel()).reshape(x.shape).astype(np.float32)
    y /= (x.size - 1)
    y -= 0.5
    return y
|
||||
|
||||
|
||||
def make_session(single_threaded):
    """Create a TF session, optionally limited to one thread per op pool.

    With `single_threaded` set, both inter-op and intra-op parallelism are
    limited to 1; otherwise a default session is returned.
    """
    if single_threaded:
        single_thread_config = tf.ConfigProto(
            inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
        return tf.Session(config=single_thread_config)
    return tf.Session()
|
||||
|
||||
|
||||
def itergroups(items, group_size):
    """Yield consecutive tuples of `group_size` items from `items`.

    The last tuple may be shorter when the items do not divide evenly;
    nothing is yielded for an empty input.
    """
    assert group_size >= 1
    pending = []
    for item in items:
        pending.append(item)
        if len(pending) != group_size:
            continue
        yield tuple(pending)
        pending = []
    # Flush the trailing partial group, if any.
    if pending:
        yield tuple(pending)
|
||||
|
||||
|
||||
def batched_weighted_sum(weights, vecs, batch_size):
    """Compute sum_i weights[i] * vecs[i] in chunks of `batch_size`.

    Both inputs are consumed lazily in lockstep, so `vecs` may be a
    generator. Each chunk is converted to float32 before the dot product.

    Returns:
        (total, num_items_summed); `total` stays the integer 0 when there
        are no items.
    """
    total = 0
    num_items_summed = 0
    weight_groups = itergroups(weights, batch_size)
    vec_groups = itergroups(vecs, batch_size)
    for w_chunk, v_chunk in zip(weight_groups, vec_groups):
        assert len(w_chunk) == len(v_chunk) <= batch_size
        w_arr = np.asarray(w_chunk, dtype=np.float32)
        v_arr = np.asarray(v_chunk, dtype=np.float32)
        total += np.dot(w_arr, v_arr)
        num_items_summed += len(w_chunk)
    return total, num_items_summed
|
||||
@@ -1,6 +0,0 @@
|
||||
from ray.rllib.agents.impala.impala import ImpalaTrainer, DEFAULT_CONFIG
|
||||
from ray.rllib.utils import renamed_agent
|
||||
|
||||
# ImpalaTrainer wrapped by renamed_agent and exposed under the "Agent" name
# (presumably a backward-compatible alias for the old class name; confirm
# against ray.rllib.utils.renamed_agent).
ImpalaAgent = renamed_agent(ImpalaTrainer)

__all__ = ["ImpalaAgent", "ImpalaTrainer", "DEFAULT_CONFIG"]
|
||||
@@ -1,164 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from ray.rllib.agents.a3c.a3c_tf_policy import A3CTFPolicy
|
||||
from ray.rllib.agents.impala.vtrace_policy import VTraceTFPolicy
|
||||
from ray.rllib.agents.trainer import Trainer, with_common_config
|
||||
from ray.rllib.agents.trainer_template import build_trainer
|
||||
from ray.rllib.optimizers import AsyncSamplesOptimizer
|
||||
from ray.rllib.optimizers.aso_tree_aggregator import TreeAggregator
|
||||
from ray.rllib.utils.annotations import override
|
||||
from ray.tune.trainable import Trainable
|
||||
from ray.tune.resources import Resources
|
||||
|
||||
# yapf: disable
|
||||
# __sphinx_doc_begin__
|
||||
# IMPALA-specific defaults, layered on top of the common trainer config by
# with_common_config().
DEFAULT_CONFIG = with_common_config({
    # V-trace params (see vtrace.py).
    "vtrace": True,
    "vtrace_clip_rho_threshold": 1.0,
    "vtrace_clip_pg_rho_threshold": 1.0,

    # System params.
    #
    # == Overview of data flow in IMPALA ==
    # 1. Policy evaluation in parallel across `num_workers` actors produces
    #    batches of size `sample_batch_size * num_envs_per_worker`.
    # 2. If enabled, the replay buffer stores and produces batches of size
    #    `sample_batch_size * num_envs_per_worker`.
    # 3. If enabled, the minibatch ring buffer stores and replays batches of
    #    size `train_batch_size` up to `num_sgd_iter` times per batch.
    # 4. The learner thread executes data parallel SGD across `num_gpus` GPUs
    #    on batches of size `train_batch_size`.
    #
    "sample_batch_size": 50,
    "train_batch_size": 500,
    "min_iter_time_s": 10,
    "num_workers": 2,
    # Number of GPUs the learner should use.
    "num_gpus": 1,
    # Set >1 to load data into GPUs in parallel. Increases GPU memory usage
    # proportionally with the number of buffers.
    "num_data_loader_buffers": 1,
    # How many train batches should be retained for minibatching. This
    # setting only has an effect if `num_sgd_iter > 1`.
    "minibatch_buffer_size": 1,
    # Number of passes to make over each train batch.
    "num_sgd_iter": 1,
    # Set >0 to enable experience replay. Saved samples will be replayed with
    # a p:1 proportion to new data samples.
    "replay_proportion": 0.0,
    # Number of sample batches to store for replay. The number of transitions
    # saved total will be (replay_buffer_num_slots * sample_batch_size).
    "replay_buffer_num_slots": 0,
    # Max queue size for train batches feeding into the learner.
    "learner_queue_size": 16,
    # Wait for train batches to be available in the minibatch buffer queue
    # for this many seconds. This may need to be increased, e.g. when
    # training with a slow environment.
    "learner_queue_timeout": 300,
    # Level of queuing for sampling.
    "max_sample_requests_in_flight_per_worker": 2,
    # Max number of workers to broadcast one set of weights to.
    "broadcast_interval": 1,
    # Use intermediate actors for multi-level aggregation. This can make sense
    # if ingesting >2GB/s of samples, or if the data requires decompression.
    "num_aggregation_workers": 0,

    # Learning params.
    "grad_clip": 40.0,
    # Either "adam" or "rmsprop".
    "opt_type": "adam",
    "lr": 0.0005,
    "lr_schedule": None,
    # RMSProp settings. The V-trace policy's choose_optimizer() passes these
    # to the RMSProp optimizer, i.e. only when `opt_type` is not "adam".
    "decay": 0.99,
    "momentum": 0.0,
    "epsilon": 0.1,
    # Balancing the three losses (policy, value function, entropy).
    "vf_loss_coeff": 0.5,
    "entropy_coeff": 0.01,
    "entropy_coeff_schedule": None,

    # Use fake (infinite speed) sampler for testing.
    "_fake_sampler": False,
})
|
||||
# __sphinx_doc_end__
|
||||
# yapf: enable
|
||||
|
||||
|
||||
def choose_policy(config):
    """Return the policy class matching the `vtrace` config flag.

    Args:
        config: Trainer config dict; only `config["vtrace"]` is read.

    Returns:
        `VTraceTFPolicy` when V-trace is enabled, `A3CTFPolicy` otherwise.
    """
    return VTraceTFPolicy if config["vtrace"] else A3CTFPolicy
|
||||
|
||||
|
||||
def validate_config(config):
    """Reject trainer configs that carry a negative entropy coefficient.

    Args:
        config: Trainer config dict; only `config["entropy_coeff"]` is read.

    Raises:
        DeprecationWarning: If `config["entropy_coeff"]` is below zero.
    """
    entropy_coeff = config["entropy_coeff"]
    if entropy_coeff < 0:
        raise DeprecationWarning("entropy_coeff must be >= 0")
|
||||
|
||||
|
||||
def defer_make_workers(trainer, env_creator, policy, config):
    """Build the trainer's worker set with 0 as the final argument.

    Worker creation is deferred until after the optimizer has been created;
    `make_aggregators_and_optimizer` later calls `workers.add_workers`.

    Args:
        trainer: Object exposing `_make_workers`.
        env_creator: Forwarded unchanged to `trainer._make_workers`.
        policy: Forwarded unchanged to `trainer._make_workers`.
        config: Forwarded unchanged to `trainer._make_workers`.

    Returns:
        Whatever `trainer._make_workers` returns.
    """
    # Presumably the number of workers to create up front -- TODO confirm
    # against Trainer._make_workers.
    initial_worker_count = 0
    return trainer._make_workers(env_creator, policy, config,
                                 initial_worker_count)
|
||||
|
||||
|
||||
def make_aggregators_and_optimizer(workers, config):
    """Create the AsyncSamplesOptimizer and, if configured, its aggregators.

    Order of operations: aggregator actors are pre-created first (when
    `num_aggregation_workers > 0`), then the workers are added, then the
    optimizer is built, and finally the pre-created aggregators are handed
    to the optimizer's aggregator.

    Args:
        workers: Worker set; must expose `add_workers`.
        config: Trainer config dict. `config["optimizer"]` is expanded as
            extra keyword arguments for the optimizer.

    Returns:
        The constructed `AsyncSamplesOptimizer`.
    """
    aggregators = None
    if config["num_aggregation_workers"] > 0:
        # Create co-located aggregator actors first for placement pref
        aggregators = TreeAggregator.precreate_aggregators(
            config["num_aggregation_workers"])

    workers.add_workers(config["num_workers"])

    optimizer = AsyncSamplesOptimizer(
        workers,
        lr=config["lr"],
        num_gpus=config["num_gpus"],
        sample_batch_size=config["sample_batch_size"],
        train_batch_size=config["train_batch_size"],
        replay_buffer_num_slots=config["replay_buffer_num_slots"],
        replay_proportion=config["replay_proportion"],
        num_data_loader_buffers=config["num_data_loader_buffers"],
        max_sample_requests_in_flight_per_worker=config[
            "max_sample_requests_in_flight_per_worker"],
        broadcast_interval=config["broadcast_interval"],
        num_sgd_iter=config["num_sgd_iter"],
        minibatch_buffer_size=config["minibatch_buffer_size"],
        num_aggregation_workers=config["num_aggregation_workers"],
        learner_queue_size=config["learner_queue_size"],
        learner_queue_timeout=config["learner_queue_timeout"],
        **config["optimizer"])

    if not aggregators:
        return optimizer

    # Assign the pre-created aggregators to the optimizer
    optimizer.aggregator.init(aggregators)
    return optimizer
|
||||
|
||||
|
||||
class OverrideDefaultResourceRequest(object):
    """Mixin overriding `Trainable.default_resource_request` for IMPALA."""

    @classmethod
    @override(Trainable)
    def default_resource_request(cls, config):
        """Compute the resources needed for the given config.

        The user config is merged over `cls._default_config` and validated
        with `Trainer._validate_config`. Each aggregation worker adds one
        extra CPU on top of the per-worker CPUs.

        Args:
            config: User-supplied config overrides.

        Returns:
            A `Resources` instance.
        """
        merged = dict(cls._default_config, **config)
        Trainer._validate_config(merged)

        num_workers = merged["num_workers"]
        worker_cpus = merged["num_cpus_per_worker"] * num_workers
        worker_gpus = merged["num_gpus_per_worker"] * num_workers
        return Resources(
            cpu=merged["num_cpus_for_driver"],
            gpu=merged["num_gpus"],
            extra_cpu=worker_cpus + merged["num_aggregation_workers"],
            extra_gpu=worker_gpus)
|
||||
|
||||
|
||||
# IMPALA trainer class assembled from the trainer template. The hooks
# defined in this module are plugged in: config validation, policy selection
# by the `vtrace` flag, deferred worker creation, and construction of the
# optimizer together with its aggregators.
ImpalaTrainer = build_trainer(
    name="IMPALA",
    default_config=DEFAULT_CONFIG,
    default_policy=VTraceTFPolicy,
    validate_config=validate_config,
    get_policy_class=choose_policy,
    make_workers=defer_make_workers,
    make_policy_optimizer=make_aggregators_and_optimizer,
    mixins=[OverrideDefaultResourceRequest])
|
||||
@@ -1,409 +0,0 @@
|
||||
# Copyright 2018 Google LLC
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# https://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Functions to compute V-trace off-policy actor critic targets.
|
||||
|
||||
For details and theory see:
|
||||
|
||||
"IMPALA: Scalable Distributed Deep-RL with
|
||||
Importance Weighted Actor-Learner Architectures"
|
||||
by Espeholt, Soyer, Munos et al.
|
||||
|
||||
See https://arxiv.org/abs/1802.01561 for the full paper.
|
||||
|
||||
In addition to the original paper's code, changes have been made
|
||||
to support MultiDiscrete action spaces. behaviour_policy_logits,
|
||||
target_policy_logits and actions parameters in the entry point
|
||||
multi_from_logits method accept lists of tensors instead of just
|
||||
tensors.
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import collections
|
||||
|
||||
from ray.rllib.models.tf.tf_action_dist import Categorical
|
||||
from ray.rllib.utils import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
|
||||
# Result of from_logits() / multi_from_logits(): the V-trace targets and
# advantages plus the intermediate log importance weights and the
# behaviour/target action log-probabilities.
VTraceFromLogitsReturns = collections.namedtuple("VTraceFromLogitsReturns", [
    "vs", "pg_advantages", "log_rhos", "behaviour_action_log_probs",
    "target_action_log_probs"
])

# Result of from_importance_weights(): V-trace targets and policy-gradient
# advantages only.
VTraceReturns = collections.namedtuple("VTraceReturns", "vs pg_advantages")
|
||||
|
||||
|
||||
def log_probs_from_logits_and_actions(policy_logits,
                                      actions,
                                      dist_class=Categorical):
    """Compute action log-probs for a single logits/actions pair.

    Wraps both arguments in one-element lists, delegates to
    `multi_log_probs_from_logits_and_actions` and returns the only entry of
    its result.

    Args:
        policy_logits: Logits tensor for one action component.
        actions: Actions tensor for the same component.
        dist_class: Action distribution class for the logits.

    Returns:
        The log-probability tensor for the given actions.
    """
    per_component = multi_log_probs_from_logits_and_actions(
        [policy_logits], [actions], dist_class)
    return per_component[0]
|
||||
|
||||
|
||||
def multi_log_probs_from_logits_and_actions(policy_logits, actions,
                                            dist_class):
    """Computes action log-probs from policy logits and actions.

    In the notation used throughout documentation and comments, T refers to
    the time dimension ranging from 0 to T-1. B refers to the batch size and
    ACTION_SPACE refers to the list of numbers each representing a number of
    actions.

    Args:
      policy_logits: A list with length of ACTION_SPACE of float32
        tensors of shapes
        [T, B, ACTION_SPACE[0]],
        ...,
        [T, B, ACTION_SPACE[-1]]
        with un-normalized log-probabilities parameterizing a softmax
        policy.
      actions: A list with length of ACTION_SPACE of
        tensors of shapes
        [T, B, ...],
        ...,
        [T, B, ...]
        with actions.
      dist_class: Action distribution class; it is instantiated with the
        flattened logits and its `logp` is evaluated on the flattened
        actions.

    Returns:
      A list with length of ACTION_SPACE of float32
        tensors of shapes
        [T, B],
        ...,
        [T, B]
      corresponding to the sampling log probability
      of the chosen action w.r.t. the policy.
    """

    log_probs = []
    for i, component_logits in enumerate(policy_logits):
        p_shape = tf.shape(component_logits)
        component_actions = actions[i]
        a_shape = tf.shape(component_actions)

        # Merge the leading [T, B] axes into one so the distribution sees a
        # flat batch; trailing axes are kept as they are.
        flat_logits_shape = tf.concat([[-1], p_shape[2:]], axis=0)
        policy_logits_flat = tf.reshape(component_logits, flat_logits_shape)
        flat_actions_shape = tf.concat([[-1], a_shape[2:]], axis=0)
        actions_flat = tf.reshape(component_actions, flat_actions_shape)

        flat_logp = dist_class(policy_logits_flat).logp(actions_flat)
        # Restore the leading [T, B] axes.
        log_probs.append(tf.reshape(flat_logp, a_shape[:2]))

    return log_probs
|
||||
|
||||
|
||||
def from_logits(behaviour_policy_logits,
                target_policy_logits,
                actions,
                discounts,
                rewards,
                values,
                bootstrap_value,
                dist_class=Categorical,
                clip_rho_threshold=1.0,
                clip_pg_rho_threshold=1.0,
                name="vtrace_from_logits"):
    """multi_from_logits wrapper used only for tests.

    Wraps the logits and actions in one-element lists, calls
    `multi_from_logits`, and squeezes the leading axis off the returned
    action log-probabilities so each is a single tensor again.
    """

    multi_res = multi_from_logits(
        [behaviour_policy_logits], [target_policy_logits], [actions],
        discounts,
        rewards,
        values,
        bootstrap_value,
        dist_class,
        clip_rho_threshold=clip_rho_threshold,
        clip_pg_rho_threshold=clip_pg_rho_threshold,
        name=name)

    behaviour_log_probs = tf.squeeze(
        multi_res.behaviour_action_log_probs, axis=0)
    target_log_probs = tf.squeeze(multi_res.target_action_log_probs, axis=0)

    return VTraceFromLogitsReturns(
        vs=multi_res.vs,
        pg_advantages=multi_res.pg_advantages,
        log_rhos=multi_res.log_rhos,
        behaviour_action_log_probs=behaviour_log_probs,
        target_action_log_probs=target_log_probs,
    )
|
||||
|
||||
|
||||
def multi_from_logits(behaviour_policy_logits,
                      target_policy_logits,
                      actions,
                      discounts,
                      rewards,
                      values,
                      bootstrap_value,
                      dist_class,
                      clip_rho_threshold=1.0,
                      clip_pg_rho_threshold=1.0,
                      name="vtrace_from_logits"):
    r"""V-trace for softmax policies.

    Calculates V-trace actor critic targets for softmax policies as
    described in

    "IMPALA: Scalable Distributed Deep-RL with
    Importance Weighted Actor-Learner Architectures"
    by Espeholt, Soyer, Munos et al.

    Target policy refers to the policy we are interested in improving and
    behaviour policy refers to the policy that generated the given
    rewards and actions.

    In the notation used throughout documentation and comments, T refers to
    the time dimension ranging from 0 to T-1. B refers to the batch size and
    ACTION_SPACE refers to the list of numbers each representing a number of
    actions.

    The input sequences are not modified; converted tensors are collected
    in new lists.

    Args:
      behaviour_policy_logits: A list with length of ACTION_SPACE of float32
        tensors of shapes
        [T, B, ACTION_SPACE[0]],
        ...,
        [T, B, ACTION_SPACE[-1]]
        with un-normalized log-probabilities parameterizing the softmax
        behaviour policy.
      target_policy_logits: A list with length of ACTION_SPACE of float32
        tensors of shapes
        [T, B, ACTION_SPACE[0]],
        ...,
        [T, B, ACTION_SPACE[-1]]
        with un-normalized log-probabilities parameterizing the softmax
        target policy.
      actions: A list with length of ACTION_SPACE of
        tensors of shapes
        [T, B, ...],
        ...,
        [T, B, ...]
        with actions sampled from the behaviour policy.
      discounts: A float32 tensor of shape [T, B] with the discount
        encountered when following the behaviour policy.
      rewards: A float32 tensor of shape [T, B] with the rewards generated by
        following the behaviour policy.
      values: A float32 tensor of shape [T, B] with the value function
        estimates wrt. the target policy.
      bootstrap_value: A float32 of shape [B] with the value function
        estimate at time T.
      dist_class: action distribution class for the logits.
      clip_rho_threshold: A scalar float32 tensor with the clipping threshold
        for importance weights (rho) when calculating the baseline targets
        (vs). rho^bar in the paper.
      clip_pg_rho_threshold: A scalar float32 tensor with the clipping
        threshold on rho_s in
        \rho_s \delta log \pi(a|x) (r + \gamma v_{s+1} - V(x_s)).
      name: The name scope that all V-trace operations will be created in.

    Returns:
      A `VTraceFromLogitsReturns` namedtuple with the following fields:
        vs: A float32 tensor of shape [T, B]. Can be used as target to train
          a baseline (V(x_t) - vs_t)^2.
        pg_advantages: A float32 tensor of shape [T, B]. Can be used as an
          estimate of the advantage in the calculation of policy gradients.
        log_rhos: A float32 tensor of shape [T, B] containing the log
          importance sampling weights (log rhos), summed over all action
          components.
        behaviour_action_log_probs: A list (one entry per action component)
          of float32 tensors of shape [T, B] containing behaviour policy
          action log probabilities (log \mu(a_t)).
        target_action_log_probs: A list (one entry per action component) of
          float32 tensors of shape [T, B] containing target policy action
          log probabilities (log \pi(a_t)).

    Raises:
      ValueError: If the two logits sequences differ in length, or if any
        logits tensor does not have rank 3.
    """

    if len(behaviour_policy_logits) != len(target_policy_logits):
        raise ValueError(
            "behaviour_policy_logits and target_policy_logits must have the "
            "same number of entries, got {} and {}".format(
                len(behaviour_policy_logits), len(target_policy_logits)))

    # Collect converted tensors in fresh lists instead of assigning back into
    # the arguments: the caller's sequences stay untouched and tuples work.
    behaviour_logits_list = []
    target_logits_list = []
    for behaviour_logits, target_logits in zip(behaviour_policy_logits,
                                               target_policy_logits):
        behaviour_logits = tf.convert_to_tensor(
            behaviour_logits, dtype=tf.float32)
        target_logits = tf.convert_to_tensor(target_logits, dtype=tf.float32)

        # Make sure tensor ranks are as expected.
        # The rest will be checked by from_importance_weights.
        behaviour_logits.shape.assert_has_rank(3)
        target_logits.shape.assert_has_rank(3)

        behaviour_logits_list.append(behaviour_logits)
        target_logits_list.append(target_logits)

    with tf.name_scope(
            name,
            values=[
                behaviour_logits_list, target_logits_list, actions,
                discounts, rewards, values, bootstrap_value
            ]):
        target_action_log_probs = multi_log_probs_from_logits_and_actions(
            target_logits_list, actions, dist_class)
        behaviour_action_log_probs = multi_log_probs_from_logits_and_actions(
            behaviour_logits_list, actions, dist_class)

        log_rhos = get_log_rhos(target_action_log_probs,
                                behaviour_action_log_probs)

        vtrace_returns = from_importance_weights(
            log_rhos=log_rhos,
            discounts=discounts,
            rewards=rewards,
            values=values,
            bootstrap_value=bootstrap_value,
            clip_rho_threshold=clip_rho_threshold,
            clip_pg_rho_threshold=clip_pg_rho_threshold)

        return VTraceFromLogitsReturns(
            log_rhos=log_rhos,
            behaviour_action_log_probs=behaviour_action_log_probs,
            target_action_log_probs=target_action_log_probs,
            **vtrace_returns._asdict())
|
||||
|
||||
|
||||
def from_importance_weights(log_rhos,
                            discounts,
                            rewards,
                            values,
                            bootstrap_value,
                            clip_rho_threshold=1.0,
                            clip_pg_rho_threshold=1.0,
                            name="vtrace_from_importance_weights"):
    r"""V-trace from log importance weights.

    Calculates V-trace actor critic targets as described in

    "IMPALA: Scalable Distributed Deep-RL with
    Importance Weighted Actor-Learner Architectures"
    by Espeholt, Soyer, Munos et al.

    In the notation used throughout documentation and comments, T refers to
    the time dimension ranging from 0 to T-1. B refers to the batch size.
    This code also supports the case where all tensors have the same number
    of additional dimensions, e.g., `rewards` is [T, B, C], `values` is
    [T, B, C], `bootstrap_value` is [B, C].

    Besides computing the targets, this function registers TensorFlow
    summaries about the clipped importance weights when
    `clip_rho_threshold` is not None.

    Args:
      log_rhos: A float32 tensor of shape [T, B] representing the
        log importance sampling weights, i.e.
        log(target_policy(a) / behaviour_policy(a)). V-trace performs
        operations on rhos in log-space for numerical stability.
      discounts: A float32 tensor of shape [T, B] with discounts encountered
        when following the behaviour policy.
      rewards: A float32 tensor of shape [T, B] containing rewards generated
        by following the behaviour policy.
      values: A float32 tensor of shape [T, B] with the value function
        estimates wrt. the target policy.
      bootstrap_value: A float32 of shape [B] with the value function
        estimate at time T.
      clip_rho_threshold: A scalar float32 tensor with the clipping threshold
        for importance weights (rho) when calculating the baseline targets
        (vs). rho^bar in the paper. If None, no clipping is applied.
      clip_pg_rho_threshold: A scalar float32 tensor with the clipping
        threshold on rho_s in
        \rho_s \delta log \pi(a|x) (r + \gamma v_{s+1} - V(x_s)). If
        None, no clipping is applied.
      name: The name scope that all V-trace operations will be created in.

    Returns:
      A VTraceReturns namedtuple (vs, pg_advantages) where:
        vs: A float32 tensor of shape [T, B]. Can be used as target to
          train a baseline (V(x_t) - vs_t)^2.
        pg_advantages: A float32 tensor of shape [T, B]. Can be used as the
          advantage in the calculation of policy gradients.
      Both fields are wrapped in `tf.stop_gradient`.
    """
    log_rhos = tf.convert_to_tensor(log_rhos, dtype=tf.float32)
    discounts = tf.convert_to_tensor(discounts, dtype=tf.float32)
    rewards = tf.convert_to_tensor(rewards, dtype=tf.float32)
    values = tf.convert_to_tensor(values, dtype=tf.float32)
    bootstrap_value = tf.convert_to_tensor(bootstrap_value, dtype=tf.float32)
    if clip_rho_threshold is not None:
        clip_rho_threshold = tf.convert_to_tensor(
            clip_rho_threshold, dtype=tf.float32)
    if clip_pg_rho_threshold is not None:
        clip_pg_rho_threshold = tf.convert_to_tensor(
            clip_pg_rho_threshold, dtype=tf.float32)

    # Make sure tensor ranks are consistent.
    rho_rank = log_rhos.shape.ndims  # Usually 2.
    values.shape.assert_has_rank(rho_rank)
    # The bootstrap value has no time axis, hence one rank less.
    bootstrap_value.shape.assert_has_rank(rho_rank - 1)
    discounts.shape.assert_has_rank(rho_rank)
    rewards.shape.assert_has_rank(rho_rank)
    if clip_rho_threshold is not None:
        clip_rho_threshold.shape.assert_has_rank(0)
    if clip_pg_rho_threshold is not None:
        clip_pg_rho_threshold.shape.assert_has_rank(0)

    with tf.name_scope(
            name,
            values=[log_rhos, discounts, rewards, values, bootstrap_value]):
        rhos = tf.exp(log_rhos)
        if clip_rho_threshold is not None:
            clipped_rhos = tf.minimum(
                clip_rho_threshold, rhos, name="clipped_rhos")

            # Diagnostics: histogram of rhos capped at 1000, the number of
            # entries equal to the threshold after clipping, and the total
            # number of entries.
            tf.summary.histogram("clipped_rhos_1000", tf.minimum(1000.0, rhos))
            tf.summary.scalar(
                "num_of_clipped_rhos",
                tf.reduce_sum(
                    tf.cast(
                        tf.equal(clipped_rhos, clip_rho_threshold), tf.int32)))
            tf.summary.scalar("size_of_clipped_rhos", tf.size(clipped_rhos))
        else:
            clipped_rhos = rhos

        # Trace-cutting coefficients: rhos clipped at 1.
        cs = tf.minimum(1.0, rhos, name="cs")
        # Append bootstrapped value to get [v1, ..., v_t+1]
        values_t_plus_1 = tf.concat(
            [values[1:], tf.expand_dims(bootstrap_value, 0)], axis=0)
        # Importance-weighted temporal-difference errors.
        deltas = clipped_rhos * (
            rewards + discounts * values_t_plus_1 - values)

        # All sequences are reversed, computation starts from the back.
        sequences = (
            tf.reverse(discounts, axis=[0]),
            tf.reverse(cs, axis=[0]),
            tf.reverse(deltas, axis=[0]),
        )

        # V-trace vs are calculated through a scan from the back to the
        # beginning of the given trajectory.
        def scanfunc(acc, sequence_item):
            # One backward step: acc holds the result of the following
            # timestep (the sequences are reversed).
            discount_t, c_t, delta_t = sequence_item
            return delta_t + discount_t * c_t * acc

        # The recursion starts from zero, shaped like the bootstrap value.
        initial_values = tf.zeros_like(bootstrap_value)
        vs_minus_v_xs = tf.scan(
            fn=scanfunc,
            elems=sequences,
            initializer=initial_values,
            parallel_iterations=1,
            back_prop=False,
            name="scan")
        # Reverse the results back to original order.
        vs_minus_v_xs = tf.reverse(vs_minus_v_xs, [0], name="vs_minus_v_xs")

        # Add V(x_s) to get v_s.
        vs = tf.add(vs_minus_v_xs, values, name="vs")

        # Advantage for policy gradient.
        vs_t_plus_1 = tf.concat(
            [vs[1:], tf.expand_dims(bootstrap_value, 0)], axis=0)
        if clip_pg_rho_threshold is not None:
            clipped_pg_rhos = tf.minimum(
                clip_pg_rho_threshold, rhos, name="clipped_pg_rhos")
        else:
            clipped_pg_rhos = rhos
        pg_advantages = (
            clipped_pg_rhos * (rewards + discounts * vs_t_plus_1 - values))

        # Make sure no gradients backpropagated through the returned values.
        return VTraceReturns(
            vs=tf.stop_gradient(vs),
            pg_advantages=tf.stop_gradient(pg_advantages))
|
||||
|
||||
|
||||
def get_log_rhos(target_action_log_probs, behaviour_action_log_probs):
    """Compute the V-trace log importance weights (log_rhos).

    The per-component log-probabilities of the target and behaviour policies
    for multi-discrete actions are stacked, subtracted (target minus
    behaviour) and summed over the component axis.
    """
    stacked_target = tf.stack(target_action_log_probs)
    stacked_behaviour = tf.stack(behaviour_action_log_probs)
    return tf.reduce_sum(stacked_target - stacked_behaviour, axis=0)
|
||||
@@ -1,303 +0,0 @@
|
||||
"""Adapted from A3CTFPolicy to add V-trace.
|
||||
|
||||
Keep in sync with changes to A3CTFPolicy and VtraceSurrogatePolicy."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
import logging
|
||||
import gym
|
||||
|
||||
import ray
|
||||
from ray.rllib.agents.impala import vtrace
|
||||
from ray.rllib.models.tf.tf_action_dist import Categorical
|
||||
from ray.rllib.policy.sample_batch import SampleBatch
|
||||
from ray.rllib.policy.tf_policy_template import build_tf_policy
|
||||
from ray.rllib.policy.tf_policy import LearningRateSchedule, \
|
||||
EntropyCoeffSchedule
|
||||
from ray.rllib.utils.explained_variance import explained_variance
|
||||
from ray.rllib.utils import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
BEHAVIOUR_LOGITS = "behaviour_logits"
|
||||
|
||||
|
||||
class VTraceLoss(object):
    # Builds the IMPALA loss terms as attributes: `vtrace_returns`,
    # `value_targets`, `pi_loss`, `vf_loss`, `entropy` and `total_loss`.
    def __init__(self,
                 actions,
                 actions_logp,
                 actions_entropy,
                 dones,
                 behaviour_logits,
                 target_logits,
                 discount,
                 rewards,
                 values,
                 bootstrap_value,
                 dist_class,
                 valid_mask,
                 vf_loss_coeff=0.5,
                 entropy_coeff=0.01,
                 clip_rho_threshold=1.0,
                 clip_pg_rho_threshold=1.0):
        """Policy gradient loss with vtrace importance weighting.

        VTraceLoss takes tensors of shape [T, B, ...], where `B` is the
        batch_size. The reason we need to know `B` is for V-trace to properly
        handle episode cut boundaries.

        Args:
            actions: An int|float32 tensor of shape [T, B, ACTION_SPACE].
            actions_logp: A float32 tensor of shape [T, B].
            actions_entropy: A float32 tensor of shape [T, B].
            dones: A bool tensor of shape [T, B].
            behaviour_logits: A list with length of ACTION_SPACE of float32
                tensors of shapes
                [T, B, ACTION_SPACE[0]],
                ...,
                [T, B, ACTION_SPACE[-1]]
            target_logits: A list with length of ACTION_SPACE of float32
                tensors of shapes
                [T, B, ACTION_SPACE[0]],
                ...,
                [T, B, ACTION_SPACE[-1]]
            discount: A float32 scalar.
            rewards: A float32 tensor of shape [T, B].
            values: A float32 tensor of shape [T, B].
            bootstrap_value: A float32 tensor of shape [B].
            dist_class: action distribution class for logits.
            valid_mask: A bool tensor of valid RNN input elements (#2992).
            vf_loss_coeff: Weight of the value-function loss in
                `total_loss`.
            entropy_coeff: Weight of the entropy term that is subtracted
                from `total_loss`.
            clip_rho_threshold: Forwarded (cast to float32) to
                `vtrace.multi_from_logits`.
            clip_pg_rho_threshold: Forwarded (cast to float32) to
                `vtrace.multi_from_logits`.
        """

        # Compute vtrace on the CPU for better perf.
        with tf.device("/cpu:0"):
            self.vtrace_returns = vtrace.multi_from_logits(
                behaviour_policy_logits=behaviour_logits,
                target_policy_logits=target_logits,
                # One actions tensor per action component.
                actions=tf.unstack(actions, axis=2),
                # Zero discount wherever the episode is done.
                discounts=tf.to_float(~dones) * discount,
                rewards=rewards,
                values=values,
                bootstrap_value=bootstrap_value,
                dist_class=dist_class,
                clip_rho_threshold=tf.cast(clip_rho_threshold, tf.float32),
                clip_pg_rho_threshold=tf.cast(clip_pg_rho_threshold,
                                              tf.float32))
            self.value_targets = self.vtrace_returns.vs

        # The policy gradients loss
        self.pi_loss = -tf.reduce_sum(
            tf.boolean_mask(actions_logp * self.vtrace_returns.pg_advantages,
                            valid_mask))

        # The baseline loss
        delta = tf.boolean_mask(values - self.vtrace_returns.vs, valid_mask)
        self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta))

        # The entropy loss
        self.entropy = tf.reduce_sum(
            tf.boolean_mask(actions_entropy, valid_mask))

        # The summed weighted loss
        self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff -
                           self.entropy * entropy_coeff)
|
||||
|
||||
|
||||
def _make_time_major(policy, tensor, drop_last=False):
    """Swaps batch and trajectory axis.

    The flat leading axis of `tensor` is split into [B, T] and then
    transposed to [T, B]; all trailing axes are left as they are.

    Arguments:
        policy: Policy reference. Its `state_in`, `seq_lens` and
            `config["sample_batch_size"]` determine how the leading axis is
            split into B and T.
        tensor: A tensor or list of tensors to reshape.
        drop_last: A bool indicating whether to drop the last
            trajectory item.

    Returns:
        res: A tensor with swapped axes or a list of tensors with
            swapped axes.
    """
    if isinstance(tensor, list):
        # Apply the same transformation to every element.
        return [_make_time_major(policy, t, drop_last) for t in tensor]

    if policy.state_in:
        # With state inputs, B is the number of sequences and T follows from
        # the flat length.
        B = tf.shape(policy.seq_lens)[0]
        T = tf.shape(tensor)[0] // B
    else:
        # Important: chop the tensor into batches at known episode cut
        # boundaries. TODO(ekl) this is kind of a hack
        T = policy.config["sample_batch_size"]
        B = tf.shape(tensor)[0] // T
    rs = tf.reshape(tensor, tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0))

    # swap B and T axes
    # (int(tf.shape(tensor).shape[0]) is the static rank of `tensor`; the
    # reshaped tensor has one more axis, so the permutation covers
    # 0..rank inclusive.)
    res = tf.transpose(
        rs, [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0]))))

    if drop_last:
        return res[:-1]
    return res
|
||||
|
||||
|
||||
def build_vtrace_loss(policy, batch_tensors):
    """Build the V-trace loss for `policy` and return its total loss.

    The constructed `VTraceLoss` is stored on `policy.loss`.

    Arguments:
        policy: Policy whose model outputs, value function, action
            distribution and config are used to build the loss.
        batch_tensors: Mapping holding the actions, dones, rewards and
            behaviour-logits tensors of the batch.

    Returns:
        The `total_loss` tensor of the constructed `VTraceLoss`.
    """
    # Determine how the flat logits are split into per-component logits.
    if isinstance(policy.action_space, gym.spaces.Discrete):
        is_multidiscrete = False
        output_hidden_shape = [policy.action_space.n]
    elif isinstance(policy.action_space,
                    gym.spaces.multi_discrete.MultiDiscrete):
        is_multidiscrete = True
        output_hidden_shape = policy.action_space.nvec.astype(np.int32)
    else:
        # Any other action space: the logits are kept as a single piece.
        is_multidiscrete = False
        output_hidden_shape = 1

    def make_time_major(*args, **kw):
        # Shorthand binding `policy` as the first argument.
        return _make_time_major(policy, *args, **kw)

    actions = batch_tensors[SampleBatch.ACTIONS]
    dones = batch_tensors[SampleBatch.DONES]
    rewards = batch_tensors[SampleBatch.REWARDS]
    behaviour_logits = batch_tensors[BEHAVIOUR_LOGITS]
    unpacked_behaviour_logits = tf.split(
        behaviour_logits, output_hidden_shape, axis=1)
    unpacked_outputs = tf.split(policy.model_out, output_hidden_shape, axis=1)
    action_dist = policy.action_dist
    values = policy.value_function

    if policy.state_in:
        # NOTE(review): the "- 1" presumably matches the drop_last=True
        # applied to the mask below -- confirm.
        max_seq_len = tf.reduce_max(policy.seq_lens) - 1
        mask = tf.sequence_mask(policy.seq_lens, max_seq_len)
        mask = tf.reshape(mask, [-1])
    else:
        # No state inputs: every element counts as valid.
        mask = tf.ones_like(rewards)

    # Prepare actions for loss
    loss_actions = actions if is_multidiscrete else tf.expand_dims(
        actions, axis=1)

    # Inputs are reshaped from [B * T] => [T - 1, B] for V-trace calc.
    policy.loss = VTraceLoss(
        actions=make_time_major(loss_actions, drop_last=True),
        actions_logp=make_time_major(
            action_dist.logp(actions), drop_last=True),
        actions_entropy=make_time_major(
            action_dist.multi_entropy(), drop_last=True),
        dones=make_time_major(dones, drop_last=True),
        behaviour_logits=make_time_major(
            unpacked_behaviour_logits, drop_last=True),
        target_logits=make_time_major(unpacked_outputs, drop_last=True),
        discount=policy.config["gamma"],
        rewards=make_time_major(rewards, drop_last=True),
        values=make_time_major(values, drop_last=True),
        # The value at the last (dropped) timestep is the bootstrap value.
        bootstrap_value=make_time_major(values)[-1],
        dist_class=Categorical if is_multidiscrete else policy.dist_class,
        valid_mask=make_time_major(mask, drop_last=True),
        vf_loss_coeff=policy.config["vf_loss_coeff"],
        entropy_coeff=policy.entropy_coeff,
        clip_rho_threshold=policy.config["vtrace_clip_rho_threshold"],
        clip_pg_rho_threshold=policy.config["vtrace_clip_pg_rho_threshold"])

    return policy.loss.total_loss
|
||||
|
||||
|
||||
def stats(policy, batch_tensors):
    """Assemble the training statistics reported for the policy.

    Arguments:
        policy: Policy whose loss, learning rate, entropy coefficient,
            variables and value function are reported.
        batch_tensors: Unused.

    Returns:
        A dict mapping stat names to tensors.
    """
    values_batched = _make_time_major(
        policy, policy.value_function, drop_last=policy.config["vtrace"])
    loss = policy.loss

    report = {}
    report["cur_lr"] = tf.cast(policy.cur_lr, tf.float64)
    report["policy_loss"] = loss.pi_loss
    report["entropy"] = loss.entropy
    report["entropy_coeff"] = tf.cast(policy.entropy_coeff, tf.float64)
    report["var_gnorm"] = tf.global_norm(policy.var_list)
    report["vf_loss"] = loss.vf_loss

    flat_targets = tf.reshape(loss.value_targets, [-1])
    flat_values = tf.reshape(values_batched, [-1])
    report["vf_explained_var"] = explained_variance(flat_targets, flat_values)
    return report
|
||||
|
||||
|
||||
def grad_stats(policy, grads):
    """Report the global norm of `grads` under the key "grad_gnorm"."""
    grad_global_norm = tf.global_norm(grads)
    return {"grad_gnorm": grad_global_norm}
|
||||
|
||||
|
||||
def postprocess_trajectory(policy,
                           sample_batch,
                           other_agent_batches=None,
                           episode=None):
    """Drop the next-observation column from `sample_batch`.

    The column is not used, so removing it saves some bandwidth. The batch
    is modified in place and returned.
    """
    batch_columns = sample_batch.data
    del batch_columns[SampleBatch.NEXT_OBS]
    return sample_batch
|
||||
|
||||
|
||||
def add_behaviour_logits(policy):
    """Return extra fetches exposing the model output as behaviour logits."""
    extra_fetches = {BEHAVIOUR_LOGITS: policy.model_out}
    return extra_fetches
|
||||
|
||||
|
||||
def validate_config(policy, obs_space, action_space, config):
    """Check that V-trace is only used with `truncate_episodes` batches.

    Arguments:
        policy: Unused.
        obs_space: Unused.
        action_space: Unused.
        config: Policy config; `vtrace` and `batch_mode` are read.

    Raises:
        AssertionError: If V-trace is enabled with another batch mode.
    """
    if not config["vtrace"]:
        return
    assert config["batch_mode"] == "truncate_episodes", \
        "Must use `truncate_episodes` batch mode with V-trace."
|
||||
|
||||
|
||||
def choose_optimizer(policy, config):
    """Create the TensorFlow optimizer selected by `opt_type`.

    Arguments:
        policy: Policy providing `config["opt_type"]` and the current
            learning rate `cur_lr`.
        config: Config supplying `decay`, `momentum` and `epsilon` for
            RMSProp.

    Returns:
        An Adam optimizer when `opt_type` is "adam"; an RMSProp optimizer
        for any other value.
    """
    if policy.config["opt_type"] != "adam":
        return tf.train.RMSPropOptimizer(policy.cur_lr, config["decay"],
                                         config["momentum"],
                                         config["epsilon"])
    return tf.train.AdamOptimizer(policy.cur_lr)
|
||||
|
||||
|
||||
def clip_gradients(policy, optimizer, loss):
    """Compute gradients of `loss` clipped by global norm.

    The clipped gradients are stored on `policy.grads`.

    Arguments:
        policy: Policy providing `var_list` and `config["grad_clip"]`.
        optimizer: Unused.
        loss: Loss tensor to differentiate.

    Returns:
        A list of (clipped gradient, variable) pairs.
    """
    raw_grads = tf.gradients(loss, policy.var_list)
    max_norm = policy.config["grad_clip"]
    policy.grads, _ = tf.clip_by_global_norm(raw_grads, max_norm)
    return list(zip(policy.grads, policy.var_list))
|
||||
|
||||
|
||||
class ValueNetworkMixin(object):
    """Mixin exposing the model's value function on the policy."""

    def __init__(self):
        """Store the value-function tensor and the trainable variables."""
        self.value_function = self.model.value_function()
        scope_name = tf.get_variable_scope().name
        self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                          scope_name)

    def value(self, ob, *args):
        """Evaluate the value function for a single observation.

        Arguments:
            ob: One observation; it is fed as a batch of size one.
            *args: One value per entry of `self.state_in`, in order.

        Returns:
            The first element of the evaluated value function.
        """
        feed_dict = {
            self.get_placeholder(SampleBatch.CUR_OBS): [ob],
            self.seq_lens: [1]
        }
        assert len(args) == len(self.state_in), \
            (args, self.state_in)
        feed_dict.update(zip(self.state_in, args))
        value_estimates = self.get_session().run(self.value_function,
                                                 feed_dict)
        return value_estimates[0]
|
||||
|
||||
|
||||
def setup_mixins(policy, obs_space, action_space, config):
    """Runs the mixin initializers on ``policy``.

    The learning-rate schedule, entropy-coefficient schedule and value
    network mixins are initialized in that order. ``obs_space`` and
    ``action_space`` are not used.
    """
    lr, lr_schedule = config["lr"], config["lr_schedule"]
    LearningRateSchedule.__init__(policy, lr, lr_schedule)

    coeff = config["entropy_coeff"]
    coeff_schedule = config["entropy_coeff_schedule"]
    EntropyCoeffSchedule.__init__(policy, coeff, coeff_schedule)

    ValueNetworkMixin.__init__(policy)
|
||||
|
||||
|
||||
# Policy class assembled from the functions and mixins defined in this module.
VTraceTFPolicy = build_tf_policy(
    name="VTraceTFPolicy",
    get_default_config=lambda: ray.rllib.agents.impala.impala.DEFAULT_CONFIG,
    # Loss and reporting.
    loss_fn=build_vtrace_loss,
    stats_fn=stats,
    grad_stats_fn=grad_stats,
    # Sample handling.
    postprocess_fn=postprocess_trajectory,
    extra_action_fetches_fn=add_behaviour_logits,
    # Optimization.
    optimizer_fn=choose_optimizer,
    gradients_fn=clip_gradients,
    # Construction hooks and mixins.
    before_init=validate_config,
    before_loss_init=setup_mixins,
    mixins=[LearningRateSchedule, EntropyCoeffSchedule, ValueNetworkMixin],
    get_batch_divisibility_req=lambda p: p.config["sample_batch_size"])
|
||||
@@ -1,270 +0,0 @@
|
||||
# Copyright 2018 Google LLC
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# https://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Tests for V-trace.
|
||||
|
||||
For details and theory see:
|
||||
|
||||
"IMPALA: Scalable Distributed Deep-RL with
|
||||
Importance Weighted Actor-Learner Architectures"
|
||||
by Espeholt, Soyer, Munos et al.
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from absl.testing import parameterized
|
||||
import numpy as np
|
||||
import vtrace
|
||||
from ray.rllib.utils import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
|
||||
|
||||
def _shaped_arange(*shape):
|
||||
"""Runs np.arange, converts to float and reshapes."""
|
||||
return np.arange(np.prod(shape), dtype=np.float32).reshape(*shape)
|
||||
|
||||
|
||||
def _softmax(logits):
|
||||
"""Applies softmax non-linearity on inputs."""
|
||||
return np.exp(logits) / np.sum(np.exp(logits), axis=-1, keepdims=True)
|
||||
|
||||
|
||||
def _ground_truth_calculation(discounts, log_rhos, rewards, values,
                              bootstrap_value, clip_rho_threshold,
                              clip_pg_rho_threshold):
    """Calculates the ground truth for V-trace in Python/Numpy.

    Args:
        discounts: Per-step discounts, indexed by time along axis 0.
        log_rhos: Log importance weights, same leading axis as `discounts`.
        rewards: Per-step rewards.
        values: Per-step value estimates.
        bootstrap_value: Value estimate appended after the last time step.
        clip_rho_threshold: Upper bound applied to the importance weights
            used for the value targets, or None for no clipping.
        clip_pg_rho_threshold: Upper bound applied to the importance weights
            used for the policy-gradient advantages, or None for no clipping.

    Returns:
        `vtrace.VTraceReturns` holding `vs` and `pg_advantages`.
    """
    vs = []
    seq_len = len(discounts)
    rhos = np.exp(log_rhos)
    cs = np.minimum(rhos, 1.0)
    # Compare against None explicitly so that a threshold of 0.0 still clips.
    clipped_rhos = rhos
    if clip_rho_threshold is not None:
        clipped_rhos = np.minimum(rhos, clip_rho_threshold)
    clipped_pg_rhos = rhos
    if clip_pg_rho_threshold is not None:
        clipped_pg_rhos = np.minimum(rhos, clip_pg_rho_threshold)

    # This is a very inefficient way to calculate the V-trace ground truth.
    # We calculate it this way because it is close to the mathematical notation
    # of
    # V-trace.
    # v_s = V(x_s)
    #       + \sum^{T-1}_{t=s} \gamma^{t-s}
    #         * \prod_{i=s}^{t-1} c_i
    #         * \rho_t (r_t + \gamma V(x_{t+1}) - V(x_t))
    # Note that when we take the product over c_i, we write `s:t` as the
    # notation
    # of the paper is inclusive of the `t-1`, but Python is exclusive.
    # Also note that np.prod([]) == 1.
    values_t_plus_1 = np.concatenate(
        [values, bootstrap_value[None, :]], axis=0)
    for s in range(seq_len):
        v_s = np.copy(values[s])  # Very important copy.
        for t in range(s, seq_len):
            v_s += (np.prod(discounts[s:t], axis=0) * np.prod(cs[s:t], axis=0)
                    * clipped_rhos[t] * (rewards[t] + discounts[t] *
                                         values_t_plus_1[t + 1] - values[t]))
        vs.append(v_s)
    vs = np.stack(vs, axis=0)
    pg_advantages = (clipped_pg_rhos * (rewards + discounts * np.concatenate(
        [vs[1:], bootstrap_value[None, :]], axis=0) - values))

    return vtrace.VTraceReturns(vs=vs, pg_advantages=pg_advantages)
|
||||
|
||||
|
||||
class LogProbsFromLogitsAndActionsTest(tf.test.TestCase,
                                       parameterized.TestCase):
    """Checks `vtrace.log_probs_from_logits_and_actions` against NumPy."""

    @parameterized.named_parameters(("Batch1", 1), ("Batch2", 2))
    def test_log_probs_from_logits_and_actions(self, batch_size):
        """Tests log_probs_from_logits_and_actions."""
        seq_len = 7
        num_actions = 3

        policy_logits = _shaped_arange(seq_len, batch_size, num_actions) + 10
        # np.random.randint excludes the upper bound, so `num_actions` (not
        # `num_actions - 1`) is needed for the last action to be sampled.
        actions = np.random.randint(
            0, num_actions, size=(seq_len, batch_size), dtype=np.int32)

        action_log_probs_tensor = vtrace.log_probs_from_logits_and_actions(
            policy_logits, actions)

        # Ground Truth
        # Using broadcasting to create a mask that indexes action logits
        action_index_mask = actions[..., None] == np.arange(num_actions)

        def index_with_mask(array, mask):
            # Exactly one entry per row is selected, so the last axis drops.
            return array[mask].reshape(*array.shape[:-1])

        # Note: Normally log(softmax) is not a good idea because it's not
        # numerically stable. However, in this test we have well-behaved
        # values.
        ground_truth_v = index_with_mask(
            np.log(_softmax(policy_logits)), action_index_mask)

        with self.test_session() as session:
            self.assertAllClose(ground_truth_v,
                                session.run(action_log_probs_tensor))
|
||||
|
||||
|
||||
class VtraceTest(tf.test.TestCase, parameterized.TestCase):
    """Tests for the V-trace computations in the `vtrace` module."""

    @parameterized.named_parameters(("Batch1", 1), ("Batch5", 5))
    def test_vtrace(self, batch_size):
        """Tests V-trace against ground truth data calculated in python."""
        seq_len = 5

        # Create log_rhos such that rho will span from near-zero to above the
        # clipping thresholds. In particular, calculate log_rhos in
        # [-2.5, 2.5),
        # so that rho is in approx [0.08, 12.2).
        log_rhos = _shaped_arange(seq_len, batch_size) / (batch_size * seq_len)
        log_rhos = 5 * (log_rhos - 0.5)  # [0.0, 1.0) -> [-2.5, 2.5).
        values = {
            "log_rhos": log_rhos,
            # T, B where B_i: [0.9 / (i+1)] * T
            "discounts": np.array([[0.9 / (b + 1) for b in range(batch_size)]
                                   for _ in range(seq_len)]),
            "rewards": _shaped_arange(seq_len, batch_size),
            "values": _shaped_arange(seq_len, batch_size) / batch_size,
            "bootstrap_value": _shaped_arange(batch_size) + 1.0,
            "clip_rho_threshold": 3.7,
            "clip_pg_rho_threshold": 2.2,
        }

        output = vtrace.from_importance_weights(**values)

        with self.test_session() as session:
            output_v = session.run(output)

        ground_truth_v = _ground_truth_calculation(**values)
        for a, b in zip(ground_truth_v, output_v):
            self.assertAllClose(a, b)

    @parameterized.named_parameters(("Batch1", 1), ("Batch2", 2))
    def test_vtrace_from_logits(self, batch_size):
        """Tests V-trace calculated from logits."""
        seq_len = 5
        num_actions = 3
        clip_rho_threshold = None  # No clipping.
        clip_pg_rho_threshold = None  # No clipping.

        # Intentionally leaving shapes unspecified to test if V-trace can
        # deal with that.
        placeholders = {
            # T, B, NUM_ACTIONS
            "behaviour_policy_logits": tf.placeholder(
                dtype=tf.float32, shape=[None, None, None]),
            # T, B, NUM_ACTIONS
            "target_policy_logits": tf.placeholder(
                dtype=tf.float32, shape=[None, None, None]),
            "actions": tf.placeholder(dtype=tf.int32, shape=[None, None]),
            "discounts": tf.placeholder(dtype=tf.float32, shape=[None, None]),
            "rewards": tf.placeholder(dtype=tf.float32, shape=[None, None]),
            "values": tf.placeholder(dtype=tf.float32, shape=[None, None]),
            "bootstrap_value": tf.placeholder(dtype=tf.float32, shape=[None]),
        }

        from_logits_output = vtrace.from_logits(
            clip_rho_threshold=clip_rho_threshold,
            clip_pg_rho_threshold=clip_pg_rho_threshold,
            **placeholders)

        target_log_probs = vtrace.log_probs_from_logits_and_actions(
            placeholders["target_policy_logits"], placeholders["actions"])
        behaviour_log_probs = vtrace.log_probs_from_logits_and_actions(
            placeholders["behaviour_policy_logits"], placeholders["actions"])
        log_rhos = target_log_probs - behaviour_log_probs
        ground_truth = (log_rhos, behaviour_log_probs, target_log_probs)

        values = {
            "behaviour_policy_logits": _shaped_arange(seq_len, batch_size,
                                                      num_actions),
            "target_policy_logits": _shaped_arange(seq_len, batch_size,
                                                   num_actions),
            # np.random.randint excludes the upper bound, so `num_actions`
            # (not `num_actions - 1`) is needed to sample every action.
            "actions": np.random.randint(
                0, num_actions, size=(seq_len, batch_size)),
            "discounts": np.array(  # T, B where B_i: [0.9 / (i+1)] * T
                [[0.9 / (b + 1) for b in range(batch_size)]
                 for _ in range(seq_len)]),
            "rewards": _shaped_arange(seq_len, batch_size),
            "values": _shaped_arange(seq_len, batch_size) / batch_size,
            "bootstrap_value": _shaped_arange(batch_size) + 1.0,  # B
        }

        feed_dict = {placeholders[k]: v for k, v in values.items()}
        with self.test_session() as session:
            from_logits_output_v = session.run(
                from_logits_output, feed_dict=feed_dict)
            (ground_truth_log_rhos, ground_truth_behaviour_action_log_probs,
             ground_truth_target_action_log_probs) = session.run(
                 ground_truth, feed_dict=feed_dict)

        # Calculate V-trace using the ground truth logits.
        from_iw = vtrace.from_importance_weights(
            log_rhos=ground_truth_log_rhos,
            discounts=values["discounts"],
            rewards=values["rewards"],
            values=values["values"],
            bootstrap_value=values["bootstrap_value"],
            clip_rho_threshold=clip_rho_threshold,
            clip_pg_rho_threshold=clip_pg_rho_threshold)

        with self.test_session() as session:
            from_iw_v = session.run(from_iw)

        self.assertAllClose(from_iw_v.vs, from_logits_output_v.vs)
        self.assertAllClose(from_iw_v.pg_advantages,
                            from_logits_output_v.pg_advantages)
        self.assertAllClose(ground_truth_behaviour_action_log_probs,
                            from_logits_output_v.behaviour_action_log_probs)
        self.assertAllClose(ground_truth_target_action_log_probs,
                            from_logits_output_v.target_action_log_probs)
        self.assertAllClose(ground_truth_log_rhos,
                            from_logits_output_v.log_rhos)

    def test_higher_rank_inputs_for_importance_weights(self):
        """Checks support for additional dimensions in inputs."""
        placeholders = {
            "log_rhos": tf.placeholder(
                dtype=tf.float32, shape=[None, None, 1]),
            "discounts": tf.placeholder(
                dtype=tf.float32, shape=[None, None, 1]),
            "rewards": tf.placeholder(
                dtype=tf.float32, shape=[None, None, 42]),
            "values": tf.placeholder(dtype=tf.float32, shape=[None, None, 42]),
            "bootstrap_value": tf.placeholder(
                dtype=tf.float32, shape=[None, 42])
        }
        output = vtrace.from_importance_weights(**placeholders)
        self.assertEqual(output.vs.shape.as_list()[-1], 42)

    def test_inconsistent_rank_inputs_for_importance_weights(self):
        """Test one of many possible errors in shape of inputs."""
        placeholders = {
            "log_rhos": tf.placeholder(
                dtype=tf.float32, shape=[None, None, 1]),
            "discounts": tf.placeholder(
                dtype=tf.float32, shape=[None, None, 1]),
            "rewards": tf.placeholder(
                dtype=tf.float32, shape=[None, None, 42]),
            "values": tf.placeholder(dtype=tf.float32, shape=[None, None, 42]),
            # Should be [None, 42].
            "bootstrap_value": tf.placeholder(dtype=tf.float32, shape=[None])
        }
        with self.assertRaisesRegexp(ValueError, "must have rank 2"):
            vtrace.from_importance_weights(**placeholders)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
tf.test.main()
|
||||
@@ -1,7 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from ray.rllib.agents.marwil.marwil import MARWILTrainer, DEFAULT_CONFIG
|
||||
|
||||
__all__ = ["MARWILTrainer", "DEFAULT_CONFIG"]
|
||||
@@ -1,55 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from ray.rllib.agents.trainer import with_common_config
|
||||
from ray.rllib.agents.trainer_template import build_trainer
|
||||
from ray.rllib.agents.marwil.marwil_policy import MARWILPolicy
|
||||
from ray.rllib.optimizers import SyncBatchReplayOptimizer
|
||||
|
||||
# yapf: disable
|
||||
# __sphinx_doc_begin__
|
||||
DEFAULT_CONFIG = with_common_config({
    # You should override this to point to an offline dataset (see the
    # common config in ray.rllib.agents.trainer).
    "input": "sampler",
    # Use importance sampling estimators for reward
    "input_evaluation": ["is", "wis"],

    # Scaling of advantages in the exponential term.
    # When beta is 0, MARWIL is reduced to imitation learning
    "beta": 1.0,
    # Weight of the value estimation loss relative to the policy loss
    "vf_coeff": 1.0,
    # Whether to calculate cumulative rewards
    "postprocess_inputs": True,
    # Whether to rollout "complete_episodes" or "truncate_episodes"
    "batch_mode": "complete_episodes",
    # Learning rate for adam optimizer
    "lr": 1e-4,
    # Number of timesteps collected for each SGD round
    "train_batch_size": 2000,
    # Maximum number of steps to keep in the batch replay buffer
    "replay_buffer_size": 100000,
    # Number of steps to read before learning starts
    "learning_starts": 0,
    # === Parallelism ===
    "num_workers": 0,
})
|
||||
# __sphinx_doc_end__
|
||||
# yapf: enable
|
||||
|
||||
|
||||
def make_optimizer(workers, config):
    """Builds the batch-replay optimizer from the trainer config.

    Args:
        workers: Passed through as the optimizer's first argument.
        config: Dict providing ``"learning_starts"``,
            ``"replay_buffer_size"`` and ``"train_batch_size"``.

    Returns:
        A ``SyncBatchReplayOptimizer`` instance.
    """
    replay_settings = {
        "learning_starts": config["learning_starts"],
        "buffer_size": config["replay_buffer_size"],
        "train_batch_size": config["train_batch_size"],
    }
    return SyncBatchReplayOptimizer(workers, **replay_settings)
|
||||
|
||||
|
||||
# Trainer class combining the MARWIL policy with the replay optimizer above.
MARWILTrainer = build_trainer(
    name="MARWIL",
    default_config=DEFAULT_CONFIG,
    default_policy=MARWILPolicy,
    make_policy_optimizer=make_optimizer,
)
|
||||
@@ -1,175 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import ray
|
||||
from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.evaluation.postprocessing import compute_advantages, \
|
||||
Postprocessing
|
||||
from ray.rllib.policy.sample_batch import SampleBatch
|
||||
from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY
|
||||
from ray.rllib.utils.annotations import override
|
||||
from ray.rllib.policy.policy import Policy
|
||||
from ray.rllib.policy.tf_policy import TFPolicy
|
||||
from ray.rllib.utils.explained_variance import explained_variance
|
||||
from ray.rllib.utils import try_import_tf
|
||||
from ray.rllib.utils.tf_ops import scope_vars
|
||||
|
||||
tf = try_import_tf()
|
||||
|
||||
POLICY_SCOPE = "p_func"
|
||||
VALUE_SCOPE = "v_func"
|
||||
|
||||
|
||||
class ValueLoss(object):
    """Half the mean squared error between values and cumulative rewards."""

    def __init__(self, state_values, cumulative_rewards):
        squared_error = tf.square(state_values - cumulative_rewards)
        self.loss = 0.5 * tf.reduce_mean(squared_error)
|
||||
|
||||
|
||||
class ReweightedImitationLoss(object):
    """Imitation loss with exponentially weighted advantages.

    The action log-likelihoods are weighted by
    ``exp(beta * adv / sqrt(moving average of squared advantages))``; the
    weights are treated as constants when differentiating.
    """

    def __init__(self, state_values, cumulative_rewards, logits, actions,
                 action_space, beta):
        # Non-trainable running estimate of the squared advantage norm.
        ma_adv_norm = tf.get_variable(
            name="moving_average_of_advantage_norm",
            dtype=tf.float32,
            initializer=100.0,
            trainable=False)
        # advantage estimation
        adv = cumulative_rewards - state_values
        # update averaged advantage norm
        mean_sq_adv = tf.reduce_mean(tf.square(adv))
        norm_delta = 1e-6 * (mean_sq_adv - ma_adv_norm)
        update_adv_norm = tf.assign_add(ref=ma_adv_norm, value=norm_delta)

        # exponentially weighted advantages
        with tf.control_dependencies([update_adv_norm]):
            adv_scale = 1e-8 + tf.sqrt(ma_adv_norm)
            scaled_adv = beta * tf.divide(adv, adv_scale)
            exp_advs = tf.exp(scaled_adv)

        # log\pi_\theta(a|s)
        dist_cls, _ = ModelCatalog.get_action_dist(action_space, {})
        action_dist = dist_cls(logits)
        logprobs = action_dist.logp(actions)

        weighted_logprobs = tf.stop_gradient(exp_advs) * logprobs
        self.loss = -1.0 * tf.reduce_mean(weighted_logprobs)
|
||||
|
||||
|
||||
class MARWILPostprocessing(object):
    """Adds the advantages field to the trajectory."""

    @override(Policy)
    def postprocess_trajectory(self,
                               sample_batch,
                               other_agent_batches=None,
                               episode=None):
        """Computes advantages without GAE for a completed episode batch.

        Raises:
            NotImplementedError: if the last done flag of the batch is not
                truthy.
        """
        completed = sample_batch["dones"][-1]
        if not completed:
            raise NotImplementedError(
                "last done mask in a batch should be True. "
                "For now, we only support reading experience batches produced "
                "with batch_mode='complete_episodes'.",
                len(sample_batch[SampleBatch.DONES]),
                sample_batch[SampleBatch.DONES][-1])
        # The batch ends with a finished episode, so the bootstrap is zero.
        last_r = 0.0
        return compute_advantages(
            sample_batch, last_r, gamma=self.config["gamma"], use_gae=False)
|
||||
|
||||
|
||||
class MARWILPolicy(MARWILPostprocessing, TFPolicy):
    """TF policy optimizing the MARWIL objective.

    The objective is the reweighted imitation loss plus ``vf_coeff`` times
    the value-function loss.
    """

    def __init__(self, observation_space, action_space, config):
        """Builds the model, losses and stats fetches for the policy.

        Args:
            observation_space: Observation space; its ``shape`` sizes the
                observation placeholder.
            action_space: Action space used to pick the action distribution
                and action placeholders.
            config: Dict of overrides applied on top of the MARWIL defaults.
        """
        # Merge with MARWIL's own defaults (not another algorithm's) so that
        # keys such as "beta" and "vf_coeff" are always present.
        config = dict(ray.rllib.agents.marwil.marwil.DEFAULT_CONFIG, **config)
        self.config = config

        dist_cls, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])

        # Action inputs
        self.obs_t = tf.placeholder(
            tf.float32, shape=(None, ) + observation_space.shape)
        prev_actions_ph = ModelCatalog.get_action_placeholder(action_space)
        prev_rewards_ph = tf.placeholder(
            tf.float32, [None], name="prev_reward")

        with tf.variable_scope(POLICY_SCOPE) as scope:
            self.model = ModelCatalog.get_model({
                "obs": self.obs_t,
                "prev_actions": prev_actions_ph,
                "prev_rewards": prev_rewards_ph,
                "is_training": self._get_is_training_placeholder(),
            }, observation_space, action_space, logit_dim,
                                                self.config["model"])
            logits = self.model.outputs
            self.p_func_vars = scope_vars(scope.name)

        # Action outputs
        action_dist = dist_cls(logits)
        self.output_actions = action_dist.sample()

        # Training inputs
        self.act_t = ModelCatalog.get_action_placeholder(action_space)
        self.cum_rew_t = tf.placeholder(tf.float32, [None], name="reward")

        # v network evaluation
        with tf.variable_scope(VALUE_SCOPE) as scope:
            state_values = self.model.value_function()
            self.v_func_vars = scope_vars(scope.name)
        self.v_loss = self._build_value_loss(state_values, self.cum_rew_t)
        self.p_loss = self._build_policy_loss(state_values, self.cum_rew_t,
                                              logits, self.act_t, action_space)

        # which kind of objective to optimize
        objective = (
            self.p_loss.loss + self.config["vf_coeff"] * self.v_loss.loss)
        self.explained_variance = tf.reduce_mean(
            explained_variance(self.cum_rew_t, state_values))

        # initialize TFPolicy
        self.sess = tf.get_default_session()
        self.loss_inputs = [
            (SampleBatch.CUR_OBS, self.obs_t),
            (SampleBatch.ACTIONS, self.act_t),
            (Postprocessing.ADVANTAGES, self.cum_rew_t),
        ]
        TFPolicy.__init__(
            self,
            observation_space,
            action_space,
            self.sess,
            obs_input=self.obs_t,
            action_sampler=self.output_actions,
            action_prob=action_dist.sampled_action_prob(),
            loss=objective,
            model=self.model,
            loss_inputs=self.loss_inputs,
            state_inputs=self.model.state_in,
            state_outputs=self.model.state_out,
            prev_action_input=prev_actions_ph,
            prev_reward_input=prev_rewards_ph)
        self.sess.run(tf.global_variables_initializer())

        self.stats_fetches = {
            "total_loss": objective,
            "vf_explained_var": self.explained_variance,
            "policy_loss": self.p_loss.loss,
            "vf_loss": self.v_loss.loss
        }

    def _build_value_loss(self, state_values, cum_rwds):
        """Returns the value-function loss object."""
        return ValueLoss(state_values, cum_rwds)

    def _build_policy_loss(self, state_values, cum_rwds, logits, actions,
                           action_space):
        """Returns the policy loss object, scaled by ``config["beta"]``."""
        return ReweightedImitationLoss(state_values, cum_rwds, logits, actions,
                                       action_space, self.config["beta"])

    @override(TFPolicy)
    def extra_compute_grad_fetches(self):
        """Returns the stats fetches keyed by ``LEARNER_STATS_KEY``."""
        return {LEARNER_STATS_KEY: self.stats_fetches}

    @override(Policy)
    def get_initial_state(self):
        """Returns the model's initial state."""
        return self.model.state_init
|
||||
@@ -1,128 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import os
|
||||
import pickle
|
||||
import numpy as np
|
||||
|
||||
from ray.tune import result as tune_result
|
||||
from ray.rllib.agents.trainer import Trainer, with_common_config
|
||||
|
||||
|
||||
class _MockTrainer(Trainer):
    """Mock trainer for use in tests"""

    _name = "MockTrainer"
    _default_config = with_common_config({
        "mock_error": False,
        "persistent_error": False,
        "test_variable": 1,
        "num_workers": 0,
        "user_checkpoint_freq": 0,
    })

    @classmethod
    def default_resource_request(cls, config):
        """Returns None, i.e. no resource request is specified."""
        return None

    def _init(self, config, env_creator):
        """Resets the stored info and the restored flag."""
        self.info = None
        self.restored = False

    def _train(self):
        """Returns a fixed result, optionally raising a mock error.

        The error is raised on iteration 1 when "mock_error" is set and
        either "persistent_error" is set or the trainer was not restored.
        """
        should_fail = (
            self.config["mock_error"] and self.iteration == 1
            and (self.config["persistent_error"] or not self.restored))
        if should_fail:
            raise Exception("mock error")

        result = {
            "episode_reward_mean": 10,
            "episode_len_mean": 10,
            "timesteps_this_iter": 10,
            "info": {},
        }
        # Request a checkpoint every `user_checkpoint_freq` iterations.
        if (self.config["user_checkpoint_freq"] > 0 and self.iteration > 0
                and self.iteration % self.config["user_checkpoint_freq"] == 0):
            result[tune_result.SHOULD_CHECKPOINT] = True
        return result

    def _save(self, checkpoint_dir):
        """Pickles ``self.info`` into the directory; returns the file path."""
        path = os.path.join(checkpoint_dir, "mock_agent.pkl")
        with open(path, "wb") as out_file:
            pickle.dump(self.info, out_file)
        return path

    def _restore(self, checkpoint_path):
        """Loads ``self.info`` from the pickle file and marks as restored."""
        with open(checkpoint_path, "rb") as in_file:
            self.info = pickle.load(in_file)
        self.restored = True

    def _register_if_needed(self, env_object):
        """Does nothing in the mock trainer."""
        pass

    def set_info(self, info):
        """Stores ``info`` and returns it."""
        self.info = info
        return info

    def get_info(self):
        """Returns the stored info."""
        return self.info
|
||||
|
||||
|
||||
class _SigmoidFakeData(_MockTrainer):
    """Trainer that returns sigmoid learning curves.

    This can be helpful for evaluating early stopping algorithms."""

    _name = "SigmoidFakeData"
    _default_config = with_common_config({
        "width": 100,
        "height": 100,
        "offset": 0,
        "iter_time": 10,
        "iter_timesteps": 1,
        "num_workers": 0,
    })

    def _train(self):
        """Returns a result whose reward follows a scaled tanh curve."""
        # Iterations before `offset` count as step zero.
        step = max(0, self.iteration - self.config["offset"])
        curve = np.tanh(float(step) / self.config["width"])
        scaled = curve * self.config["height"]
        return {
            "episode_reward_mean": scaled,
            "episode_len_mean": scaled,
            "timesteps_this_iter": self.config["iter_timesteps"],
            "time_this_iter_s": self.config["iter_time"],
            "info": {},
        }
|
||||
|
||||
|
||||
class _ParameterTuningTrainer(_MockTrainer):
    """Mock trainer whose reward grows linearly with the iteration."""

    _name = "ParameterTuningTrainer"
    _default_config = with_common_config({
        "reward_amt": 10,
        "dummy_param": 10,
        "dummy_param2": 15,
        "iter_time": 10,
        "iter_timesteps": 1,
        "num_workers": 0,
    })

    def _train(self):
        """Returns a result with reward ``reward_amt * iteration``."""
        return {
            "episode_reward_mean": self.config["reward_amt"] * self.iteration,
            "episode_len_mean": self.config["reward_amt"],
            "timesteps_this_iter": self.config["iter_timesteps"],
            "time_this_iter_s": self.config["iter_time"],
            "info": {},
        }
|
||||
|
||||
|
||||
def _agent_import_failed(trace):
    """Returns dummy agent class for if PyTorch etc. is not installed.

    Args:
        trace: Value passed to the ``ImportError`` raised by the dummy
            class's ``_setup``.
    """

    class _AgentImportFailed(Trainer):
        """Placeholder trainer that fails as soon as it is set up."""

        _name = "AgentImportFailed"
        _default_config = with_common_config({})

        def _setup(self, config):
            # Surface the original import failure to the caller.
            raise ImportError(trace)

    return _AgentImportFailed
|
||||
@@ -1,6 +0,0 @@
|
||||
from ray.rllib.agents.pg.pg import PGTrainer, DEFAULT_CONFIG
|
||||
from ray.rllib.utils import renamed_agent
|
||||
|
||||
PGAgent = renamed_agent(PGTrainer)
|
||||
|
||||
__all__ = ["PGAgent", "PGTrainer", "DEFAULT_CONFIG"]
|
||||
@@ -1,35 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from ray.rllib.agents.trainer import with_common_config
|
||||
from ray.rllib.agents.trainer_template import build_trainer
|
||||
from ray.rllib.agents.pg.pg_policy import PGTFPolicy
|
||||
|
||||
# yapf: disable
|
||||
# __sphinx_doc_begin__
|
||||
DEFAULT_CONFIG = with_common_config({
    # No remote workers by default
    "num_workers": 0,
    # Learning rate
    "lr": 0.0004,
    # Use PyTorch as backend (selects the torch policy class when True)
    "use_pytorch": False,
})
|
||||
# __sphinx_doc_end__
|
||||
# yapf: enable
|
||||
|
||||
|
||||
def get_policy_class(config):
    """Picks the policy class according to ``config["use_pytorch"]``.

    The torch policy module is imported only when it is requested.
    """
    if not config["use_pytorch"]:
        return PGTFPolicy
    from ray.rllib.agents.pg.torch_pg_policy import PGTorchPolicy
    return PGTorchPolicy
|
||||
|
||||
|
||||
# Trainer class for PG; the policy class is chosen by `get_policy_class`.
PGTrainer = build_trainer(
    name="PG",
    default_config=DEFAULT_CONFIG,
    default_policy=PGTFPolicy,
    get_policy_class=get_policy_class,
)
|
||||
@@ -1,35 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import ray
|
||||
from ray.rllib.evaluation.postprocessing import compute_advantages, \
|
||||
Postprocessing
|
||||
from ray.rllib.policy.tf_policy_template import build_tf_policy
|
||||
from ray.rllib.policy.sample_batch import SampleBatch
|
||||
from ray.rllib.utils import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
|
||||
|
||||
# The basic policy gradients loss
|
||||
def policy_gradient_loss(policy, batch_tensors):
    """Returns the negated mean of action log-probs times advantages.

    Args:
        policy: Policy providing ``action_dist``.
        batch_tensors: Mapping holding the actions and advantages columns.
    """
    actions = batch_tensors[SampleBatch.ACTIONS]
    advantages = batch_tensors[Postprocessing.ADVANTAGES]
    weighted_logp = policy.action_dist.logp(actions) * advantages
    return -tf.reduce_mean(weighted_logp)
|
||||
|
||||
|
||||
# This adds the "advantages" column to the sample batch.
|
||||
def postprocess_advantages(policy,
                           sample_batch,
                           other_agent_batches=None,
                           episode=None):
    """Runs ``compute_advantages`` on the batch without GAE.

    A last reward of 0.0 and the discount ``policy.config["gamma"]`` are
    used. ``other_agent_batches`` and ``episode`` are not used.
    """
    last_reward = 0.0
    discount = policy.config["gamma"]
    return compute_advantages(
        sample_batch, last_reward, discount, use_gae=False)
|
||||
|
||||
|
||||
# TF policy class for PG, built from the loss and postprocessing above.
PGTFPolicy = build_tf_policy(
    name="PGTFPolicy",
    get_default_config=lambda: ray.rllib.agents.pg.pg.DEFAULT_CONFIG,
    loss_fn=policy_gradient_loss,
    postprocess_fn=postprocess_advantages,
)
|
||||
@@ -1,42 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import ray
|
||||
from ray.rllib.evaluation.postprocessing import compute_advantages, \
|
||||
Postprocessing
|
||||
from ray.rllib.policy.sample_batch import SampleBatch
|
||||
from ray.rllib.policy.torch_policy_template import build_torch_policy
|
||||
|
||||
|
||||
def pg_torch_loss(policy, batch_tensors):
    """Computes the policy-gradient loss and stores it on the policy.

    The loss is the negated dot product of the advantages with the
    flattened action log-probabilities. It is also saved as
    ``policy.pi_err``.
    """
    model_input = {SampleBatch.CUR_OBS: batch_tensors[SampleBatch.CUR_OBS]}
    logits, _ = policy.model(model_input)
    action_dist = policy.dist_class(logits)
    log_probs = action_dist.logp(batch_tensors[SampleBatch.ACTIONS])
    # save the error in the policy object
    advantages = batch_tensors[Postprocessing.ADVANTAGES]
    flat_log_probs = log_probs.reshape(-1)
    policy.pi_err = -advantages.dot(flat_log_probs)
    return policy.pi_err
|
||||
|
||||
|
||||
def postprocess_advantages(policy,
                           sample_batch,
                           other_agent_batches=None,
                           episode=None):
    """Runs ``compute_advantages`` on the batch without GAE.

    A last reward of 0.0 and the discount ``policy.config["gamma"]`` are
    used. ``other_agent_batches`` and ``episode`` are not used.
    """
    last_reward = 0.0
    discount = policy.config["gamma"]
    return compute_advantages(
        sample_batch, last_reward, discount, use_gae=False)
|
||||
|
||||
|
||||
def pg_loss_stats(policy, batch_tensors):
    """Reports the loss saved on the policy as a plain Python number.

    ``batch_tensors`` is not used.
    """
    # the error is recorded when computing the loss
    loss_value = policy.pi_err.item()
    return {"policy_loss": loss_value}
|
||||
|
||||
|
||||
# Torch policy class for PG. The defaults come from PG's own config, matching
# the TF policy, rather than from another algorithm's config.
PGTorchPolicy = build_torch_policy(
    name="PGTorchPolicy",
    get_default_config=lambda: ray.rllib.agents.pg.pg.DEFAULT_CONFIG,
    loss_fn=pg_torch_loss,
    stats_fn=pg_loss_stats,
    postprocess_fn=postprocess_advantages)
|
||||
@@ -1,7 +0,0 @@
|
||||
from ray.rllib.agents.ppo.ppo import PPOTrainer, DEFAULT_CONFIG
|
||||
from ray.rllib.agents.ppo.appo import APPOTrainer
|
||||
from ray.rllib.utils import renamed_agent
|
||||
|
||||
PPOAgent = renamed_agent(PPOTrainer)
|
||||
|
||||
__all__ = ["PPOAgent", "APPOTrainer", "PPOTrainer", "DEFAULT_CONFIG"]
|
||||
@@ -1,91 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from ray.rllib.agents.ppo.appo_policy import AsyncPPOTFPolicy
|
||||
from ray.rllib.agents.trainer import with_base_config
|
||||
from ray.rllib.agents.ppo.ppo import update_kl
|
||||
from ray.rllib.agents import impala
|
||||
|
||||
# yapf: disable
|
||||
# __sphinx_doc_begin__
|
||||
DEFAULT_CONFIG = with_base_config(impala.DEFAULT_CONFIG, {
    # Whether to use V-trace weighted advantages. If false, PPO GAE advantages
    # will be used instead.
    "vtrace": False,

    # == These two options only apply if vtrace: False ==
    # If true, use the Generalized Advantage Estimator (GAE)
    # with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
    "use_gae": True,
    # GAE(lambda) parameter
    "lambda": 1.0,

    # == PPO surrogate loss options ==
    "clip_param": 0.4,

    # == PPO KL Loss options ==
    "use_kl_loss": False,
    "kl_coeff": 1.0,
    "kl_target": 0.01,

    # == IMPALA optimizer params (see documentation in impala.py) ==
    "sample_batch_size": 50,
    "train_batch_size": 500,
    "min_iter_time_s": 10,
    "num_workers": 2,
    "num_gpus": 0,
    "num_data_loader_buffers": 1,
    "minibatch_buffer_size": 1,
    "num_sgd_iter": 1,
    "replay_proportion": 0.0,
    "replay_buffer_num_slots": 100,
    "learner_queue_size": 16,
    "learner_queue_timeout": 300,
    "max_sample_requests_in_flight_per_worker": 2,
    "broadcast_interval": 1,
    "grad_clip": 40.0,
    "opt_type": "adam",
    "lr": 0.0005,
    "lr_schedule": None,
    "decay": 0.99,
    "momentum": 0.0,
    "epsilon": 0.1,
    "vf_loss_coeff": 0.5,
    "entropy_coeff": 0.01,
    "entropy_coeff_schedule": None,
})
|
||||
# __sphinx_doc_end__
|
||||
# yapf: enable
|
||||
|
||||
|
||||
def update_target_and_kl(trainer, fetches):
    """Refresh the target network and, if enabled, the KL coefficient.

    Nothing happens until the learner's step counter has reached
    ``trainer.target_update_frequency``. Once it has, the counter is reset
    to zero, ``update_target()`` is called on every trainable policy of the
    local worker and, when ``use_kl_loss`` is set in the trainer config,
    ``update_kl`` is invoked with the learner's stats.

    Arguments:
        trainer: Trainer whose optimizer exposes a ``learner`` with
            ``num_steps`` and ``stats`` attributes.
        fetches: Not read here; the learner's own stats are used instead.
    """
    learner = trainer.optimizer.learner
    if learner.num_steps < trainer.target_update_frequency:
        # Not enough learner steps since the last target sync.
        return

    # Update the target network and restart the step count.
    learner.num_steps = 0
    local_worker = trainer.workers.local_worker()
    local_worker.foreach_trainable_policy(lambda p, _: p.update_target())

    # The KL coefficient is adapted on the same cadence as the target sync.
    if trainer.config["use_kl_loss"]:
        update_kl(trainer, learner.stats)
|
||||
|
||||
|
||||
def initialize_target(trainer):
    """Do an initial target-network sync and set the sync frequency.

    Calls ``update_target()`` on every trainable policy of the local worker,
    then stores ``num_sgd_iter * minibatch_buffer_size`` (both read from the
    trainer config) as ``trainer.target_update_frequency``.
    """
    local_worker = trainer.workers.local_worker()
    local_worker.foreach_trainable_policy(lambda p, _: p.update_target())

    sgd_iters = trainer.config["num_sgd_iter"]
    buffer_size = trainer.config["minibatch_buffer_size"]
    trainer.target_update_frequency = sgd_iters * buffer_size
|
||||
|
||||
|
||||
# APPO trainer: derived from the IMPALA trainer via with_updates(), using
# the APPO config and policy plus this module's target-network hooks.
APPOTrainer = impala.ImpalaTrainer.with_updates(
    name="APPO",
    default_config=DEFAULT_CONFIG,
    default_policy=AsyncPPOTFPolicy,
    get_policy_class=lambda _: AsyncPPOTFPolicy,
    # Initial target sync + update frequency.
    after_init=initialize_target,
    # Periodic target sync (and optional KL coefficient update).
    after_optimizer_step=update_target_and_kl,
)
|
||||
@@ -1,440 +0,0 @@
|
||||
"""Adapted from VTraceTFPolicy to use the PPO surrogate loss.
|
||||
|
||||
Keep in sync with changes to VTraceTFPolicy."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
import logging
|
||||
import gym
|
||||
|
||||
from ray.rllib.agents.impala import vtrace
|
||||
from ray.rllib.agents.impala.vtrace_policy import _make_time_major, \
|
||||
BEHAVIOUR_LOGITS, clip_gradients, \
|
||||
validate_config, choose_optimizer, ValueNetworkMixin
|
||||
from ray.rllib.evaluation.postprocessing import Postprocessing
|
||||
from ray.rllib.models.tf.tf_action_dist import Categorical
|
||||
from ray.rllib.policy.sample_batch import SampleBatch
|
||||
from ray.rllib.evaluation.postprocessing import compute_advantages
|
||||
from ray.rllib.utils import try_import_tf
|
||||
from ray.rllib.policy.tf_policy_template import build_tf_policy
|
||||
from ray.rllib.policy.tf_policy import LearningRateSchedule
|
||||
from ray.rllib.agents.ppo.ppo_policy import KLCoeffMixin
|
||||
from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.utils.explained_variance import explained_variance
|
||||
|
||||
tf = try_import_tf()
|
||||
|
||||
POLICY_SCOPE = "func"
|
||||
TARGET_POLICY_SCOPE = "target_func"
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PPOSurrogateLoss(object):
    """Clipped PPO surrogate loss, used when V-trace is disabled.

    Constructing the object builds the loss tensors and exposes them as the
    attributes ``mean_kl``, ``pi_loss``, ``value_targets``, ``vf_loss``,
    ``entropy`` and ``total_loss``.

    Arguments:
        prev_actions_logp: A float32 tensor of shape [T, B].
        actions_logp: A float32 tensor of shape [T, B].
        action_kl: A float32 tensor of shape [T, B].
        actions_entropy: A float32 tensor of shape [T, B].
        values: A float32 tensor of shape [T, B].
        valid_mask: A bool tensor of valid RNN input elements (#2992).
        advantages: A float32 tensor of shape [T, B].
        value_targets: A float32 tensor of shape [T, B].
        vf_loss_coeff (float): Coefficient of the value function loss.
        entropy_coeff (float): Coefficient of the entropy regularizer.
        clip_param (float): Clip parameter.
        cur_kl_coeff (float): Coefficient for KL loss. Only read when
            use_kl_loss is true.
        use_kl_loss (bool): If true, use KL loss.
    """

    def __init__(self,
                 prev_actions_logp,
                 actions_logp,
                 action_kl,
                 actions_entropy,
                 values,
                 valid_mask,
                 advantages,
                 value_targets,
                 vf_loss_coeff=0.5,
                 entropy_coeff=0.01,
                 clip_param=0.3,
                 cur_kl_coeff=None,
                 use_kl_loss=False):
        def masked_mean(tensor):
            # Mean over the entries selected by valid_mask only.
            return tf.reduce_mean(tf.boolean_mask(tensor, valid_mask))

        # exp of the log-prob difference, i.e. the ratio of the action
        # probabilities behind actions_logp and prev_actions_logp.
        ratio = tf.exp(actions_logp - prev_actions_logp)
        clipped_ratio = tf.clip_by_value(ratio, 1 - clip_param,
                                         1 + clip_param)
        surrogate = tf.minimum(advantages * ratio,
                               advantages * clipped_ratio)

        self.mean_kl = masked_mean(action_kl)
        self.pi_loss = -masked_mean(surrogate)

        # Baseline (value function) loss.
        value_error = values - value_targets
        self.value_targets = value_targets
        self.vf_loss = 0.5 * masked_mean(tf.square(value_error))

        # Entropy term.
        self.entropy = masked_mean(actions_entropy)

        # Weighted sum of the three terms; entropy enters as a bonus.
        self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff -
                           self.entropy * entropy_coeff)

        # Optional additional KL penalty.
        if use_kl_loss:
            self.total_loss += cur_kl_coeff * self.mean_kl
|
||||
|
||||
|
||||
class VTraceSurrogateLoss(object):
    def __init__(self,
                 actions,
                 prev_actions_logp,
                 actions_logp,
                 old_policy_actions_logp,
                 action_kl,
                 actions_entropy,
                 dones,
                 behaviour_logits,
                 old_policy_behaviour_logits,
                 target_logits,
                 discount,
                 rewards,
                 values,
                 bootstrap_value,
                 dist_class,
                 valid_mask,
                 vf_loss_coeff=0.5,
                 entropy_coeff=0.01,
                 clip_rho_threshold=1.0,
                 clip_pg_rho_threshold=1.0,
                 clip_param=0.3,
                 cur_kl_coeff=None,
                 use_kl_loss=False):
        """APPO loss with IS modifications and V-trace advantage estimation.

        VTraceLoss takes tensors of shape [T, B, ...], where `B` is the
        batch_size. The reason we need to know `B` is for V-trace to properly
        handle episode cut boundaries.

        The constructor sets ``vtrace_returns``, ``is_ratio``, ``mean_kl``,
        ``pi_loss``, ``value_targets``, ``vf_loss``, ``entropy`` and
        ``total_loss``.

        Arguments:
            actions: An int|float32 tensor of shape [T, B, logit_dim].
            prev_actions_logp: A float32 tensor of shape [T, B].
            actions_logp: A float32 tensor of shape [T, B].
            old_policy_actions_logp: A float32 tensor of shape [T, B].
            action_kl: A float32 tensor of shape [T, B].
            actions_entropy: A float32 tensor of shape [T, B].
            dones: A bool tensor of shape [T, B].
            behaviour_logits: A float32 tensor of shape [T, B, logit_dim].
            old_policy_behaviour_logits: A float32 tensor of shape
                [T, B, logit_dim].
            target_logits: A float32 tensor of shape [T, B, logit_dim].
                Accepted but not read by this constructor; V-trace is
                given old_policy_behaviour_logits as the target policy.
            discount: A float32 scalar.
            rewards: A float32 tensor of shape [T, B].
            values: A float32 tensor of shape [T, B].
            bootstrap_value: A float32 tensor of shape [B].
            dist_class: action distribution class for logits.
            valid_mask: A bool tensor of valid RNN input elements (#2992).
            vf_loss_coeff (float): Coefficient of the value function loss.
            entropy_coeff (float): Coefficient of the entropy regularizer.
            clip_rho_threshold (float): Passed to V-trace as
                clip_rho_threshold.
            clip_pg_rho_threshold (float): Passed to V-trace as
                clip_pg_rho_threshold.
            clip_param (float): Clip parameter.
            cur_kl_coeff (float): Coefficient for KL loss. Only read when
                use_kl_loss is true.
            use_kl_loss (bool): If true, use KL loss.
        """

        def masked_mean(tensor):
            # Mean over the entries selected by valid_mask only.
            return tf.reduce_mean(tf.boolean_mask(tensor, valid_mask))

        # Compute vtrace on the CPU for better perf.
        with tf.device("/cpu:0"):
            rho_clip = tf.cast(clip_rho_threshold, tf.float32)
            pg_rho_clip = tf.cast(clip_pg_rho_threshold, tf.float32)
            self.vtrace_returns = vtrace.multi_from_logits(
                behaviour_policy_logits=behaviour_logits,
                target_policy_logits=old_policy_behaviour_logits,
                actions=tf.unstack(actions, axis=2),
                # Zero discount wherever the step is marked done.
                discounts=tf.to_float(~dones) * discount,
                rewards=rewards,
                values=values,
                bootstrap_value=bootstrap_value,
                dist_class=dist_class,
                clip_rho_threshold=rho_clip,
                clip_pg_rho_threshold=pg_rho_clip)

        # Importance ratio between the behaviour log-probs and the
        # old-policy log-probs, clipped to [0, 2].
        self.is_ratio = tf.clip_by_value(
            tf.exp(prev_actions_logp - old_policy_actions_logp), 0.0, 2.0)
        ratio = self.is_ratio * tf.exp(actions_logp - prev_actions_logp)

        pg_advantages = self.vtrace_returns.pg_advantages
        clipped_ratio = tf.clip_by_value(ratio, 1 - clip_param,
                                         1 + clip_param)
        surrogate = tf.minimum(pg_advantages * ratio,
                               pg_advantages * clipped_ratio)

        self.mean_kl = masked_mean(action_kl)
        self.pi_loss = -masked_mean(surrogate)

        # Baseline (value function) loss against the V-trace targets.
        value_error = values - self.vtrace_returns.vs
        self.value_targets = self.vtrace_returns.vs
        self.vf_loss = 0.5 * masked_mean(tf.square(value_error))

        # Entropy term.
        self.entropy = masked_mean(actions_entropy)

        # Weighted sum of the three terms; entropy enters as a bonus.
        self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff -
                           self.entropy * entropy_coeff)

        # Optional additional KL penalty.
        if use_kl_loss:
            self.total_loss += cur_kl_coeff * self.mean_kl
|
||||
|
||||
|
||||
def build_appo_model(policy, obs_space, action_space, config):
    """Create the policy model and its target copy.

    Both models are built by ModelCatalog.get_model_v2 with identical
    arguments except for the name (POLICY_SCOPE vs. TARGET_POLICY_SCOPE).
    They are stored as ``policy.model`` and ``policy.target_model``.

    Returns:
        The (non-target) policy model.
    """

    def make_model(scope_name):
        return ModelCatalog.get_model_v2(
            obs_space,
            action_space,
            policy.logit_dim,
            config["model"],
            name=scope_name,
            framework="tf")

    policy.model = make_model(POLICY_SCOPE)
    policy.target_model = make_model(TARGET_POLICY_SCOPE)
    return policy.model
|
||||
|
||||
|
||||
def build_appo_surrogate_loss(policy, batch_tensors):
    """Build the APPO loss for `policy` and return its total loss tensor.

    Depending on ``policy.config["vtrace"]`` this constructs either a
    VTraceSurrogateLoss or a PPOSurrogateLoss and stores it as
    ``policy.loss``. As side effects it also sets
    ``policy.target_model_out``, ``policy.model_vars`` and
    ``policy.target_model_vars``.

    Arguments:
        policy: Policy being built. This function reads its action_space,
            config, model, target_model, input_dict, state_in, seq_lens,
            model_out, action_dist, dist_class, value_function and
            kl_coeff attributes.
        batch_tensors: Mapping from sample-batch column names to tensors.

    Returns:
        ``policy.loss.total_loss``.
    """
    if isinstance(policy.action_space, gym.spaces.Discrete):
        is_multidiscrete = False
        output_hidden_shape = [policy.action_space.n]
    elif isinstance(policy.action_space,
                    gym.spaces.multi_discrete.MultiDiscrete):
        is_multidiscrete = True
        output_hidden_shape = policy.action_space.nvec.astype(np.int32)
    else:
        is_multidiscrete = False
        output_hidden_shape = 1

    def make_time_major(*args, **kw):
        return _make_time_major(policy, *args, **kw)

    actions = batch_tensors[SampleBatch.ACTIONS]
    dones = batch_tensors[SampleBatch.DONES]
    rewards = batch_tensors[SampleBatch.REWARDS]

    behaviour_logits = batch_tensors[BEHAVIOUR_LOGITS]

    # Target-network logits act as the "old policy"; no gradient flows
    # into the target model.
    policy.target_model_out, _ = policy.target_model(
        policy.input_dict, policy.state_in, policy.seq_lens)
    old_policy_behaviour_logits = tf.stop_gradient(policy.target_model_out)

    unpacked_behaviour_logits = tf.split(
        behaviour_logits, output_hidden_shape, axis=1)
    unpacked_old_policy_behaviour_logits = tf.split(
        old_policy_behaviour_logits, output_hidden_shape, axis=1)
    unpacked_outputs = tf.split(policy.model_out, output_hidden_shape, axis=1)
    action_dist = policy.action_dist
    old_policy_action_dist = policy.dist_class(old_policy_behaviour_logits)
    prev_action_dist = policy.dist_class(behaviour_logits)
    values = policy.value_function

    # Read later by TargetNetworkMixin to build the weight-copy op.
    policy.model_vars = policy.model.variables()
    policy.target_model_vars = policy.target_model.variables()

    if policy.state_in:
        max_seq_len = tf.reduce_max(policy.seq_lens) - 1
        mask = tf.sequence_mask(policy.seq_lens, max_seq_len)
        mask = tf.reshape(mask, [-1])
    else:
        # Both loss classes document valid_mask as a bool tensor and feed
        # it to tf.boolean_mask, so build an all-True bool mask here rather
        # than a tensor of float ones.
        mask = tf.ones_like(rewards, dtype=tf.bool)

    if policy.config["vtrace"]:
        logger.info("Using V-Trace surrogate loss (vtrace=True)")

        # Prepare actions for loss
        loss_actions = actions if is_multidiscrete else tf.expand_dims(
            actions, axis=1)

        # Prepare KL for Loss
        mean_kl = make_time_major(
            old_policy_action_dist.multi_kl(action_dist), drop_last=True)

        policy.loss = VTraceSurrogateLoss(
            actions=make_time_major(loss_actions, drop_last=True),
            prev_actions_logp=make_time_major(
                prev_action_dist.logp(actions), drop_last=True),
            actions_logp=make_time_major(
                action_dist.logp(actions), drop_last=True),
            old_policy_actions_logp=make_time_major(
                old_policy_action_dist.logp(actions), drop_last=True),
            action_kl=tf.reduce_mean(mean_kl, axis=0)
            if is_multidiscrete else mean_kl,
            actions_entropy=make_time_major(
                action_dist.multi_entropy(), drop_last=True),
            dones=make_time_major(dones, drop_last=True),
            behaviour_logits=make_time_major(
                unpacked_behaviour_logits, drop_last=True),
            old_policy_behaviour_logits=make_time_major(
                unpacked_old_policy_behaviour_logits, drop_last=True),
            target_logits=make_time_major(unpacked_outputs, drop_last=True),
            discount=policy.config["gamma"],
            rewards=make_time_major(rewards, drop_last=True),
            values=make_time_major(values, drop_last=True),
            # The last time step (dropped everywhere else) supplies the
            # bootstrap value.
            bootstrap_value=make_time_major(values)[-1],
            dist_class=Categorical if is_multidiscrete else policy.dist_class,
            valid_mask=make_time_major(mask, drop_last=True),
            vf_loss_coeff=policy.config["vf_loss_coeff"],
            entropy_coeff=policy.config["entropy_coeff"],
            clip_rho_threshold=policy.config["vtrace_clip_rho_threshold"],
            clip_pg_rho_threshold=policy.config[
                "vtrace_clip_pg_rho_threshold"],
            clip_param=policy.config["clip_param"],
            cur_kl_coeff=policy.kl_coeff,
            use_kl_loss=policy.config["use_kl_loss"])
    else:
        logger.info("Using PPO surrogate loss (vtrace=False)")

        # Prepare KL for Loss
        mean_kl = make_time_major(prev_action_dist.multi_kl(action_dist))

        policy.loss = PPOSurrogateLoss(
            prev_actions_logp=make_time_major(prev_action_dist.logp(actions)),
            actions_logp=make_time_major(action_dist.logp(actions)),
            action_kl=tf.reduce_mean(mean_kl, axis=0)
            if is_multidiscrete else mean_kl,
            actions_entropy=make_time_major(action_dist.multi_entropy()),
            values=make_time_major(values),
            valid_mask=make_time_major(mask),
            advantages=make_time_major(
                batch_tensors[Postprocessing.ADVANTAGES]),
            value_targets=make_time_major(
                batch_tensors[Postprocessing.VALUE_TARGETS]),
            vf_loss_coeff=policy.config["vf_loss_coeff"],
            entropy_coeff=policy.config["entropy_coeff"],
            clip_param=policy.config["clip_param"],
            cur_kl_coeff=policy.kl_coeff,
            use_kl_loss=policy.config["use_kl_loss"])

    return policy.loss.total_loss
|
||||
|
||||
|
||||
def stats(policy, batch_tensors):
    """Collect the tensors reported as learner statistics.

    Always reports cur_lr, policy_loss, entropy, var_gnorm, vf_loss and
    vf_explained_var. Adds mean_IS / var_IS when ``vtrace`` is enabled and
    kl / KL_Coeff when ``use_kl_loss`` is enabled.

    Arguments:
        policy: Policy whose ``loss`` object has already been built.
        batch_tensors: Not read by this function.

    Returns:
        dict mapping stat names to tensors.
    """
    use_vtrace = policy.config["vtrace"]
    # Drop the last time step in V-trace mode, matching the drop_last=True
    # used when the V-trace loss inputs are made time-major.
    values_batched = _make_time_major(
        policy, policy.value_function, drop_last=use_vtrace)

    stats_dict = {
        "cur_lr": tf.cast(policy.cur_lr, tf.float64),
        "policy_loss": policy.loss.pi_loss,
        "entropy": policy.loss.entropy,
        "var_gnorm": tf.global_norm(policy.var_list),
        "vf_loss": policy.loss.vf_loss,
        "vf_explained_var": explained_variance(
            tf.reshape(policy.loss.value_targets, [-1]),
            tf.reshape(values_batched, [-1])),
    }

    if policy.config["vtrace"]:
        # Moments of the importance-sampling ratio over both [T, B] axes.
        is_mean, is_var = tf.nn.moments(policy.loss.is_ratio, [0, 1])
        stats_dict["mean_IS"] = is_mean
        stats_dict["var_IS"] = is_var

    if policy.config["use_kl_loss"]:
        stats_dict["kl"] = policy.loss.mean_kl
        stats_dict["KL_Coeff"] = policy.kl_coeff

    return stats_dict
|
||||
|
||||
|
||||
def postprocess_trajectory(policy,
                           sample_batch,
                           other_agent_batches=None,
                           episode=None):
    """Post-process a sampled trajectory before it is sent for training.

    With ``vtrace`` enabled the batch is passed through unchanged.
    Otherwise advantages are computed with compute_advantages(),
    bootstrapping from the value of the last "new_obs" unless the final
    step is marked done. In both cases the "new_obs" column is then
    removed from the batch.

    Arguments:
        policy: Policy that produced the batch.
        sample_batch: The trajectory batch.
        other_agent_batches: Not read by this function.
        episode: Not read by this function.

    Returns:
        The post-processed batch.
    """
    if policy.config["vtrace"]:
        batch = sample_batch
    else:
        if sample_batch["dones"][-1]:
            # Last step is marked done: nothing to bootstrap from.
            last_r = 0.0
        else:
            # Feed the final RNN state outputs (if any) to the value call.
            next_state = [
                [sample_batch["state_out_{}".format(i)][-1]]
                for i in range(len(policy.state_in))
            ]
            last_r = policy.value(sample_batch["new_obs"][-1], *next_state)
        batch = compute_advantages(
            sample_batch,
            last_r,
            policy.config["gamma"],
            policy.config["lambda"],
            use_gae=policy.config["use_gae"])

    del batch.data["new_obs"]  # not used, so save some bandwidth
    return batch
|
||||
|
||||
|
||||
def add_values_and_logits(policy):
    """Return the extra fetches recorded alongside each computed action.

    The model output is always included under BEHAVIOUR_LOGITS. The value
    function output is added under SampleBatch.VF_PREDS only when
    ``vtrace`` is disabled.
    """
    fetches = {BEHAVIOUR_LOGITS: policy.model_out}
    if policy.config["vtrace"]:
        return fetches
    fetches[SampleBatch.VF_PREDS] = policy.value_function
    return fetches
|
||||
|
||||
|
||||
class TargetNetworkMixin(object):
    def __init__(self, obs_space, action_space, config):
        """Build the op that copies the model weights to the target model.

        Target Network is updated by the master learner every
        trainer.target_update_frequency steps. All worker batches
        are importance sampled w.r. to the target network to ensure
        a more stable pi_old in PPO.

        The three constructor arguments are not read here; the variables
        come from ``self.model_vars`` and ``self.target_model_vars``.
        """
        assert len(self.model_vars) == len(self.target_model_vars)
        # Variables are paired by position in the two lists.
        copy_ops = [
            target_var.assign(source_var) for source_var, target_var in zip(
                self.model_vars, self.target_model_vars)
        ]
        self.update_target_network = tf.group(*copy_ops)

    def update_target(self):
        """Run the weight-copy op in the policy's session."""
        session = self.get_session()
        return session.run(self.update_target_network)
|
||||
|
||||
|
||||
def setup_mixins(policy, obs_space, action_space, config):
    """Initialize the mixins that must exist before the loss is built.

    obs_space and action_space are not read here.
    """
    lr, lr_schedule = config["lr"], config["lr_schedule"]
    LearningRateSchedule.__init__(policy, lr, lr_schedule)
    KLCoeffMixin.__init__(policy, config)
    ValueNetworkMixin.__init__(policy)
|
||||
|
||||
|
||||
def setup_late_mixins(policy, obs_space, action_space, config):
    """Initialize TargetNetworkMixin on `policy`.

    Registered as the policy's ``after_init`` hook; the mixin reads
    ``model_vars`` / ``target_model_vars``, which the loss function sets.
    """
    TargetNetworkMixin.__init__(
        policy, obs_space, action_space, config)
|
||||
|
||||
|
||||
# APPO TF policy, assembled by build_tf_policy from this module's
# model/loss/stats/post-processing functions and mixins.
AsyncPPOTFPolicy = build_tf_policy(
    name="AsyncPPOTFPolicy",
    make_model=build_appo_model,
    loss_fn=build_appo_surrogate_loss,
    stats_fn=stats,
    postprocess_fn=postprocess_trajectory,
    optimizer_fn=choose_optimizer,
    gradients_fn=clip_gradients,
    extra_action_fetches_fn=add_values_and_logits,
    before_init=validate_config,
    before_loss_init=setup_mixins,
    after_init=setup_late_mixins,
    mixins=[
        LearningRateSchedule,
        KLCoeffMixin,
        TargetNetworkMixin,
        ValueNetworkMixin,
    ],
    get_batch_divisibility_req=lambda p: p.config["sample_batch_size"])
|
||||
@@ -1,154 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import logging
|
||||
|
||||
from ray.rllib.agents import with_common_config
|
||||
from ray.rllib.agents.ppo.ppo_policy import PPOTFPolicy
|
||||
from ray.rllib.agents.trainer_template import build_trainer
|
||||
from ray.rllib.optimizers import SyncSamplesOptimizer, LocalMultiGPUOptimizer
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# yapf: disable
|
||||
# __sphinx_doc_begin__
|
||||
# PPO defaults, passed through with_common_config().
DEFAULT_CONFIG = with_common_config({
    # If true, use the Generalized Advantage Estimator (GAE)
    # with a value function, see https://arxiv.org/pdf/1506.02438.pdf.
    "use_gae": True,
    # GAE(lambda) parameter
    "lambda": 1.0,
    # Initial coefficient for KL divergence
    "kl_coeff": 0.2,
    # Size of batches collected from each worker
    "sample_batch_size": 200,
    # Number of timesteps collected for each SGD round
    "train_batch_size": 4000,
    # Total SGD batch size across all devices for SGD
    "sgd_minibatch_size": 128,
    # Whether to shuffle sequences in the batch when training (recommended)
    "shuffle_sequences": True,
    # Number of SGD iterations in each outer loop
    "num_sgd_iter": 30,
    # Stepsize of SGD
    "lr": 5e-5,
    # Learning rate schedule
    "lr_schedule": None,
    # Share layers for value function. If you set this to True, it's
    # important to tune vf_loss_coeff.
    "vf_share_layers": False,
    # Coefficient of the value function loss. It's important to tune this
    # if you set vf_share_layers: True
    "vf_loss_coeff": 1.0,
    # Coefficient of the entropy regularizer
    "entropy_coeff": 0.0,
    # Decay schedule for the entropy regularizer
    "entropy_coeff_schedule": None,
    # PPO clip parameter
    "clip_param": 0.3,
    # Clip param for the value function. Note that this is sensitive to the
    # scale of the rewards. If your expected V is large, increase this.
    "vf_clip_param": 10.0,
    # If specified, clip the global norm of gradients by this amount
    "grad_clip": None,
    # Target value for KL divergence
    "kl_target": 0.01,
    # Whether to rollout "complete_episodes" or "truncate_episodes"
    "batch_mode": "truncate_episodes",
    # Which observation filter to apply to the observation
    "observation_filter": "NoFilter",
    # Uses the sync samples optimizer instead of the multi-gpu one. This
    # does not support minibatches.
    "simple_optimizer": False,
})
|
||||
# __sphinx_doc_end__
|
||||
# yapf: enable
|
||||
|
||||
|
||||
def choose_policy_optimizer(workers, config):
    """Pick the policy optimizer for PPO based on ``simple_optimizer``.

    Returns a LocalMultiGPUOptimizer unless ``config["simple_optimizer"]``
    is set, in which case a SyncSamplesOptimizer is returned.
    """
    if not config["simple_optimizer"]:
        return LocalMultiGPUOptimizer(
            workers,
            sgd_batch_size=config["sgd_minibatch_size"],
            num_sgd_iter=config["num_sgd_iter"],
            num_gpus=config["num_gpus"],
            sample_batch_size=config["sample_batch_size"],
            num_envs_per_worker=config["num_envs_per_worker"],
            train_batch_size=config["train_batch_size"],
            standardize_fields=["advantages"],
            shuffle_sequences=config["shuffle_sequences"])

    # Simple path: no sgd_minibatch_size is passed to this optimizer.
    return SyncSamplesOptimizer(
        workers,
        num_sgd_iter=config["num_sgd_iter"],
        train_batch_size=config["train_batch_size"])
|
||||
|
||||
|
||||
def update_kl(trainer, fetches):
    """Push the sampled KL from `fetches` into the local policies.

    If `fetches` has a top-level "kl" entry it is treated as single-agent
    output and handed to the local worker's policy. Otherwise `fetches` is
    looked up per policy id, and policies without an entry are skipped with
    a debug log message.

    Arguments:
        trainer: Trainer providing ``workers.local_worker()``.
        fetches: Learner stats; either ``{"kl": ...}`` or
            ``{policy_id: {"kl": ...}}``.
    """
    local_worker = trainer.workers.local_worker()

    if "kl" in fetches:
        # single-agent
        local_worker.for_policy(lambda pi: pi.update_kl(fetches["kl"]))
        return

    def update_one(pi, pi_id):
        if pi_id not in fetches:
            logger.debug("No data for {}, not updating kl".format(pi_id))
            return
        pi.update_kl(fetches[pi_id]["kl"])

    # multi-agent
    local_worker.foreach_trainable_policy(update_one)
|
||||
|
||||
|
||||
def warn_about_bad_reward_scales(trainer, result):
    """Log a warning when rewards dwarf the ``vf_clip_param`` setting.

    The reward scale is |episode_reward_mean| / vf_clip_param, rounded to
    a whole number. It is treated as infinite when vf_clip_param <= 0 and
    as 0 when ``result["policy_reward_mean"]`` is non-empty. A warning is
    logged when the scale exceeds 200.
    """
    vf_clip = trainer.config["vf_clip_param"]

    # Warn about bad clipping configs
    if vf_clip <= 0:
        rew_scale = float("inf")
    elif result["policy_reward_mean"]:
        rew_scale = 0  # punt on handling multiagent case
    else:
        mean_reward = abs(result["episode_reward_mean"])
        rew_scale = round(mean_reward / vf_clip, 0)

    if rew_scale > 200:
        logger.warning(
            "The magnitude of your environment rewards are more than "
            "{}x the scale of `vf_clip_param`. ".format(rew_scale) +
            "This means that it will take more than "
            "{} iterations for your value ".format(rew_scale) +
            "function to converge. If this is not intended, consider "
            "increasing `vf_clip_param`.")
|
||||
|
||||
|
||||
def validate_config(config):
    """Sanity-check a PPO config, raising on invalid combinations.

    Raises:
        DeprecationWarning: if entropy_coeff is negative.
        ValueError: if sgd_minibatch_size exceeds train_batch_size, or if
            batch_mode is "truncate_episodes" while use_gae is off.

    Also logs a note for multi-agent setups using the multi-GPU optimizer
    and a warning when simple_optimizer is enabled.
    """
    if config["entropy_coeff"] < 0:
        raise DeprecationWarning("entropy_coeff must be >= 0")

    minibatch_size = config["sgd_minibatch_size"]
    train_batch_size = config["train_batch_size"]
    if minibatch_size > train_batch_size:
        raise ValueError(
            "Minibatch size {} must be <= train batch size {}.".format(
                minibatch_size, train_batch_size))

    truncating = config["batch_mode"] == "truncate_episodes"
    if truncating and not config["use_gae"]:
        raise ValueError(
            "Episode truncation is not supported without a value "
            "function. Consider setting batch_mode=complete_episodes.")

    if config["multiagent"]["policies"] and not config["simple_optimizer"]:
        logger.info(
            "In multi-agent mode, policies will be optimized sequentially "
            "by the multi-GPU optimizer. Consider setting "
            "simple_optimizer=True if this doesn't work for you.")

    if config["simple_optimizer"]:
        logger.warning(
            "Using the simple non-minibatch optimizer. This will greatly "
            "reduce performance, consider simple_optimizer=False.")
|
||||
|
||||
|
||||
# PPO trainer, assembled by build_trainer from this module's config,
# optimizer factory, validation and post-step hooks.
PPOTrainer = build_trainer(
    name="PPO",
    default_config=DEFAULT_CONFIG,
    default_policy=PPOTFPolicy,
    make_policy_optimizer=choose_policy_optimizer,
    validate_config=validate_config,
    # Adapt the KL coefficient after each optimizer step.
    after_optimizer_step=update_kl,
    after_train_result=warn_about_bad_reward_scales,
)
|
||||
@@ -1,270 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import logging
|
||||
|
||||
import ray
|
||||
from ray.rllib.evaluation.postprocessing import compute_advantages, \
|
||||
Postprocessing
|
||||
from ray.rllib.policy.sample_batch import SampleBatch
|
||||
from ray.rllib.policy.tf_policy import LearningRateSchedule, \
|
||||
EntropyCoeffSchedule
|
||||
from ray.rllib.policy.tf_policy_template import build_tf_policy
|
||||
from ray.rllib.models.catalog import ModelCatalog
|
||||
from ray.rllib.utils.explained_variance import explained_variance
|
||||
from ray.rllib.utils import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Frozen logits of the policy that computed the action
|
||||
BEHAVIOUR_LOGITS = "behaviour_logits"
|
||||
|
||||
|
||||
class PPOLoss(object):
    def __init__(self,
                 action_space,
                 value_targets,
                 advantages,
                 actions,
                 logits,
                 vf_preds,
                 curr_action_dist,
                 value_fn,
                 cur_kl_coeff,
                 valid_mask,
                 entropy_coeff=0,
                 clip_param=0.1,
                 vf_clip_param=0.1,
                 vf_loss_coeff=1.0,
                 use_gae=True):
        """Constructs the loss for Proximal Policy Objective.

        Sets ``mean_kl``, ``mean_entropy``, ``mean_policy_loss``,
        ``mean_vf_loss`` and ``loss`` on the instance.

        Arguments:
            action_space: Environment action space specification; used to
                look up the action distribution class.
            value_targets (Placeholder): Placeholder for target values; used
                for GAE.
            advantages (Placeholder): Placeholder for calculated advantages
                from previous model evaluation.
            actions (Placeholder): Placeholder for actions taken
                from previous model evaluation.
            logits (Placeholder): Placeholder for logits output from
                previous model evaluation.
            vf_preds (Placeholder): Placeholder for value function output
                from previous model evaluation.
            curr_action_dist (ActionDistribution): ActionDistribution
                of the current model.
            value_fn (Tensor): Current value function output Tensor.
            cur_kl_coeff (Variable): Variable holding the current PPO KL
                coefficient.
            valid_mask (Tensor): A bool mask of valid input elements (#2992).
            entropy_coeff (float): Coefficient of the entropy regularizer.
            clip_param (float): Clip parameter
            vf_clip_param (float): Clip parameter for the value function
            vf_loss_coeff (float): Coefficient of the value function loss
            use_gae (bool): If true, use the Generalized Advantage Estimator.
        """

        def masked_mean(tensor):
            # Mean over the entries selected by valid_mask only.
            return tf.reduce_mean(tf.boolean_mask(tensor, valid_mask))

        # Distribution of the policy that produced the sampled actions.
        dist_cls, _ = ModelCatalog.get_action_dist(action_space, {})
        prev_dist = dist_cls(logits)

        # Make loss functions.
        curr_logp = curr_action_dist.logp(actions)
        prev_logp = prev_dist.logp(actions)
        ratio = tf.exp(curr_logp - prev_logp)

        kl = prev_dist.kl(curr_action_dist)
        self.mean_kl = masked_mean(kl)

        entropy = curr_action_dist.entropy()
        self.mean_entropy = masked_mean(entropy)

        clipped_ratio = tf.clip_by_value(ratio, 1 - clip_param,
                                         1 + clip_param)
        surrogate = tf.minimum(advantages * ratio,
                               advantages * clipped_ratio)
        self.mean_policy_loss = masked_mean(-surrogate)

        # Per-element objective, accumulated in the same term order for
        # both the GAE and the non-GAE case.
        objective = -surrogate + cur_kl_coeff * kl
        if use_gae:
            # Clipped value loss: take the worse of the unclipped error
            # and the error of a prediction kept within vf_clip_param of
            # vf_preds.
            vf_unclipped = tf.square(value_fn - value_targets)
            vf_clipped_pred = vf_preds + tf.clip_by_value(
                value_fn - vf_preds, -vf_clip_param, vf_clip_param)
            vf_clipped = tf.square(vf_clipped_pred - value_targets)
            vf_loss = tf.maximum(vf_unclipped, vf_clipped)
            self.mean_vf_loss = masked_mean(vf_loss)
            objective = objective + vf_loss_coeff * vf_loss
        else:
            self.mean_vf_loss = tf.constant(0.0)
        objective = objective - entropy_coeff * entropy

        self.loss = masked_mean(objective)
|
||||
|
||||
|
||||
def ppo_surrogate_loss(policy, batch_tensors):
    """Build the PPOLoss for `policy` and return its loss tensor.

    The loss object is stored as ``policy.loss_obj``. For recurrent
    policies (non-empty ``state_in``) the valid mask is derived from the
    sequence lengths; otherwise every element is marked valid.
    """
    advantages = batch_tensors[Postprocessing.ADVANTAGES]

    if policy.state_in:
        longest = tf.reduce_max(policy.seq_lens)
        valid = tf.sequence_mask(policy.seq_lens, longest)
        valid = tf.reshape(valid, [-1])
    else:
        valid = tf.ones_like(advantages, dtype=tf.bool)

    policy.loss_obj = PPOLoss(
        action_space=policy.action_space,
        value_targets=batch_tensors[Postprocessing.VALUE_TARGETS],
        advantages=advantages,
        actions=batch_tensors[SampleBatch.ACTIONS],
        logits=batch_tensors[BEHAVIOUR_LOGITS],
        vf_preds=batch_tensors[SampleBatch.VF_PREDS],
        curr_action_dist=policy.action_dist,
        value_fn=policy.value_function,
        cur_kl_coeff=policy.kl_coeff,
        valid_mask=valid,
        entropy_coeff=policy.entropy_coeff,
        clip_param=policy.config["clip_param"],
        vf_clip_param=policy.config["vf_clip_param"],
        vf_loss_coeff=policy.config["vf_loss_coeff"],
        use_gae=policy.config["use_gae"])

    return policy.loss_obj.loss
|
||||
|
||||
|
||||
def kl_and_loss_stats(policy, batch_tensors):
    """Return the tensors PPO reports as learner statistics.

    Values are taken from ``policy.loss_obj`` plus the current KL
    coefficient, learning rate and entropy coefficient (each cast to
    float64).
    """
    loss_obj = policy.loss_obj
    value_targets = batch_tensors[Postprocessing.VALUE_TARGETS]

    stats = {
        "cur_kl_coeff": tf.cast(policy.kl_coeff, tf.float64),
        "cur_lr": tf.cast(policy.cur_lr, tf.float64),
        "total_loss": loss_obj.loss,
        "policy_loss": loss_obj.mean_policy_loss,
        "vf_loss": loss_obj.mean_vf_loss,
        "vf_explained_var": explained_variance(value_targets,
                                               policy.value_function),
        "kl": loss_obj.mean_kl,
        "entropy": loss_obj.mean_entropy,
        "entropy_coeff": tf.cast(policy.entropy_coeff, tf.float64),
    }
    return stats
|
||||
|
||||
|
||||
def vf_preds_and_logits_fetches(policy):
    """Adds value function and logits outputs to experience batches."""
    fetches = {}
    fetches[SampleBatch.VF_PREDS] = policy.value_function
    fetches[BEHAVIOUR_LOGITS] = policy.model_out
    return fetches
|
||||
|
||||
|
||||
def postprocess_ppo_gae(policy,
                        sample_batch,
                        other_agent_batches=None,
                        episode=None):
    """Adds the policy logits, VF preds, and advantages to the trajectory.

    If the last step of the batch is marked done the bootstrap value is
    0.0; otherwise it is the policy's value estimate for the last next
    observation, given the last action, reward and RNN state outputs.
    other_agent_batches and episode are not read here.
    """
    if sample_batch["dones"][-1]:
        last_r = 0.0
    else:
        # One single-element batch per RNN state output.
        next_state = [
            [sample_batch["state_out_{}".format(i)][-1]]
            for i in range(len(policy.state_in))
        ]
        last_r = policy._value(sample_batch[SampleBatch.NEXT_OBS][-1],
                               sample_batch[SampleBatch.ACTIONS][-1],
                               sample_batch[SampleBatch.REWARDS][-1],
                               *next_state)

    return compute_advantages(
        sample_batch,
        last_r,
        policy.config["gamma"],
        policy.config["lambda"],
        use_gae=policy.config["use_gae"])
|
||||
|
||||
|
||||
def clip_gradients(policy, optimizer, loss):
    """Compute gradients for `loss`, clipping them if configured.

    With ``grad_clip`` unset (None) this defers to
    ``optimizer.compute_gradients``. Otherwise the gradients w.r.t. the
    trainable variables of the current variable scope are clipped by
    global norm; ``policy.var_list`` and ``policy.grads`` are set as a
    side effect.

    Returns:
        The optimizer's gradients, or a list of (clipped_grad, var) pairs.
    """
    max_norm = policy.config["grad_clip"]
    if max_norm is None:
        return optimizer.compute_gradients(
            loss, colocate_gradients_with_ops=True)

    scope_name = tf.get_variable_scope().name
    policy.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                        scope_name)
    raw_grads = tf.gradients(loss, policy.var_list)
    policy.grads, _ = tf.clip_by_global_norm(raw_grads,
                                             policy.config["grad_clip"])
    return list(zip(policy.grads, policy.var_list))
|
||||
|
||||
|
||||
class KLCoeffMixin(object):
    """Holds the adaptive KL coefficient as a Python value and TF variable."""

    def __init__(self, config):
        # KL Coefficient
        self.kl_coeff_val = config["kl_coeff"]
        self.kl_target = config["kl_target"]
        # Non-trainable scalar mirroring kl_coeff_val inside the graph.
        self.kl_coeff = tf.get_variable(
            initializer=tf.constant_initializer(self.kl_coeff_val),
            name="kl_coeff",
            shape=(),
            trainable=False,
            dtype=tf.float32)

    def update_kl(self, sampled_kl):
        """Adapt the KL coefficient to the sampled KL and return it.

        The coefficient is multiplied by 1.5 when sampled_kl exceeds twice
        the target, by 0.5 when it is below half the target, and left
        unchanged otherwise. The value is always loaded into the TF
        variable.
        """
        target = self.kl_target
        if sampled_kl > 2.0 * target:
            factor = 1.5
        elif sampled_kl < 0.5 * target:
            factor = 0.5
        else:
            factor = None

        if factor is not None:
            self.kl_coeff_val *= factor

        self.kl_coeff.load(self.kl_coeff_val, session=self.get_session())
        return self.kl_coeff_val
|
||||
|
||||
|
||||
class ValueNetworkMixin(object):
    """Mixin exposing a value-function tensor and a way to evaluate it."""

    def __init__(self, obs_space, action_space, config):
        if not config["use_gae"]:
            # Without GAE the value output is a zero vector whose length
            # follows the first axis of the observation placeholder.
            obs_placeholder = self.get_placeholder(SampleBatch.CUR_OBS)
            self.value_function = tf.zeros(
                shape=tf.shape(obs_placeholder)[:1])
        else:
            self.value_function = self.model.value_function()

    def _value(self, ob, prev_action, prev_reward, *args):
        """Run the value function on a single step and return the scalar.

        Args:
            ob: observation to evaluate.
            prev_action: action fed to the previous-action placeholder.
            prev_reward: reward fed to the previous-reward placeholder.
            *args: one value per entry of ``self.state_in``.
        """
        feed_dict = {
            self.get_placeholder(SampleBatch.CUR_OBS): [ob],
            self.get_placeholder(SampleBatch.PREV_ACTIONS): [prev_action],
            self.get_placeholder(SampleBatch.PREV_REWARDS): [prev_reward],
            self.seq_lens: [1]
        }
        assert len(args) == len(self.state_in), (args, self.state_in)
        # Pair every recurrent-state placeholder with its supplied value.
        feed_dict.update(zip(self.state_in, args))
        values = self.get_session().run(self.value_function, feed_dict)
        return values[0]
|
||||
|
||||
|
||||
def setup_config(policy, obs_space, action_space, config):
    """Mirror the top-level ``vf_share_layers`` option into the model config.

    Only ``config`` is touched; the other arguments are unused.
    """
    share_layers = config["vf_share_layers"]
    config["model"]["vf_share_layers"] = share_layers
|
||||
|
||||
|
||||
def setup_mixins(policy, obs_space, action_space, config):
    """Initialise every mixin on ``policy`` before the loss is built.

    The mixin constructors are invoked explicitly, in order: value network,
    KL coefficient, entropy-coefficient schedule, learning-rate schedule.
    """
    ValueNetworkMixin.__init__(policy, obs_space, action_space, config)
    KLCoeffMixin.__init__(policy, config)

    entropy_coeff = config["entropy_coeff"]
    entropy_schedule = config["entropy_coeff_schedule"]
    EntropyCoeffSchedule.__init__(policy, entropy_coeff, entropy_schedule)

    learning_rate = config["lr"]
    lr_schedule = config["lr_schedule"]
    LearningRateSchedule.__init__(policy, learning_rate, lr_schedule)
|
||||
|
||||
|
||||
# Assemble the PPO TensorFlow policy class from the hook functions and
# mixins referenced below.
PPOTFPolicy = build_tf_policy(
    name="PPOTFPolicy",
    # Looked up lazily through the fully-qualified module path each time the
    # lambda is called (presumably to avoid an import cycle with ppo.py --
    # TODO confirm).
    get_default_config=lambda: ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG,
    loss_fn=ppo_surrogate_loss,
    stats_fn=kl_and_loss_stats,
    extra_action_fetches_fn=vf_preds_and_logits_fetches,
    postprocess_fn=postprocess_ppo_gae,
    gradients_fn=clip_gradients,
    before_init=setup_config,
    before_loss_init=setup_mixins,
    mixins=[
        LearningRateSchedule, EntropyCoeffSchedule, KLCoeffMixin,
        ValueNetworkMixin
    ])
|
||||
@@ -1,64 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
import numpy as np
|
||||
from numpy.testing import assert_allclose
|
||||
|
||||
from ray.rllib.models.tf.tf_action_dist import Categorical
|
||||
from ray.rllib.agents.ppo.utils import flatten, concatenate
|
||||
from ray.rllib.utils import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
|
||||
|
||||
# TODO(ekl): move to rllib/models dir
|
||||
class DistributionsTest(unittest.TestCase):
    """Statistical sanity checks for action distributions."""

    def testCategorical(self):
        """Sampled frequencies of ``Categorical`` must match softmax(logits).

        Draws ``num_samples`` samples from one fixed random logit vector and
        checks that the L1 distance between the empirical frequencies and
        the softmax probabilities is at most 0.01.
        """
        num_samples = 100000
        logits = tf.placeholder(tf.float32, shape=(None, 10))
        # Random logits in [-4, 4); the same row is repeated for each sample.
        z = 8 * (np.random.rand(10) - 0.5)
        data = np.tile(z, (num_samples, 1))
        c = Categorical(logits)
        sample_op = c.sample()
        # Use the session as a context manager so it is always closed, even
        # when running the graph raises.
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            samples = sess.run(sample_op, feed_dict={logits: data})
        counts = np.zeros(10)
        for sample in samples:
            counts[sample] += 1.0
        probs = np.exp(z) / np.sum(np.exp(z))
        self.assertTrue(np.sum(np.abs(probs - counts / num_samples)) <= 0.01)
|
||||
|
||||
|
||||
class UtilsTest(unittest.TestCase):
    """Tests for the dictionary helpers ``flatten`` and ``concatenate``."""

    def testFlatten(self):
        """Merging the first two axes must keep row-major element order."""
        d = {
            "s": np.array([[[1, -1], [2, -2]], [[3, -3], [4, -4]]]),
            "a": np.array([[[5], [-5]], [[6], [-6]]])
        }
        flat = flatten(d.copy(), start=0, stop=2)
        # Entry [outer][inner] of the original lands at row 2 * outer + inner.
        for outer in range(2):
            for inner in range(2):
                assert_allclose(d["s"][outer][inner][:],
                                flat["s"][2 * outer + inner][:])
        for outer in range(2):
            for inner in range(2):
                assert_allclose(d["a"][outer][inner],
                                flat["a"][2 * outer + inner])

    def testConcatenate(self):
        """Values are joined per key; a one-element list is a no-op."""
        first = {"s": np.array([0, 1]), "a": np.array([2, 3])}
        second = {"s": np.array([4, 5]), "a": np.array([6, 7])}
        expected_s = np.array([0, 1, 4, 5])
        expected_a = np.array([2, 3, 6, 7])

        joined = concatenate([first, second])
        assert_allclose(joined["s"], expected_s)
        assert_allclose(joined["a"], expected_a)

        rejoined = concatenate([joined])
        assert_allclose(rejoined["s"], expected_s)
        assert_allclose(rejoined["a"], expected_a)
|
||||
|
||||
|
||||
# Run the test cases of this file with verbose output when it is executed
# directly as a script.
if __name__ == "__main__":
    unittest.main(verbosity=2)
|
||||
@@ -1,36 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
def flatten(weights, start=0, stop=2):
    """Collapse a range of axes of every array in a dictionary.

    Axes ``start`` up to (but not including) ``stop`` of each value are
    merged into a single axis. The dictionary is modified in place.

    Args:
        weights: A dictionary mapping keys to numpy arrays.
        start: Index of the first axis to merge.
        stop: Index one past the last axis to merge.

    Returns:
        The same ``weights`` dictionary, now holding the reshaped arrays.
    """
    for name in list(weights):
        array = weights[name]
        leading = array.shape[:start]
        trailing = array.shape[stop:]
        weights[name] = array.reshape(leading + (-1, ) + trailing)
    return weights
|
||||
|
||||
|
||||
def concatenate(weights_list):
    """Join a list of dictionaries of arrays key by key.

    The keys of the first dictionary decide which entries are produced; for
    each of them the arrays of all dictionaries are concatenated in list
    order.
    """
    names = weights_list[0].keys()
    return {
        name: np.concatenate([weights[name] for weights in weights_list])
        for name in names
    }
|
||||
|
||||
|
||||
def shuffle(trajectory):
    """Apply one random permutation to every array of a trajectory.

    The permutation length is taken from the first axis of
    ``trajectory["actions"]``. The dictionary is updated in place and also
    returned.
    """
    num_steps = trajectory["actions"].shape[0]
    order = np.random.permutation(num_steps)
    for name in list(trajectory.keys()):
        trajectory[name] = trajectory[name][order]
    return trajectory
|
||||
@@ -1 +0,0 @@
|
||||
Code in this package is adapted from https://github.com/oxwhirl/pymarl.
|
||||
@@ -1,8 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from ray.rllib.agents.qmix.qmix import QMixTrainer, DEFAULT_CONFIG
|
||||
from ray.rllib.agents.qmix.apex import ApexQMixTrainer
|
||||
|
||||
__all__ = ["QMixTrainer", "ApexQMixTrainer", "DEFAULT_CONFIG"]
|
||||
@@ -1,39 +0,0 @@
|
||||
"""Experimental: scalable Ape-X variant of QMIX"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from ray.rllib.agents.dqn.apex import APEX_TRAINER_PROPERTIES
|
||||
from ray.rllib.agents.qmix.qmix import QMixTrainer, \
|
||||
DEFAULT_CONFIG as QMIX_CONFIG
|
||||
from ray.rllib.utils import merge_dicts
|
||||
|
||||
# Default configuration of the Ape-X QMIX variant: the QMIX defaults with
# the entries below overriding or extending them.
APEX_QMIX_DEFAULT_CONFIG = merge_dicts(
    QMIX_CONFIG,  # see also the options in qmix.py, which are also supported
    {
        # The nested optimizer options are merged separately so the QMIX
        # optimizer settings are kept alongside the additions.
        "optimizer": merge_dicts(
            QMIX_CONFIG["optimizer"],
            {
                "max_weight_sync_delay": 400,
                "num_replay_buffer_shards": 4,
                "batch_replay": True,  # required for RNN. Disables prio.
                "debug": False
            }),
        "num_gpus": 0,
        "num_workers": 32,
        "buffer_size": 2000000,
        "learning_starts": 50000,
        "train_batch_size": 512,
        "sample_batch_size": 50,
        "target_network_update_freq": 500000,
        "timesteps_per_iteration": 25000,
        "per_worker_exploration": True,
        "min_iter_time_s": 30,
    },
)
|
||||
|
||||
# Trainer derived from QMixTrainer with the Ape-X defaults above and the
# shared Ape-X trainer properties imported from the DQN package.
ApexQMixTrainer = QMixTrainer.with_updates(
    name="APEX_QMIX",
    default_config=APEX_QMIX_DEFAULT_CONFIG,
    **APEX_TRAINER_PROPERTIES)
|
||||
@@ -1,64 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import torch as th
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
import numpy as np
|
||||
|
||||
|
||||
class VDNMixer(nn.Module):
    """Parameter-free mixer that adds up the per-agent Q-values."""

    def __init__(self):
        nn.Module.__init__(self)

    def forward(self, agent_qs, batch):
        """Sum ``agent_qs`` over axis 2, keeping that axis with size one.

        ``batch`` is accepted for interface compatibility and is not used.
        """
        return agent_qs.sum(dim=2, keepdim=True)
|
||||
|
||||
|
||||
class QMixer(nn.Module):
    """Mixing network whose weights are generated from the state.

    The hyper-network outputs are passed through ``abs`` before being used
    as mixing weights, so the mixed value cannot decrease when any
    individual agent value increases.
    """

    def __init__(self, n_agents, state_shape, mixing_embed_dim):
        nn.Module.__init__(self)

        self.n_agents = n_agents
        self.embed_dim = mixing_embed_dim
        # The state is flattened, so only its total element count matters.
        self.state_dim = int(np.prod(state_shape))

        # Hyper-networks producing the weights of the two mixing layers.
        self.hyper_w_1 = nn.Linear(self.state_dim,
                                   self.embed_dim * self.n_agents)
        self.hyper_w_final = nn.Linear(self.state_dim, self.embed_dim)

        # State dependent bias for hidden layer
        self.hyper_b_1 = nn.Linear(self.state_dim, self.embed_dim)

        # V(s) instead of a bias for the last layers
        self.V = nn.Sequential(
            nn.Linear(self.state_dim, self.embed_dim), nn.ReLU(),
            nn.Linear(self.embed_dim, 1))

    def forward(self, agent_qs, states):
        """Mix per-agent Q-values into one joint value per time step.

        Arguments:
            agent_qs: Tensor whose trailing axis has size n_agents, e.g.
                [B, T, n_agents]; it is viewed as [-1, 1, n_agents].
            states: Tensor reshaped to [-1, state_dim], e.g.
                [B, T, state_dim].

        Returns:
            Tensor of shape [B, -1, 1] holding the mixed values.
        """
        batch_size = agent_qs.size(0)
        flat_states = states.reshape(-1, self.state_dim)
        flat_qs = agent_qs.view(-1, 1, self.n_agents)

        # First mixing layer: non-negative weights plus state-dependent bias.
        first_w = th.abs(self.hyper_w_1(flat_states)).view(
            -1, self.n_agents, self.embed_dim)
        first_b = self.hyper_b_1(flat_states).view(-1, 1, self.embed_dim)
        hidden = F.elu(th.bmm(flat_qs, first_w) + first_b)

        # Second mixing layer: non-negative weights plus V(s).
        last_w = th.abs(self.hyper_w_final(flat_states)).view(
            -1, self.embed_dim, 1)
        state_value = self.V(flat_states).view(-1, 1, 1)
        mixed = th.bmm(hidden, last_w) + state_value

        return mixed.view(batch_size, -1, 1)
|
||||
@@ -1,42 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from torch import nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from ray.rllib.models.preprocessors import get_preprocessor
|
||||
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
|
||||
from ray.rllib.utils.annotations import override
|
||||
|
||||
|
||||
class RNNModel(TorchModelV2, nn.Module):
    """Default recurrent model for QMIX: Linear -> GRUCell -> Linear."""

    def __init__(self, obs_space, action_space, num_outputs, model_config,
                 name):
        TorchModelV2.__init__(self, obs_space, action_space, num_outputs,
                              model_config, name)
        nn.Module.__init__(self)
        self.obs_size = _get_size(obs_space)
        hidden_dim = model_config["lstm_cell_size"]
        self.rnn_hidden_dim = hidden_dim
        self.fc1 = nn.Linear(self.obs_size, hidden_dim)
        self.rnn = nn.GRUCell(hidden_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, num_outputs)

    @override(TorchModelV2)
    def get_initial_state(self):
        """Return a single all-zero hidden state of size rnn_hidden_dim."""
        # Allocating through an existing parameter keeps the hidden state on
        # the same device as the model.
        zeros = self.fc1.weight.new(1, self.rnn_hidden_dim).zero_()
        return [zeros.squeeze(0)]

    @override(TorchModelV2)
    def forward(self, input_dict, hidden_state, seq_lens):
        """Advance the GRU one step; return the outputs and new state."""
        features = F.relu(self.fc1(input_dict["obs_flat"].float()))
        prev_hidden = hidden_state[0].reshape(-1, self.rnn_hidden_dim)
        next_hidden = self.rnn(features, prev_hidden)
        q_values = self.fc2(next_hidden)
        return q_values, [next_hidden]
|
||||
|
||||
|
||||
def _get_size(obs_space):
    """Return the ``size`` of the preprocessor built for ``obs_space``."""
    preprocessor_cls = get_preprocessor(obs_space)
    preprocessor = preprocessor_cls(obs_space)
    return preprocessor.size
|
||||
@@ -1,104 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from ray.rllib.agents.trainer import with_common_config
|
||||
from ray.rllib.agents.dqn.dqn import GenericOffPolicyTrainer
|
||||
from ray.rllib.agents.qmix.qmix_policy import QMixTorchPolicy
|
||||
from ray.rllib.optimizers import SyncBatchReplayOptimizer
|
||||
|
||||
# yapf: disable
|
||||
# __sphinx_doc_begin__
|
||||
DEFAULT_CONFIG = with_common_config({
    # === QMix ===
    # Mixing network. Either "qmix", "vdn", or None
    "mixer": "qmix",
    # Size of the mixing network embedding
    "mixing_embed_dim": 32,
    # Whether to use Double_Q learning
    "double_q": True,
    # Optimize over complete episodes by default.
    "batch_mode": "complete_episodes",

    # === Evaluation ===
    # Evaluate with epsilon=0 every `evaluation_interval` training iterations.
    # The evaluation stats will be reported under the "evaluation" metric key.
    # Note that evaluation is currently not parallelized, and that for Ape-X
    # metrics are already only reported for the lowest epsilon workers.
    "evaluation_interval": None,
    # Number of episodes to run per evaluation period.
    "evaluation_num_episodes": 10,

    # === Exploration ===
    # Max num timesteps for annealing schedules. Exploration is annealed from
    # 1.0 to exploration_final_eps over this number of timesteps scaled by
    # exploration_fraction.
    # NOTE(review): the schedule itself is built outside this file; confirm
    # that the end value is exploration_final_eps.
    "schedule_max_timesteps": 100000,
    # Number of env steps to optimize for before returning
    "timesteps_per_iteration": 1000,
    # Fraction of entire training period over which the exploration rate is
    # annealed
    "exploration_fraction": 0.1,
    # Final value of random action probability
    "exploration_final_eps": 0.02,
    # Update the target network every `target_network_update_freq` steps.
    "target_network_update_freq": 500,

    # === Replay buffer ===
    # Size of the replay buffer in steps.
    "buffer_size": 10000,

    # === Optimization ===
    # Learning rate for the RMSprop optimizer
    "lr": 0.0005,
    # RMSProp alpha
    "optim_alpha": 0.99,
    # RMSProp epsilon
    "optim_eps": 0.00001,
    # If not None, clip gradients during optimization at this value
    "grad_norm_clipping": 10,
    # How many steps of the model to sample before learning starts.
    "learning_starts": 1000,
    # Update the replay buffer with this many samples at once. Note that
    # this setting applies per-worker if num_workers > 1.
    "sample_batch_size": 4,
    # Size of a batch sampled from the replay buffer for training. Note that
    # if async_updates is set, then each worker returns gradients for a
    # batch of this size.
    "train_batch_size": 32,

    # === Parallelism ===
    # Number of workers for collecting samples with. This only makes sense
    # to increase if your environment is particularly slow to sample, or if
    # you're using the Async or Ape-X optimizers.
    "num_workers": 0,
    # Whether to use a distribution of epsilons across workers for exploration.
    "per_worker_exploration": False,
    # Whether to compute priorities on workers.
    "worker_side_prioritization": False,
    # Prevent iterations from going lower than this time span
    "min_iter_time_s": 1,

    # === Model ===
    "model": {
        "lstm_cell_size": 64,
        "max_seq_len": 999999,
    },
})
|
||||
# __sphinx_doc_end__
|
||||
# yapf: enable
|
||||
|
||||
|
||||
def make_sync_batch_optimizer(workers, config):
    """Build the batch-replay optimizer used by QMIX.

    The replay settings ``learning_starts``, ``buffer_size`` and
    ``train_batch_size`` are forwarded from ``config`` unchanged.
    """
    forwarded = ("learning_starts", "buffer_size", "train_batch_size")
    replay_options = {option: config[option] for option in forwarded}
    return SyncBatchReplayOptimizer(workers, **replay_options)
|
||||
|
||||
|
||||
# QMIX trainer: the generic off-policy trainer specialised with the QMIX
# defaults, the torch QMIX policy and the batch-replay optimizer factory
# defined above.
QMixTrainer = GenericOffPolicyTrainer.with_updates(
    name="QMIX",
    default_config=DEFAULT_CONFIG,
    default_policy=QMixTorchPolicy,
    make_policy_optimizer=make_sync_batch_optimizer)
|
||||
@@ -1,450 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from gym.spaces import Tuple, Discrete, Dict
|
||||
import logging
|
||||
import numpy as np
|
||||
import torch as th
|
||||
import torch.nn as nn
|
||||
from torch.optim import RMSprop
|
||||
from torch.distributions import Categorical
|
||||
|
||||
import ray
|
||||
from ray.rllib.agents.qmix.mixers import VDNMixer, QMixer
|
||||
from ray.rllib.agents.qmix.model import RNNModel, _get_size
|
||||
from ray.rllib.evaluation.metrics import LEARNER_STATS_KEY
|
||||
from ray.rllib.policy.policy import Policy, TupleActions
|
||||
from ray.rllib.policy.rnn_sequencing import chop_into_sequences
|
||||
from ray.rllib.policy.sample_batch import SampleBatch
|
||||
from ray.rllib.models.catalog import ModelCatalog
|
||||
from ray.rllib.models.model import _unpack_obs
|
||||
from ray.rllib.env.constants import GROUP_REWARDS
|
||||
from ray.rllib.utils.annotations import override
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class QMixLoss(nn.Module):
    """One-step Q-learning loss for QMIX over padded episode batches.

    Holds references to the live and target agent models and the live and
    target mixers (which may both be None for independent learners).
    """

    def __init__(self,
                 model,
                 target_model,
                 mixer,
                 target_mixer,
                 n_agents,
                 n_actions,
                 double_q=True,
                 gamma=0.99):
        nn.Module.__init__(self)
        self.model = model
        self.target_model = target_model
        self.mixer = mixer
        self.target_mixer = target_mixer
        self.n_agents = n_agents
        self.n_actions = n_actions
        self.double_q = double_q
        self.gamma = gamma

    def forward(self, rewards, actions, terminated, mask, obs, next_obs,
                action_mask, next_action_mask):
        """Forward pass of the loss.

        Arguments:
            rewards: Tensor of shape [B, T, n_agents]
            actions: Tensor of shape [B, T, n_agents]
            terminated: Tensor of shape [B, T, n_agents]
            mask: Tensor of shape [B, T, n_agents]
            obs: Tensor of shape [B, T, n_agents, obs_size]
            next_obs: Tensor of shape [B, T, n_agents, obs_size]
            action_mask: Tensor of shape [B, T, n_agents, n_actions]
            next_action_mask: Tensor of shape [B, T, n_agents, n_actions]

        Returns:
            Tuple ``(loss, mask, masked_td_error, chosen_action_qvals,
            targets)`` where ``mask`` has been expanded to the shape of the
            TD error.

        Raises:
            AssertionError: if a selected target Q-value is -inf, i.e. some
                unpadded step has no valid action.
        """

        B, T = obs.size(0), obs.size(1)

        # Calculate estimated Q-Values
        mac_out = []
        h = [
            s.expand([B, self.n_agents, -1])
            for s in self.model.get_initial_state()
        ]
        for t in range(T):
            q, h = _mac(self.model, obs[:, t], h)
            mac_out.append(q)
        mac_out = th.stack(mac_out, dim=1)  # Concat over time

        # Pick the Q-Values for the actions taken -> [B, T, n_agents]
        chosen_action_qvals = th.gather(
            mac_out, dim=3, index=actions.unsqueeze(3)).squeeze(3)

        # Calculate the Q-Values necessary for the target
        target_mac_out = []
        target_h = [
            s.expand([B, self.n_agents, -1])
            for s in self.target_model.get_initial_state()
        ]
        for t in range(T):
            target_q, target_h = _mac(self.target_model, next_obs[:, t],
                                      target_h)
            target_mac_out.append(target_q)
        target_mac_out = th.stack(target_mac_out, dim=1)  # Concat across time

        # Mask out unavailable actions (only on unpadded steps, so padded
        # steps never produce -inf targets)
        ignore_action = (next_action_mask == 0) & (mask == 1).unsqueeze(-1)
        target_mac_out[ignore_action] = -np.inf

        # Max over target Q-Values
        if self.double_q:
            # Get actions that maximise live Q (for double q-learning)
            ignore_action = (action_mask == 0) & (mask == 1).unsqueeze(-1)
            # Clone before the in-place masking below so the tensor used by
            # the gather above is left untouched.
            mac_out = mac_out.clone()  # issue 4742
            mac_out[ignore_action] = -np.inf
            cur_max_actions = mac_out.max(dim=3, keepdim=True)[1]
            target_max_qvals = th.gather(target_mac_out, 3,
                                         cur_max_actions).squeeze(3)
        else:
            target_max_qvals = target_mac_out.max(dim=3)[0]

        # The message is built by implicit concatenation so that source
        # indentation does not end up inside the error text.
        assert target_max_qvals.min().item() != -np.inf, (
            "target_max_qvals contains a masked action; "
            "there may be a state with no valid actions.")

        # Mix
        if self.mixer is not None:
            # TODO(ekl) add support for handling global state? This is just
            # treating the stacked agent obs as the state.
            chosen_action_qvals = self.mixer(chosen_action_qvals, obs)
            target_max_qvals = self.target_mixer(target_max_qvals, next_obs)

        # Calculate 1-step Q-Learning targets
        targets = rewards + self.gamma * (1 - terminated) * target_max_qvals

        # Td-error
        td_error = (chosen_action_qvals - targets.detach())

        mask = mask.expand_as(td_error)

        # 0-out the targets that came from padded data
        masked_td_error = td_error * mask

        # Normal L2 loss, take mean over actual data
        loss = (masked_td_error**2).sum() / mask.sum()
        return loss, mask, masked_td_error, chosen_action_qvals, targets
|
||||
|
||||
|
||||
class QMixTorchPolicy(Policy):
    """QMix impl. Assumes homogeneous agents for now.

    You must use MultiAgentEnv.with_agent_groups() to group agents
    together for QMix. This creates the proper Tuple obs/action spaces and
    populates the '_group_rewards' info field.

    Action masking: to specify an action mask for individual agents, use a
    dict space with an action_mask key, e.g. {"obs": ob, "action_mask": mask}.
    The mask space must be `Box(0, 1, (n_actions,))`.
    """

    def __init__(self, obs_space, action_space, config):
        """Build the agent models, the optional mixers and the optimizer.

        Raises:
            ValueError: if the spaces are not grouped Tuple spaces of
                homogeneous agents, if a Dict agent obs space does not have
                exactly the keys "obs" and "action_mask", if the action mask
                shape is wrong, or if the mixer type is unknown.
        """
        _validate(obs_space, action_space)
        # User-supplied values override the QMIX defaults.
        config = dict(ray.rllib.agents.qmix.qmix.DEFAULT_CONFIG, **config)
        self.config = config
        self.observation_space = obs_space
        self.action_space = action_space
        self.n_agents = len(obs_space.original_space.spaces)
        self.n_actions = action_space.spaces[0].n
        self.h_size = config["model"]["lstm_cell_size"]

        agent_obs_space = obs_space.original_space.spaces[0]
        if isinstance(agent_obs_space, Dict):
            space_keys = set(agent_obs_space.spaces.keys())
            if space_keys != {"obs", "action_mask"}:
                raise ValueError(
                    "Dict obs space for agent must have keyset "
                    "['obs', 'action_mask'], got {}".format(space_keys))
            mask_shape = tuple(agent_obs_space.spaces["action_mask"].shape)
            if mask_shape != (self.n_actions, ):
                raise ValueError("Action mask shape must be {}, got {}".format(
                    (self.n_actions, ), mask_shape))
            self.has_action_mask = True
            self.obs_size = _get_size(agent_obs_space.spaces["obs"])
            # The real agent obs space is nested inside the dict
            agent_obs_space = agent_obs_space.spaces["obs"]
        else:
            self.has_action_mask = False
            self.obs_size = _get_size(agent_obs_space)

        self.model = ModelCatalog.get_model_v2(
            agent_obs_space,
            action_space.spaces[0],
            self.n_actions,
            config["model"],
            framework="torch",
            name="model",
            default_model=RNNModel)

        self.target_model = ModelCatalog.get_model_v2(
            agent_obs_space,
            action_space.spaces[0],
            self.n_actions,
            config["model"],
            framework="torch",
            name="target_model",
            default_model=RNNModel)

        # Setup the mixer network.
        # The global state is just the stacked agent observations for now.
        self.state_shape = [self.obs_size, self.n_agents]
        if config["mixer"] is None:
            self.mixer = None
            self.target_mixer = None
        elif config["mixer"] == "qmix":
            self.mixer = QMixer(self.n_agents, self.state_shape,
                                config["mixing_embed_dim"])
            self.target_mixer = QMixer(self.n_agents, self.state_shape,
                                       config["mixing_embed_dim"])
        elif config["mixer"] == "vdn":
            self.mixer = VDNMixer()
            self.target_mixer = VDNMixer()
        else:
            raise ValueError("Unknown mixer type {}".format(config["mixer"]))

        self.cur_epsilon = 1.0
        self.update_target()  # initial sync

        # Setup optimizer
        self.params = list(self.model.parameters())
        if self.mixer:
            self.params += list(self.mixer.parameters())
        self.loss = QMixLoss(self.model, self.target_model, self.mixer,
                             self.target_mixer, self.n_agents, self.n_actions,
                             self.config["double_q"], self.config["gamma"])
        self.optimiser = RMSprop(
            params=self.params,
            lr=config["lr"],
            alpha=config["optim_alpha"],
            eps=config["optim_eps"])

    @override(Policy)
    def compute_actions(self,
                        obs_batch,
                        state_batches=None,
                        prev_action_batch=None,
                        prev_reward_batch=None,
                        info_batch=None,
                        episodes=None,
                        **kwargs):
        """Pick epsilon-greedy actions for every agent of every group.

        Returns:
            Tuple of (TupleActions with one action array per agent, list of
            new hidden states as numpy arrays, empty info dict).
        """
        obs_batch, action_mask = self._unpack_observation(obs_batch)

        # Compute actions
        with th.no_grad():
            q_values, hiddens = _mac(
                self.model, th.from_numpy(obs_batch),
                [th.from_numpy(np.array(s)) for s in state_batches])
            avail = th.from_numpy(action_mask).float()
            masked_q_values = q_values.clone()
            masked_q_values[avail == 0.0] = -float("inf")
            # epsilon-greedy action selector
            random_numbers = th.rand_like(q_values[:, :, 0])
            pick_random = (random_numbers < self.cur_epsilon).long()
            # Random actions are drawn only among the available ones.
            random_actions = Categorical(avail).sample().long()
            actions = (pick_random * random_actions +
                       (1 - pick_random) * masked_q_values.max(dim=2)[1])
            actions = actions.numpy()
            hiddens = [s.numpy() for s in hiddens]

        return TupleActions(list(actions.transpose([1, 0]))), hiddens, {}

    @override(Policy)
    def learn_on_batch(self, samples):
        """Run one optimisation step on ``samples`` and return its stats."""
        obs_batch, action_mask = self._unpack_observation(
            samples[SampleBatch.CUR_OBS])
        next_obs_batch, next_action_mask = self._unpack_observation(
            samples[SampleBatch.NEXT_OBS])
        group_rewards = self._get_group_rewards(samples[SampleBatch.INFOS])

        # These will be padded to shape [B * T, ...]
        [rew, action_mask, next_action_mask, act, dones, obs, next_obs], \
            initial_states, seq_lens = \
            chop_into_sequences(
                samples[SampleBatch.EPS_ID],
                samples[SampleBatch.UNROLL_ID],
                samples[SampleBatch.AGENT_INDEX], [
                    group_rewards, action_mask, next_action_mask,
                    samples[SampleBatch.ACTIONS], samples[SampleBatch.DONES],
                    obs_batch, next_obs_batch
                ],
                [samples["state_in_{}".format(k)]
                 for k in range(len(self.get_initial_state()))],
                max_seq_len=self.config["model"]["max_seq_len"],
                dynamic_max=True)
        B, T = len(seq_lens), max(seq_lens)

        def to_batches(arr):
            # Split the leading [B * T] axis into separate [B, T] axes.
            new_shape = [B, T] + list(arr.shape[1:])
            return th.from_numpy(np.reshape(arr, new_shape))

        rewards = to_batches(rew).float()
        actions = to_batches(act).long()
        obs = to_batches(obs).reshape([B, T, self.n_agents,
                                       self.obs_size]).float()
        action_mask = to_batches(action_mask)
        next_obs = to_batches(next_obs).reshape(
            [B, T, self.n_agents, self.obs_size]).float()
        next_action_mask = to_batches(next_action_mask)

        # TODO(ekl) this treats group termination as individual termination
        terminated = to_batches(dones.astype(np.float32)).unsqueeze(2).expand(
            B, T, self.n_agents)

        # Create mask for where index is < unpadded sequence length
        filled = (np.reshape(np.tile(np.arange(T), B), [B, T]) <
                  np.expand_dims(seq_lens, 1)).astype(np.float32)
        mask = th.from_numpy(filled).unsqueeze(2).expand(B, T, self.n_agents)

        # Compute loss
        loss_out, mask, masked_td_error, chosen_action_qvals, targets = \
            self.loss(rewards, actions, terminated, mask, obs,
                      next_obs, action_mask, next_action_mask)

        # Optimise
        self.optimiser.zero_grad()
        loss_out.backward()
        # "grad_norm_clipping" is documented as optional. With None, an
        # infinite max norm makes clip_grad_norm_ only measure the gradient
        # norm, so the "grad_norm" stat below is still reported.
        max_grad_norm = self.config["grad_norm_clipping"]
        if max_grad_norm is None:
            max_grad_norm = float("inf")
        grad_norm = th.nn.utils.clip_grad_norm_(self.params, max_grad_norm)
        self.optimiser.step()

        mask_elems = mask.sum().item()
        stats = {
            "loss": loss_out.item(),
            # clip_grad_norm_ may return a float or a tensor.
            "grad_norm": grad_norm
            if isinstance(grad_norm, float) else grad_norm.item(),
            "td_error_abs": masked_td_error.abs().sum().item() / mask_elems,
            "q_taken_mean": (chosen_action_qvals * mask).sum().item() /
            mask_elems,
            "target_mean": (targets * mask).sum().item() / mask_elems,
        }
        return {LEARNER_STATS_KEY: stats}

    @override(Policy)
    def get_initial_state(self):
        """Return the model's initial state repeated for every agent."""
        return [
            s.expand([self.n_agents, -1]).numpy()
            for s in self.model.get_initial_state()
        ]

    @override(Policy)
    def get_weights(self):
        """Return the live agent model's weights (mixer not included)."""
        return {"model": self.model.state_dict()}

    @override(Policy)
    def set_weights(self, weights):
        """Load the live agent model's weights from ``weights["model"]``."""
        self.model.load_state_dict(weights["model"])

    @override(Policy)
    def get_state(self):
        """Return all network state dicts plus the current epsilon."""
        return {
            "model": self.model.state_dict(),
            "target_model": self.target_model.state_dict(),
            "mixer": self.mixer.state_dict() if self.mixer else None,
            "target_mixer": self.target_mixer.state_dict()
            if self.mixer else None,
            "cur_epsilon": self.cur_epsilon,
        }

    @override(Policy)
    def set_state(self, state):
        """Restore networks and epsilon from a ``get_state`` result.

        Note that the target networks are re-synchronised from the live
        networks at the end.
        """
        self.model.load_state_dict(state["model"])
        self.target_model.load_state_dict(state["target_model"])
        if state["mixer"] is not None:
            self.mixer.load_state_dict(state["mixer"])
            self.target_mixer.load_state_dict(state["target_mixer"])
        self.set_epsilon(state["cur_epsilon"])
        self.update_target()

    def update_target(self):
        """Copy the live model (and mixer, if any) into the targets."""
        self.target_model.load_state_dict(self.model.state_dict())
        if self.mixer is not None:
            self.target_mixer.load_state_dict(self.mixer.state_dict())
        logger.debug("Updated target networks")

    def set_epsilon(self, epsilon):
        """Set the probability of picking a random action."""
        self.cur_epsilon = epsilon

    def _get_group_rewards(self, info_batch):
        """Collect per-agent rewards from the info dicts.

        Infos without the group-rewards key contribute zeros.
        """
        group_rewards = np.array([
            info.get(GROUP_REWARDS, [0.0] * self.n_agents)
            for info in info_batch
        ])
        return group_rewards

    def _unpack_observation(self, obs_batch):
        """Unpacks the action mask / tuple obs from agent grouping.

        Returns:
            obs (np.ndarray): flattened obs array of shape
                [B, n_agents, obs_size]
            mask (np.ndarray): action mask of shape
                [B, n_agents, n_actions]; all ones when the agent obs space
                carries no action mask
        """
        unpacked = _unpack_obs(
            np.array(obs_batch),
            self.observation_space.original_space,
            tensorlib=np)
        if self.has_action_mask:
            obs = np.concatenate(
                [o["obs"] for o in unpacked],
                axis=1).reshape([len(obs_batch), self.n_agents, self.obs_size])
            action_mask = np.concatenate(
                [o["action_mask"] for o in unpacked], axis=1).reshape(
                    [len(obs_batch), self.n_agents, self.n_actions])
        else:
            obs = np.concatenate(
                unpacked,
                axis=1).reshape([len(obs_batch), self.n_agents, self.obs_size])
            action_mask = np.ones(
                [len(obs_batch), self.n_agents, self.n_actions])
        return obs, action_mask
|
||||
|
||||
|
||||
def _validate(obs_space, action_space):
    """Check that the spaces describe a group of homogeneous agents.

    Raises:
        ValueError: if the original obs space or the action space is not a
            Tuple, if the first agent action space is not Discrete, or if
            the per-agent obs or action spaces differ from one another.
    """
    has_tuple_obs = (hasattr(obs_space, "original_space")
                     and isinstance(obs_space.original_space, Tuple))
    if not has_tuple_obs:
        raise ValueError("Obs space must be a Tuple, got {}. Use ".format(
            obs_space) + "MultiAgentEnv.with_agent_groups() to group related "
                         "agents for QMix.")

    if not isinstance(action_space, Tuple):
        raise ValueError(
            "Action space must be a Tuple, got {}. ".format(action_space) +
            "Use MultiAgentEnv.with_agent_groups() to group related "
            "agents for QMix.")

    first_action_space = action_space.spaces[0]
    if not isinstance(first_action_space, Discrete):
        raise ValueError(
            "QMix requires a discrete action space, got {}".format(
                first_action_space))

    # Spaces are compared through their string form; more than one distinct
    # string means the agents are not homogeneous.
    agent_obs_spaces = obs_space.original_space.spaces
    distinct_obs = {str(space) for space in agent_obs_spaces}
    if len(distinct_obs) > 1:
        raise ValueError(
            "Implementation limitation: observations of grouped agents "
            "must be homogeneous, got {}".format(agent_obs_spaces))

    distinct_actions = {str(space) for space in action_space.spaces}
    if len(distinct_actions) > 1:
        raise ValueError(
            "Implementation limitation: action space of grouped agents "
            "must be homogeneous, got {}".format(action_space.spaces))
|
||||
|
||||
|
||||
def _mac(model, obs, h):
|
||||
"""Forward pass of the multi-agent controller.
|
||||
|
||||
Arguments:
|
||||
model: TorchModelV2 class
|
||||
obs: Tensor of shape [B, n_agents, obs_size]
|
||||
h: List of tensors of shape [B, n_agents, h_size]
|
||||
|
||||
Returns:
|
||||
q_vals: Tensor of shape [B, n_agents, n_actions]
|
||||
h: Tensor of shape [B, n_agents, h_size]
|
||||
"""
|
||||
B, n_agents = obs.size(0), obs.size(1)
|
||||
obs_flat = obs.reshape([B * n_agents, -1])
|
||||
h_flat = [s.reshape([B * n_agents, -1]) for s in h]
|
||||
q_flat, h_flat = model({"obs": obs_flat}, h_flat, None)
|
||||
return q_flat.reshape(
|
||||
[B, n_agents, -1]), [s.reshape([B, n_agents, -1]) for s in h_flat]
|
||||
@@ -1,152 +0,0 @@
|
||||
"""Registry of algorithm names for `rllib train --run=<alg_name>`"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import traceback
|
||||
|
||||
from ray.rllib.contrib.registry import CONTRIBUTED_ALGORITHMS
|
||||
|
||||
|
||||
def _import_sac():
    """Import the sac agent package on demand and return SACTrainer."""
    from ray.rllib.agents import sac as sac_agents

    trainer_cls = sac_agents.SACTrainer
    return trainer_cls
|
||||
|
||||
|
||||
def _import_appo():
    """Import the ppo agent package on demand and return APPOTrainer."""
    from ray.rllib.agents import ppo as ppo_agents

    trainer_cls = ppo_agents.APPOTrainer
    return trainer_cls
|
||||
|
||||
|
||||
def _import_qmix():
    """Import the qmix agent package on demand and return QMixTrainer."""
    from ray.rllib.agents import qmix as qmix_agents

    trainer_cls = qmix_agents.QMixTrainer
    return trainer_cls
|
||||
|
||||
|
||||
def _import_apex_qmix():
    """Import the qmix agent package on demand and return ApexQMixTrainer."""
    from ray.rllib.agents import qmix as qmix_agents

    trainer_cls = qmix_agents.ApexQMixTrainer
    return trainer_cls
|
||||
|
||||
|
||||
def _import_ddpg():
    """Import the ddpg agent package on demand and return DDPGTrainer."""
    from ray.rllib.agents import ddpg as ddpg_agents

    trainer_cls = ddpg_agents.DDPGTrainer
    return trainer_cls
|
||||
|
||||
|
||||
def _import_apex_ddpg():
    """Import the ddpg agent package on demand and return ApexDDPGTrainer."""
    from ray.rllib.agents import ddpg as ddpg_agents

    trainer_cls = ddpg_agents.ApexDDPGTrainer
    return trainer_cls
|
||||
|
||||
|
||||
def _import_td3():
    """Import the ddpg agent package on demand and return TD3Trainer."""
    from ray.rllib.agents import ddpg as ddpg_agents

    trainer_cls = ddpg_agents.TD3Trainer
    return trainer_cls
|
||||
|
||||
|
||||
def _import_ppo():
    """Import the ppo agent package on demand and return PPOTrainer."""
    from ray.rllib.agents import ppo as ppo_agents

    trainer_cls = ppo_agents.PPOTrainer
    return trainer_cls
|
||||
|
||||
|
||||
def _import_es():
    """Import the es agent package on demand and return ESTrainer."""
    from ray.rllib.agents import es as es_agents

    trainer_cls = es_agents.ESTrainer
    return trainer_cls
|
||||
|
||||
|
||||
def _import_ars():
    """Import the ars agent package on demand and return ARSTrainer."""
    from ray.rllib.agents import ars as ars_agents

    trainer_cls = ars_agents.ARSTrainer
    return trainer_cls
|
||||
|
||||
|
||||
def _import_dqn():
    """Import the dqn agent package on demand and return DQNTrainer."""
    from ray.rllib.agents import dqn as dqn_agents

    trainer_cls = dqn_agents.DQNTrainer
    return trainer_cls
|
||||
|
||||
|
||||
def _import_simple_q():
    """Import the dqn agent package on demand and return SimpleQTrainer."""
    from ray.rllib.agents import dqn as dqn_agents

    trainer_cls = dqn_agents.SimpleQTrainer
    return trainer_cls
|
||||
|
||||
|
||||
def _import_apex():
    """Import the dqn agent package on demand and return ApexTrainer."""
    from ray.rllib.agents import dqn as dqn_agents

    trainer_cls = dqn_agents.ApexTrainer
    return trainer_cls
|
||||
|
||||
|
||||
def _import_a3c():
    """Import the a3c agent package on demand and return A3CTrainer."""
    from ray.rllib.agents import a3c as a3c_agents

    trainer_cls = a3c_agents.A3CTrainer
    return trainer_cls
|
||||
|
||||
|
||||
def _import_a2c():
    """Import the a3c agent package on demand and return A2CTrainer."""
    from ray.rllib.agents import a3c as a3c_agents

    trainer_cls = a3c_agents.A2CTrainer
    return trainer_cls
|
||||
|
||||
|
||||
def _import_pg():
    """Import the pg agent package on demand and return PGTrainer."""
    from ray.rllib.agents import pg as pg_agents

    trainer_cls = pg_agents.PGTrainer
    return trainer_cls
|
||||
|
||||
|
||||
def _import_impala():
    """Import the impala agent package on demand and return ImpalaTrainer."""
    from ray.rllib.agents import impala as impala_agents

    trainer_cls = impala_agents.ImpalaTrainer
    return trainer_cls
|
||||
|
||||
|
||||
def _import_marwil():
    """Import the marwil agent package on demand and return MARWILTrainer."""
    from ray.rllib.agents import marwil as marwil_agents

    trainer_cls = marwil_agents.MARWILTrainer
    return trainer_cls
|
||||
|
||||
|
||||
# Algorithm name -> zero-argument loader returning the trainer class. The
# loaders defer the agent imports until an algorithm is actually requested.
ALGORITHMS = dict([
    ("SAC", _import_sac),
    ("DDPG", _import_ddpg),
    ("APEX_DDPG", _import_apex_ddpg),
    ("TD3", _import_td3),
    ("PPO", _import_ppo),
    ("ES", _import_es),
    ("ARS", _import_ars),
    ("DQN", _import_dqn),
    ("SimpleQ", _import_simple_q),
    ("APEX", _import_apex),
    ("A3C", _import_a3c),
    ("A2C", _import_a2c),
    ("PG", _import_pg),
    ("IMPALA", _import_impala),
    ("QMIX", _import_qmix),
    ("APEX_QMIX", _import_apex_qmix),
    ("APPO", _import_appo),
    ("MARWIL", _import_marwil),
])
|
||||
|
||||
|
||||
def get_agent_class(alg):
    """Returns the class of a known agent given its name.

    If resolving the agent raises ImportError, the result of
    `_agent_import_failed` (called with the formatted traceback) is
    returned instead of propagating the error.
    """
    try:
        agent_cls = _get_agent_class(alg)
    except ImportError:
        from ray.rllib.agents.mock import _agent_import_failed

        # format_exc() must run inside the handler to see the active error.
        agent_cls = _agent_import_failed(traceback.format_exc())
    return agent_cls
|
||||
|
||||
|
||||
def _get_agent_class(alg):
    """Resolve an algorithm name to its trainer class.

    Lookup order: built-in ALGORITHMS, CONTRIBUTED_ALGORITHMS, then the
    special names "script", "__fake", "__sigmoid_fake_data" and
    "__parameter_tuning".

    Raises:
        Exception: if the name matches none of the above.
    """
    if alg in ALGORITHMS:
        return ALGORITHMS[alg]()

    if alg in CONTRIBUTED_ALGORITHMS:
        return CONTRIBUTED_ALGORITHMS[alg]()

    if alg == "script":
        from ray.tune import script_runner
        return script_runner.ScriptRunner

    if alg == "__fake":
        from ray.rllib.agents.mock import _MockTrainer
        return _MockTrainer

    if alg == "__sigmoid_fake_data":
        from ray.rllib.agents.mock import _SigmoidFakeData
        return _SigmoidFakeData

    if alg == "__parameter_tuning":
        from ray.rllib.agents.mock import _ParameterTuningTrainer
        return _ParameterTuningTrainer

    raise Exception(("Unknown algorithm {}.").format(alg))
|
||||
@@ -1 +0,0 @@
|
||||
Implementation of Soft Actor-Critic (https://arxiv.org/abs/1812.05905).
|
||||
@@ -1,13 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from ray.rllib.agents.sac.sac import SACTrainer, DEFAULT_CONFIG
|
||||
from ray.rllib.utils import renamed_agent
|
||||
|
||||
# Alias built from SACTrainer via renamed_agent(); presumably kept so the
# older "*Agent" naming still resolves -- confirm against
# ray.rllib.utils.renamed_agent.
SACAgent = renamed_agent(SACTrainer)

# Names exported by `from ray.rllib.agents.sac import *`. Note that the
# SACAgent alias above is not listed here.
__all__ = [
    "SACTrainer",
    "DEFAULT_CONFIG",
]
|
||||
@@ -1,119 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from ray.rllib.agents.trainer import with_common_config
|
||||
from ray.rllib.agents.dqn.dqn import GenericOffPolicyTrainer
|
||||
from ray.rllib.agents.sac.sac_policy import SACTFPolicy
|
||||
|
||||
# Config keys presumably shared between the trainer and its policy
# optimizer. NOTE(review): nothing in the visible part of this module reads
# this list -- confirm it is still used.
OPTIMIZER_SHARED_CONFIGS = [
    "buffer_size", "prioritized_replay", "prioritized_replay_alpha",
    "prioritized_replay_beta", "prioritized_replay_eps", "sample_batch_size",
    "train_batch_size", "learning_starts"
]
|
||||
|
||||
# yapf: disable
|
||||
# __sphinx_doc_begin__
|
||||
DEFAULT_CONFIG = with_common_config({
    # === Model ===
    # Build a second ("twin") Q network next to the main one.
    "twin_q": True,
    # If False, build_sac_model uses NoopModel and feeds the flattened
    # observation straight to the heads; if True the model catalog picks
    # the model. Forced to True when a custom model is configured.
    "use_state_preprocessor": False,
    # NOTE(review): not read by the SAC policy code visible alongside this
    # config -- confirm whether this key is still used.
    "policy": "GaussianLatentSpacePolicy",
    # RLlib model options for the Q function
    "Q_model": {
        "hidden_activation": "relu",
        "hidden_layer_sizes": (256, 256),
    },
    # RLlib model options for the policy function
    "policy_model": {
        "hidden_activation": "relu",
        "hidden_layer_sizes": (256, 256),
    },

    # === Learning ===
    # Update the target by \tau * policy + (1-\tau) * target_policy
    "tau": 5e-3,
    # Target entropy lower bound. This is the inverse of reward scale,
    # and will be optimized automatically. With "auto", the loss uses
    # -prod(action_space.shape).
    "target_entropy": "auto",
    # Disable setting done=True at end of episode.
    "no_done_at_end": True,
    # N-step target updates. The SAC loss currently asserts n_step == 1.
    "n_step": 1,

    # === Evaluation ===
    # The evaluation stats will be reported under the "evaluation" metric key.
    "evaluation_interval": 1,
    # Number of episodes to run per evaluation period.
    "evaluation_num_episodes": 1,
    # Extra configuration that disables exploration.
    "evaluation_config": {
        "exploration_enabled": False,
    },

    # === Exploration ===
    # Number of env steps to optimize for before returning
    "timesteps_per_iteration": 1000,
    # Fed to the policy's `stochastic` placeholder: True samples actions,
    # False uses the deterministic (mean) action.
    "exploration_enabled": True,

    # === Replay buffer ===
    # Size of the replay buffer. Note that if async_updates is set, then
    # each worker will have a replay buffer of this size.
    "buffer_size": int(1e6),
    # If True prioritized replay buffer will be used.
    # TODO(hartikainen): Make sure this works or remove the option.
    "prioritized_replay": False,
    # Prioritized replay parameters; presumably only relevant when
    # prioritized_replay is True -- confirm against the replay optimizer.
    "prioritized_replay_alpha": 0.6,
    "prioritized_replay_beta": 0.4,
    "prioritized_replay_eps": 1e-6,
    "beta_annealing_fraction": 0.2,
    "final_prioritized_replay_beta": 0.4,
    "compress_observations": False,

    # === Optimization ===
    # Learning rates of the three separate Adam optimizers (actor, critic,
    # entropy coefficient).
    "optimization": {
        "actor_learning_rate": 3e-4,
        "critic_learning_rate": 3e-4,
        "entropy_learning_rate": 3e-4,
    },
    # If not None, clip gradients during optimization at this value
    "grad_norm_clipping": None,
    # How many steps of the model to sample before learning starts.
    "learning_starts": 1500,
    # Update the replay buffer with this many samples at once. Note that this
    # setting applies per-worker if num_workers > 1.
    "sample_batch_size": 1,
    # Size of a batch sampled from the replay buffer for training. Note that
    # if async_updates is set, then each worker returns gradients for a
    # batch of this size.
    "train_batch_size": 256,
    # Update the target network every `target_network_update_freq` steps.
    "target_network_update_freq": 0,

    # === Parallelism ===
    # Whether to use a GPU for local optimization.
    "num_gpus": 0,
    # Number of workers for collecting samples with. This only makes sense
    # to increase if your environment is particularly slow to sample, or if
    # you're using the Async or Ape-X optimizers.
    "num_workers": 0,
    # Whether to allocate GPUs for workers (if > 0).
    "num_gpus_per_worker": 0,
    # Whether to allocate CPUs for workers (if > 0).
    "num_cpus_per_worker": 1,
    # Whether to compute priorities on workers.
    "worker_side_prioritization": False,
    # Prevent iterations from going lower than this time span
    "min_iter_time_s": 1,

    # TODO(ekl) these are unused; remove them from sac config
    "per_worker_exploration": False,
    "exploration_fraction": 0.1,
    "schedule_max_timesteps": 100000,
    "exploration_final_eps": 0.02,
})
|
||||
# __sphinx_doc_end__
|
||||
# yapf: enable
|
||||
|
||||
# SAC trainer: the generic off-policy trainer with its name, default config
# and default policy overridden; everything else is inherited.
SACTrainer = GenericOffPolicyTrainer.with_updates(
    name="SAC", default_config=DEFAULT_CONFIG, default_policy=SACTFPolicy)
|
||||
@@ -1,232 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
|
||||
from ray.rllib.utils import try_import_tf, try_import_tfp
|
||||
|
||||
tf = try_import_tf()
|
||||
tfp = try_import_tfp()
|
||||
|
||||
SCALE_DIAG_MIN_MAX = (-20, 2)
|
||||
|
||||
|
||||
def SquashBijector():
    """Create and return a tanh bijector instance.

    The bijector class is defined inside this function so that `tfp` is
    only dereferenced when the function is called. Inside the body the name
    `SquashBijector` refers to the local class, not to this factory.
    """

    # lazy def since it depends on tfp
    class SquashBijector(tfp.bijectors.Bijector):
        def __init__(self, validate_args=False, name="tanh"):
            super(SquashBijector, self).__init__(
                forward_min_event_ndims=0,
                validate_args=validate_args,
                name=name)

        def _forward(self, x):
            # y = tanh(x)
            return tf.nn.tanh(x)

        def _inverse(self, y):
            # x = atanh(y)
            return tf.atanh(y)

        def _forward_log_det_jacobian(self, x):
            # Algebraically equal to log(1 - tanh(x)^2), expressed through
            # softplus instead of evaluating tanh directly.
            return 2. * (np.log(2.) - x - tf.nn.softplus(-2. * x))

    return SquashBijector()
|
||||
|
||||
|
||||
class SACModel(TFModelV2):
    """Extension of standard TFModel for SAC.

    Data flow:
        obs -> forward() -> model_out
        model_out -> get_policy_output() -> pi(s)
        model_out, actions -> get_q_values() -> Q(s, a)
        model_out, actions -> get_twin_q_values() -> Q_twin(s, a)

    Besides the policy and Q heads, the model owns the `log_alpha` variable
    and the derived `alpha = exp(log_alpha)` tensor.

    Note that this class by itself is not a valid model unless you
    implement forward() in a subclass."""

    def __init__(self,
                 obs_space,
                 action_space,
                 num_outputs,
                 model_config,
                 name,
                 actor_hidden_activation="relu",
                 actor_hiddens=(256, 256),
                 critic_hidden_activation="relu",
                 critic_hiddens=(256, 256),
                 twin_q=False):
        """Initialize variables of this model.

        Extra model kwargs:
            actor_hidden_activation (str): activation for actor network
            actor_hiddens (list): hidden layers sizes for actor network
            critic_hidden_activation (str): activation for critic network
            critic_hiddens (list): hidden layers sizes for critic network
            twin_q (bool): build twin Q networks

        Raises:
            ImportError: if tensorflow-probability could not be imported.

        Note that the core layers for forward() are not defined here, this
        only defines the layers for the output heads. Those layers for
        forward() should be defined in subclasses of SACModel.
        """

        if tfp is None:
            raise ImportError("tensorflow-probability package not found")

        super(SACModel, self).__init__(obs_space, action_space, num_outputs,
                                       model_config, name)

        self.action_dim = np.product(action_space.shape)
        # Symbolic inputs used only to wire up the head sub-models below.
        self.model_out = tf.keras.layers.Input(
            shape=(num_outputs, ), name="model_out")
        self.actions = tf.keras.layers.Input(
            shape=(self.action_dim, ), name="actions")

        # Actor MLP producing the raw distribution parameters.
        shift_and_log_scale_diag = tf.keras.Sequential([
            tf.keras.layers.Dense(
                units=hidden,
                activation=getattr(tf.nn, actor_hidden_activation),
                name="action_hidden_{}".format(i))
            for i, hidden in enumerate(actor_hiddens)
        ] + [
            tf.keras.layers.Dense(
                units=tfp.layers.MultivariateNormalTriL.params_size(
                    self.action_dim),
                activation=None,
                name="action_out")
        ])(self.model_out)

        # NOTE(review): the parameter vector is split into two equal halves,
        # which requires params_size(action_dim) to be even (it is for
        # action_dim == 1) -- confirm behaviour for larger action_dim.
        shift, log_scale_diag = tf.keras.layers.Lambda(
            lambda shift_and_log_scale_diag: tf.split(
                shift_and_log_scale_diag,
                num_or_size_splits=2,
                axis=-1)
        )(shift_and_log_scale_diag)

        # Clamp the second half to SCALE_DIAG_MIN_MAX.
        log_scale_diag = tf.keras.layers.Lambda(
            lambda log_sd: tf.clip_by_value(log_sd, *SCALE_DIAG_MIN_MAX))(
                log_scale_diag)

        shift_and_log_scale_diag = tf.keras.layers.Concatenate(axis=-1)(
            [shift, log_scale_diag])

        raw_action_distribution = tfp.layers.MultivariateNormalTriL(
            self.action_dim)(shift_and_log_scale_diag)

        # Wrap the raw distribution with the tanh bijector.
        action_distribution = tfp.layers.DistributionLambda(
            make_distribution_fn=SquashBijector())(raw_action_distribution)

        # TODO(hartikainen): Remove the unnecessary Model call here
        self.action_distribution_model = tf.keras.Model(
            self.model_out, action_distribution)

        self.register_variables(self.action_distribution_model.variables)

        def build_q_net(name, observations, actions):
            # MLP over concat([observations, actions]) with one linear output.
            q_net = tf.keras.Sequential([
                tf.keras.layers.Concatenate(axis=1),
            ] + [
                tf.keras.layers.Dense(
                    units=units,
                    activation=getattr(tf.nn, critic_hidden_activation),
                    name="{}_hidden_{}".format(name, i))
                for i, units in enumerate(critic_hiddens)
            ] + [
                tf.keras.layers.Dense(
                    units=1, activation=None, name="{}_out".format(name))
            ])

            # TODO(hartikainen): Remove the unnecessary Model call here
            q_net = tf.keras.Model([observations, actions],
                                   q_net([observations, actions]))
            return q_net

        self.q_net = build_q_net("q", self.model_out, self.actions)
        self.register_variables(self.q_net.variables)

        if twin_q:
            self.twin_q_net = build_q_net("twin_q", self.model_out,
                                          self.actions)
            self.register_variables(self.twin_q_net.variables)
        else:
            self.twin_q_net = None

        # Entropy coefficient, stored in log space; alpha is derived from it.
        self.log_alpha = tf.Variable(0.0, dtype=tf.float32, name="log_alpha")
        self.alpha = tf.exp(self.log_alpha)

        self.register_variables([self.log_alpha])

    def forward(self, input_dict, state, seq_lens):
        """This generates the model_out tensor input.

        You must implement this as documented in modelv2.py."""
        raise NotImplementedError

    def get_policy_output(self, model_out, deterministic=False):
        """Return actions (and their log-probabilities) from the policy head.

        Arguments:
            model_out (Tensor): obs embeddings from the model layers, of shape
                [BATCH_SIZE, num_outputs].
            deterministic (bool): if True, return the bijector applied to
                the mean of the underlying distribution instead of a sample.

        Returns:
            (actions, log_pis) tuple. `log_pis` is the log-probability of
            the sampled actions, or None when `deterministic` is True.
            NOTE(review): actions pass through the tanh bijector, so they
            are presumably bounded to (-1, 1) rather than unbounded --
            confirm.
        """
        action_distribution = self.action_distribution_model(model_out)
        if deterministic:
            actions = action_distribution.bijector(
                action_distribution.distribution.mean())
            log_pis = None
        else:
            actions = action_distribution.sample()
            log_pis = action_distribution.log_prob(actions)

        return actions, log_pis

    def get_q_values(self, model_out, actions):
        """Return the Q estimates for the most recent forward pass.

        This implements Q(s, a).

        Arguments:
            model_out (Tensor): obs embeddings from the model layers, of shape
                [BATCH_SIZE, num_outputs].
            actions (Tensor): action values that correspond with the most
                recent batch of observations passed through forward(), of shape
                [BATCH_SIZE, action_dim].

        Returns:
            output of the Q network, whose final layer is Dense(units=1);
            presumably of shape [BATCH_SIZE, 1] -- confirm.
        """
        return self.q_net([model_out, actions])

    def get_twin_q_values(self, model_out, actions):
        """Same as get_q_values but using the twin Q net.

        This implements the twin Q(s, a). Only usable when the model was
        built with twin_q=True (otherwise `twin_q_net` is None).

        Arguments:
            model_out (Tensor): obs embeddings from the model layers, of shape
                [BATCH_SIZE, num_outputs].
            actions (Tensor): action values that correspond with the most
                recent batch of observations passed through forward(), of shape
                [BATCH_SIZE, action_dim].

        Returns:
            output of the twin Q network, whose final layer is
            Dense(units=1); presumably of shape [BATCH_SIZE, 1] -- confirm.
        """
        return self.twin_q_net([model_out, actions])

    def policy_variables(self):
        """Return the list of variables for the policy net."""

        return list(self.action_distribution_model.variables)

    def q_variables(self):
        """Return the list of variables for Q / twin Q nets."""

        return self.q_net.variables + (self.twin_q_net.variables
                                       if self.twin_q_net else [])
|
||||
@@ -1,367 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from gym.spaces import Box
|
||||
import numpy as np
|
||||
import logging
|
||||
|
||||
import ray
|
||||
import ray.experimental.tf_utils
|
||||
from ray.rllib.agents.sac.sac_model import SACModel
|
||||
from ray.rllib.agents.ddpg.noop_model import NoopModel
|
||||
from ray.rllib.agents.dqn.dqn_policy import _postprocess_dqn, PRIO_WEIGHTS
|
||||
from ray.rllib.policy.sample_batch import SampleBatch
|
||||
from ray.rllib.policy.tf_policy_template import build_tf_policy
|
||||
from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.utils.error import UnsupportedSpaceException
|
||||
from ray.rllib.utils import try_import_tf, try_import_tfp
|
||||
from ray.rllib.utils.tf_ops import minimize_and_clip
|
||||
|
||||
tf = try_import_tf()
|
||||
tfp = try_import_tfp()
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def build_sac_model(policy, obs_space, action_space, config):
    """Create the SAC model and its target copy and attach both to policy.

    Side effects: may set config["use_state_preprocessor"] and
    config["model"]["no_final_linear"]; sets policy.model and
    policy.target_model.

    Raises:
        UnsupportedSpaceException: if the action space is not a Box or has
            more than one dimension.

    Returns:
        The newly created policy.model.
    """
    if config["model"]["custom_model"]:
        logger.warning(
            "Setting use_state_preprocessor=True since a custom model "
            "was specified.")
        config["use_state_preprocessor"] = True

    if not isinstance(action_space, Box):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for SAC.".format(action_space))

    if len(action_space.shape) > 1:
        raise UnsupportedSpaceException(
            "Action space has multiple dimensions "
            "{}. ".format(action_space.shape) +
            "Consider reshaping this into a single dimension, "
            "using a Tuple action space, or the multi-agent API.")

    if not config["use_state_preprocessor"]:
        # Pass the flattened observation through unchanged.
        default_model = NoopModel
        num_outputs = int(np.product(obs_space.shape))
    else:
        default_model = None  # catalog decides
        num_outputs = 256  # arbitrary
        config["model"]["no_final_linear"] = True

    def make_model(model_name):
        # Both networks are built from exactly the same settings.
        return ModelCatalog.get_model_v2(
            obs_space,
            action_space,
            num_outputs,
            config["model"],
            framework="tf",
            model_interface=SACModel,
            default_model=default_model,
            name=model_name,
            actor_hidden_activation=config["policy_model"][
                "hidden_activation"],
            actor_hiddens=config["policy_model"]["hidden_layer_sizes"],
            critic_hidden_activation=config["Q_model"]["hidden_activation"],
            critic_hiddens=config["Q_model"]["hidden_layer_sizes"],
            twin_q=config["twin_q"])

    # Creation order is kept: online model first, then the target model.
    policy.model = make_model("sac_model")
    policy.target_model = make_model("target_sac_model")

    return policy.model
|
||||
|
||||
|
||||
def postprocess_trajectory(policy,
                           sample_batch,
                           other_agent_batches=None,
                           episode=None):
    """Postprocess a sampled batch by delegating to the DQN helper.

    `other_agent_batches` and `episode` are accepted for signature
    compatibility but are not used.
    """
    return _postprocess_dqn(policy, sample_batch)
|
||||
|
||||
|
||||
def exploration_setting_inputs(policy):
    """Return the extra feed mapping used when computing actions.

    Maps the policy's `stochastic` placeholder to the configured
    "exploration_enabled" flag.
    """
    extra_feed = {}
    extra_feed[policy.stochastic] = policy.config["exploration_enabled"]
    return extra_feed
|
||||
|
||||
|
||||
def build_action_output(policy, model, input_dict, obs_space, action_space,
                        config):
    """Build the action-sampling ops for the policy.

    Both a stochastic and a deterministic action are built; the
    `policy.stochastic` placeholder selects between them at run time.

    Returns:
        (actions, action_probabilities). Note that the second element holds
        the log-probabilities of the sampled actions (zeros in the
        deterministic case), not probabilities. Also stores `actions` on
        `policy.output_actions`.
    """
    model_out, _ = model({
        "obs": input_dict[SampleBatch.CUR_OBS],
        "is_training": policy._get_is_training_placeholder(),
    }, [], None)

    def unsquash_actions(actions):
        # NOTE(review): get_policy_output() already returns actions passed
        # through the tanh bijector; applying a sigmoid on top of that looks
        # like a second squashing step -- confirm this is intended.
        # Use sigmoid to scale to [0,1], but also double magnitude of input to
        # emulate behaviour of tanh activation used in SAC and TD3 papers.
        sigmoid_out = tf.nn.sigmoid(2 * actions)
        # Rescale to actual env policy scale
        # (shape of sigmoid_out is [batch_size, dim_actions], so we reshape to
        # get same dims)
        action_range = (action_space.high - action_space.low)[None]
        low_action = action_space.low[None]
        unsquashed_actions = action_range * sigmoid_out + low_action

        return unsquashed_actions

    squashed_stochastic_actions, log_pis = policy.model.get_policy_output(
        model_out, deterministic=False)
    stochastic_actions = unsquash_actions(squashed_stochastic_actions)
    squashed_deterministic_actions, _ = policy.model.get_policy_output(
        model_out, deterministic=True)
    deterministic_actions = unsquash_actions(squashed_deterministic_actions)

    # Select the branch at run time based on the `stochastic` placeholder.
    actions = tf.cond(policy.stochastic, lambda: stochastic_actions,
                      lambda: deterministic_actions)

    action_probabilities = tf.cond(policy.stochastic, lambda: log_pis,
                                   lambda: tf.zeros_like(log_pis))
    policy.output_actions = actions
    return actions, action_probabilities
|
||||
|
||||
|
||||
def actor_critic_loss(policy, batch_tensors):
    """Build the SAC losses and return their sum.

    Three losses are built and stored on the policy for the gradients and
    stats functions: `critic_loss` (soft Bellman error of the Q nets),
    `actor_loss` and `alpha_loss` (entropy coefficient).

    Arguments:
        policy: the policy being built; provides model, target_model,
            config and action_space.
        batch_tensors: mapping with the CUR_OBS, NEXT_OBS, ACTIONS, REWARDS,
            DONES and PRIO_WEIGHTS tensors of the train batch.

    Returns:
        actor_loss + critic_loss + alpha_loss.
    """
    model_out_t, _ = policy.model({
        "obs": batch_tensors[SampleBatch.CUR_OBS],
        "is_training": policy._get_is_training_placeholder(),
    }, [], None)

    model_out_tp1, _ = policy.model({
        "obs": batch_tensors[SampleBatch.NEXT_OBS],
        "is_training": policy._get_is_training_placeholder(),
    }, [], None)

    target_model_out_tp1, _ = policy.target_model({
        "obs": batch_tensors[SampleBatch.NEXT_OBS],
        "is_training": policy._get_is_training_placeholder(),
    }, [], None)

    # TODO(hartikainen): figure actions and log pis
    policy_t, log_pis_t = policy.model.get_policy_output(model_out_t)
    policy_tp1, log_pis_tp1 = policy.model.get_policy_output(model_out_tp1)

    log_alpha = policy.model.log_alpha
    alpha = policy.model.alpha

    # q network evaluation
    q_t = policy.model.get_q_values(model_out_t,
                                    batch_tensors[SampleBatch.ACTIONS])
    if policy.config["twin_q"]:
        twin_q_t = policy.model.get_twin_q_values(
            model_out_t, batch_tensors[SampleBatch.ACTIONS])

    # Q-values of actions sampled from the current policy in the current
    # state (used by the actor loss)
    q_t_det_policy = policy.model.get_q_values(model_out_t, policy_t)

    # target q network evaluation
    q_tp1 = policy.target_model.get_q_values(target_model_out_tp1, policy_tp1)
    if policy.config["twin_q"]:
        twin_q_tp1 = policy.target_model.get_twin_q_values(
            target_model_out_tp1, policy_tp1)

    q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1)
    if policy.config["twin_q"]:
        twin_q_t_selected = tf.squeeze(twin_q_t, axis=len(q_t.shape) - 1)
        q_tp1 = tf.minimum(q_tp1, twin_q_tp1)

    # Soft value of the next state: the entropy term must use the
    # log-probability of the next-state action (policy_tp1), which is the
    # action q_tp1 was evaluated on.
    q_tp1 -= tf.expand_dims(alpha * log_pis_tp1, 1)

    q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1)
    q_tp1_best_masked = (1.0 - tf.cast(batch_tensors[SampleBatch.DONES],
                                       tf.float32)) * q_tp1_best

    assert policy.config["n_step"] == 1, "TODO(hartikainen) n_step > 1"

    # compute RHS of bellman equation
    q_t_selected_target = tf.stop_gradient(
        batch_tensors[SampleBatch.REWARDS] +
        policy.config["gamma"]**policy.config["n_step"] * q_tp1_best_masked)

    # compute the error (potentially clipped)
    if policy.config["twin_q"]:
        base_td_error = q_t_selected - q_t_selected_target
        twin_td_error = twin_q_t_selected - q_t_selected_target
        # The reported TD error stays the sum of both heads' errors ...
        td_error = base_td_error + twin_td_error
        # ... but each head is regressed on its own squared error.
        errors = 0.5 * (tf.square(base_td_error) + tf.square(twin_td_error))
    else:
        td_error = q_t_selected - q_t_selected_target
        errors = 0.5 * tf.square(td_error)

    critic_loss = policy.model.custom_loss(
        tf.reduce_mean(batch_tensors[PRIO_WEIGHTS] * errors), batch_tensors)
    actor_loss = tf.reduce_mean(alpha * log_pis_t - q_t_det_policy)

    target_entropy = (-np.prod(policy.action_space.shape)
                      if policy.config["target_entropy"] == "auto" else
                      policy.config["target_entropy"])
    alpha_loss = -tf.reduce_mean(
        log_alpha * tf.stop_gradient(log_pis_t + target_entropy))

    # save for stats function
    policy.q_t = q_t
    policy.td_error = td_error
    policy.actor_loss = actor_loss
    policy.critic_loss = critic_loss
    policy.alpha_loss = alpha_loss

    # in a custom apply op we handle the losses separately, but return them
    # combined in one loss for now
    return actor_loss + critic_loss + alpha_loss
|
||||
|
||||
|
||||
def gradients(policy, optimizer, loss):
    """Compute per-loss gradients with the dedicated SAC optimizers.

    The `optimizer` and `loss` arguments are ignored; the actor, critic and
    alpha losses stored on the policy are each differentiated with their
    own optimizer and variable list. Gradients are clipped when
    config["grad_norm_clipping"] is set.

    Returns:
        Concatenated list of (gradient, variable) pairs with None gradients
        removed. The per-loss lists are also stored on the policy.
    """
    clip_val = policy.config["grad_norm_clipping"]
    # The alpha loss depends on the `log_alpha` variable. Differentiating
    # w.r.t. the derived `alpha` tensor yields no gradient, which would
    # leave the entropy coefficient untrained.
    alpha_vars = [policy.model.log_alpha]

    if clip_val is not None:
        actor_grads_and_vars = minimize_and_clip(
            policy._actor_optimizer,
            policy.actor_loss,
            var_list=policy.model.policy_variables(),
            clip_val=clip_val)
        critic_grads_and_vars = minimize_and_clip(
            policy._critic_optimizer,
            policy.critic_loss,
            var_list=policy.model.q_variables(),
            clip_val=clip_val)
        alpha_grads_and_vars = minimize_and_clip(
            policy._alpha_optimizer,
            policy.alpha_loss,
            var_list=alpha_vars,
            clip_val=clip_val)
    else:
        actor_grads_and_vars = policy._actor_optimizer.compute_gradients(
            policy.actor_loss, var_list=policy.model.policy_variables())
        critic_grads_and_vars = policy._critic_optimizer.compute_gradients(
            policy.critic_loss, var_list=policy.model.q_variables())
        alpha_grads_and_vars = policy._alpha_optimizer.compute_gradients(
            policy.alpha_loss, var_list=alpha_vars)

    # save these for later use in build_apply_op
    policy._actor_grads_and_vars = [(g, v) for (g, v) in actor_grads_and_vars
                                    if g is not None]
    policy._critic_grads_and_vars = [(g, v) for (g, v) in critic_grads_and_vars
                                     if g is not None]
    policy._alpha_grads_and_vars = [(g, v) for (g, v) in alpha_grads_and_vars
                                    if g is not None]
    grads_and_vars = (
        policy._actor_grads_and_vars + policy._critic_grads_and_vars +
        policy._alpha_grads_and_vars)
    return grads_and_vars
|
||||
|
||||
|
||||
def stats(policy, batch_tensors):
    """Return summary tensors built from values saved by the loss function.

    `batch_tensors` is not used.
    """
    q_values = policy.q_t
    summary = {}
    summary["td_error"] = tf.reduce_mean(policy.td_error)
    summary["actor_loss"] = tf.reduce_mean(policy.actor_loss)
    summary["critic_loss"] = tf.reduce_mean(policy.critic_loss)
    summary["mean_q"] = tf.reduce_mean(q_values)
    summary["max_q"] = tf.reduce_max(q_values)
    summary["min_q"] = tf.reduce_min(q_values)
    return summary
|
||||
|
||||
|
||||
class ExplorationStateMixin(object):
    """Mixin that owns the boolean `stochastic` placeholder."""

    def __init__(self, obs_space, action_space, config):
        # Scalar switch between sampled and deterministic actions.
        self.stochastic = tf.placeholder(
            dtype=tf.bool, shape=(), name="stochastic")

    def set_epsilon(self, epsilon):
        """Accept an epsilon value and ignore it (no-op)."""
        return None
|
||||
|
||||
|
||||
class TargetNetworkMixin(object):
    """Mixin that builds and runs the target-network update op.

    Expects `self.model`, `self.target_model` and `self.get_session()` to
    be available on the policy.
    """

    def __init__(self, config):
        # update_target_fn will be called periodically to copy Q network to
        # target Q network
        self.tau_value = config.get("tau")
        self.tau = tf.placeholder(tf.float32, (), name="tau")
        update_target_expr = []
        model_vars = self.model.trainable_variables()
        target_model_vars = self.target_model.trainable_variables()
        assert len(model_vars) == len(target_model_vars), \
            (model_vars, target_model_vars)
        for var, var_target in zip(model_vars, target_model_vars):
            # target <- tau * online + (1 - tau) * target
            update_target_expr.append(
                var_target.assign(self.tau * var +
                                  (1.0 - self.tau) * var_target))
            logger.debug("Update target op {}".format(var_target))
        self.update_target_expr = tf.group(*update_target_expr)

        # Hard initial update
        self.update_target(tau=1.0)

    # support both hard and soft sync
    def update_target(self, tau=None):
        """Run the target update op.

        Arguments:
            tau: interpolation factor fed to the update op. Defaults to the
                configured "tau" when None. An explicit 0.0 is honoured
                rather than being replaced by the default.
        """
        if tau is None:
            tau = self.tau_value
        return self.get_session().run(
            self.update_target_expr, feed_dict={self.tau: tau})
|
||||
|
||||
|
||||
class ActorCriticOptimizerMixin(object):
    """Mixin creating the global step and one Adam optimizer per loss."""

    def __init__(self, config):
        # create global step for counting the number of update operations
        self.global_step = tf.train.get_or_create_global_step()

        # Separate optimizers for the actor, the critic and the entropy
        # coefficient, each with its own learning rate.
        learning_rates = config["optimization"]
        self._actor_optimizer = tf.train.AdamOptimizer(
            learning_rate=learning_rates["actor_learning_rate"])
        self._critic_optimizer = tf.train.AdamOptimizer(
            learning_rate=learning_rates["critic_learning_rate"])
        self._alpha_optimizer = tf.train.AdamOptimizer(
            learning_rate=learning_rates["entropy_learning_rate"])
|
||||
|
||||
|
||||
class ComputeTDErrorMixin(object):
    """Mixin exposing the TD error tensor for externally supplied batches."""

    def compute_td_error(self, obs_t, act_t, rew_t, obs_tp1, done_mask,
                         importance_weights):
        """Evaluate `self.td_error` on the given transition batch.

        Returns zeros shaped like `rew_t` while the loss has not been
        initialized yet.
        """
        if not self.loss_initialized():
            return np.zeros_like(rew_t)

        session = self.get_session()

        feed = {}
        feed[self.get_placeholder(SampleBatch.CUR_OBS)] = list(
            map(np.array, obs_t))
        feed[self.get_placeholder(SampleBatch.ACTIONS)] = act_t
        feed[self.get_placeholder(SampleBatch.REWARDS)] = rew_t
        feed[self.get_placeholder(SampleBatch.NEXT_OBS)] = list(
            map(np.array, obs_tp1))
        feed[self.get_placeholder(SampleBatch.DONES)] = done_mask
        feed[self.get_placeholder(PRIO_WEIGHTS)] = importance_weights

        td_err = session.run(self.td_error, feed_dict=feed)
        return td_err
|
||||
|
||||
|
||||
def setup_early_mixins(policy, obs_space, action_space, config):
    """Initialize the mixins registered as the policy's `before_init` hook.

    Creates the `stochastic` placeholder and the three optimizers on
    `policy`.
    """
    ExplorationStateMixin.__init__(policy, obs_space, action_space, config)
    ActorCriticOptimizerMixin.__init__(policy, config)
|
||||
|
||||
|
||||
def setup_late_mixins(policy, obs_space, action_space, config):
    """Initialize the target-network mixin on ``policy``.

    Used as the ``after_init`` hook of ``SACTFPolicy``. ``obs_space`` and
    ``action_space`` are accepted to match the hook signature but are not
    used here.
    """
    TargetNetworkMixin.__init__(policy, config)
|
||||
|
||||
|
||||
# Assemble the SAC TensorFlow policy class from the functions and mixins
# defined in this module.
SACTFPolicy = build_tf_policy(
    name="SACTFPolicy",
    # Looked up lazily through the fully qualified module path.
    get_default_config=lambda: ray.rllib.agents.sac.sac.DEFAULT_CONFIG,
    # Model construction and acting.
    make_model=build_sac_model,
    postprocess_fn=postprocess_trajectory,
    extra_action_feed_fn=exploration_setting_inputs,
    action_sampler_fn=build_action_output,
    # Learning.
    loss_fn=actor_critic_loss,
    stats_fn=stats,
    gradients_fn=gradients,
    extra_learn_fetches_fn=lambda policy: {"td_error": policy.td_error},
    # Mixins and the hooks that initialize them.
    mixins=[
        TargetNetworkMixin,
        ExplorationStateMixin,
        ActorCriticOptimizerMixin,
        ComputeTDErrorMixin,
    ],
    before_init=setup_early_mixins,
    after_init=setup_late_mixins,
    obs_include_prev_action_reward=False)
|
||||
@@ -1,797 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from datetime import datetime
|
||||
import copy
|
||||
import logging
|
||||
import os
|
||||
import pickle
|
||||
import six
|
||||
import time
|
||||
import tempfile
|
||||
|
||||
import ray
|
||||
from ray.exceptions import RayError
|
||||
from ray.rllib.models import MODEL_DEFAULTS
|
||||
from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID
|
||||
from ray.rllib.evaluation.metrics import collect_metrics
|
||||
from ray.rllib.optimizers.policy_optimizer import PolicyOptimizer
|
||||
from ray.rllib.evaluation.worker_set import WorkerSet
|
||||
from ray.rllib.utils.annotations import override, PublicAPI, DeveloperAPI
|
||||
from ray.rllib.utils import FilterManager, deep_update, merge_dicts
|
||||
from ray.rllib.utils.memory import ray_get_and_free
|
||||
from ray.rllib.utils import try_import_tf
|
||||
from ray.tune.registry import ENV_CREATOR, register_env, _global_registry
|
||||
from ray.tune.trainable import Trainable
|
||||
from ray.tune.trial import ExportFormat
|
||||
from ray.tune.resources import Resources
|
||||
from ray.tune.logger import UnifiedLogger
|
||||
from ray.tune.result import DEFAULT_RESULTS_DIR
|
||||
|
||||
# TensorFlow module handle. Code in this file checks it for truthiness
# before use (see `get_scope` in Trainer._setup), so it may be falsy.
tf = try_import_tf()

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Max number of times to retry a worker failure. We shouldn't try too many
# times in a row since that would indicate a persistent cluster issue.
MAX_WORKER_FAILURE_RETRIES = 3
|
||||
|
||||
# Default configuration that with_common_config() uses as the base for
# merging trainer-specific settings.
# yapf: disable
# __sphinx_doc_begin__
COMMON_CONFIG = {
    # === Debugging ===
    # Whether to write episode stats and videos to the agent log dir
    "monitor": False,
    # Set the ray.rllib.* log level for the agent process and its workers.
    # Should be one of DEBUG, INFO, WARN, or ERROR. The DEBUG level will also
    # periodically print out summaries of relevant internal dataflow (this is
    # also printed out once at startup at the INFO level).
    "log_level": "INFO",
    # Callbacks that will be run during various phases of training. These all
    # take a single "info" dict as an argument. For episode callbacks, custom
    # metrics can be attached to the episode by updating the episode object's
    # custom metrics dict (see examples/custom_metrics_and_callbacks.py). You
    # may also mutate the passed in batch data in your callback.
    "callbacks": {
        "on_episode_start": None,     # arg: {"env": .., "episode": ...}
        "on_episode_step": None,      # arg: {"env": .., "episode": ...}
        "on_episode_end": None,       # arg: {"env": .., "episode": ...}
        "on_sample_end": None,        # arg: {"samples": .., "worker": ...}
        "on_train_result": None,      # arg: {"trainer": ..., "result": ...}
        "on_postprocess_traj": None,  # arg: {
                                      #   "agent_id": ..., "episode": ...,
                                      #   "pre_batch": (before processing),
                                      #   "post_batch": (after processing),
                                      #   "all_pre_batches": (other agent ids),
                                      # }
    },
    # Whether to attempt to continue training if a worker crashes.
    "ignore_worker_failures": False,
    # Log system resource metrics to results.
    "log_sys_usage": True,

    # === Policy ===
    # Arguments to pass to model. See models/catalog.py for a full list of the
    # available model options.
    "model": MODEL_DEFAULTS,
    # Arguments to pass to the policy optimizer. These vary by optimizer.
    "optimizer": {},

    # === Environment ===
    # Discount factor of the MDP
    "gamma": 0.99,
    # Number of steps after which the episode is forced to terminate. Defaults
    # to `env.spec.max_episode_steps` (if present) for Gym envs.
    "horizon": None,
    # Calculate rewards but don't reset the environment when the horizon is
    # hit. This allows value estimation and RNN state to span across logical
    # episodes denoted by horizon. This only has an effect if horizon != inf.
    "soft_horizon": False,
    # Don't set 'done' at the end of the episode. Note that you still need to
    # set this if soft_horizon=True, unless your env is actually running
    # forever without returning done=True.
    "no_done_at_end": False,
    # Arguments to pass to the env creator
    "env_config": {},
    # Environment name can also be passed via config
    "env": None,
    # Whether to clip rewards prior to experience postprocessing. Setting to
    # None means clip for Atari only.
    "clip_rewards": None,
    # Whether to np.clip() actions to the action space low/high range spec.
    "clip_actions": True,
    # Whether to use rllib or deepmind preprocessors by default
    "preprocessor_pref": "deepmind",
    # The default learning rate
    "lr": 0.0001,

    # === Evaluation ===
    # Evaluate with every `evaluation_interval` training iterations.
    # The evaluation stats will be reported under the "evaluation" metric key.
    # Note that evaluation is currently not parallelized, and that for Ape-X
    # metrics are already only reported for the lowest epsilon workers.
    "evaluation_interval": None,
    # Number of episodes to run per evaluation period.
    "evaluation_num_episodes": 10,
    # Extra arguments to pass to evaluation workers.
    # Typical usage is to pass extra args to evaluation env creator
    # and to disable exploration by computing deterministic actions
    # TODO(kismuz): implement determ. actions and include relevant keys hints
    "evaluation_config": {},

    # === Resources ===
    # Number of actors used for parallelism
    "num_workers": 2,
    # Number of GPUs to allocate to the driver. Note that not all algorithms
    # can take advantage of driver GPUs. This can be fraction (e.g., 0.3 GPUs).
    "num_gpus": 0,
    # Number of CPUs to allocate per worker.
    "num_cpus_per_worker": 1,
    # Number of GPUs to allocate per worker. This can be fractional.
    "num_gpus_per_worker": 0,
    # Any custom resources to allocate per worker.
    "custom_resources_per_worker": {},
    # Number of CPUs to allocate for the driver. Note: this only takes effect
    # when running in Tune.
    "num_cpus_for_driver": 1,

    # === Execution ===
    # Number of environments to evaluate vectorwise per worker.
    "num_envs_per_worker": 1,
    # Default sample batch size (unroll length). Batches of this size are
    # collected from workers until train_batch_size is met. When using
    # multiple envs per worker, this is multiplied by num_envs_per_worker.
    "sample_batch_size": 200,
    # Training batch size, if applicable. Should be >= sample_batch_size.
    # Sample batches will be concatenated together to this size for training.
    "train_batch_size": 200,
    # Whether to rollout "complete_episodes" or "truncate_episodes"
    "batch_mode": "truncate_episodes",
    # Use a background thread for sampling (slightly off-policy, usually not
    # advisable to turn on unless your env specifically requires it)
    "sample_async": False,
    # Element-wise observation filter, either "NoFilter" or "MeanStdFilter"
    "observation_filter": "NoFilter",
    # Whether to synchronize the statistics of remote filters.
    "synchronize_filters": True,
    # Configure TF for single-process operation by default
    "tf_session_args": {
        # note: overridden by `local_tf_session_args`
        "intra_op_parallelism_threads": 2,
        "inter_op_parallelism_threads": 2,
        "gpu_options": {
            "allow_growth": True,
        },
        "log_device_placement": False,
        "device_count": {
            "CPU": 1
        },
        "allow_soft_placement": True,  # required by PPO multi-gpu
    },
    # Override the following tf session args on the local worker
    "local_tf_session_args": {
        # Allow a higher level of parallelism by default, but not unlimited
        # since that can cause crashes with many concurrent drivers.
        "intra_op_parallelism_threads": 8,
        "inter_op_parallelism_threads": 8,
    },
    # Whether to LZ4 compress individual observations
    "compress_observations": False,
    # Wait for metric batches for at most this many seconds. Those that
    # have not returned in time will be collected in the next iteration.
    "collect_metrics_timeout": 180,
    # Smooth metrics over this many episodes.
    "metrics_smoothing_episodes": 100,
    # If using num_envs_per_worker > 1, whether to create those new envs in
    # remote processes instead of in the same worker. This adds overheads, but
    # can make sense if your envs can take much time to step / reset
    # (e.g., for StarCraft). Use this cautiously; overheads are significant.
    "remote_worker_envs": False,
    # Timeout that remote workers are waiting when polling environments.
    # 0 (continue when at least one env is ready) is a reasonable default,
    # but optimal value could be obtained by measuring your environment
    # step / reset and model inference perf.
    "remote_env_batch_wait_ms": 0,
    # Minimum time per iteration
    "min_iter_time_s": 0,
    # Minimum env steps to optimize for per train call. This value does
    # not affect learning, only the length of iterations.
    "timesteps_per_iteration": 0,
    # This argument, in conjunction with worker_index, sets the random seed of
    # each worker, so that identically configured trials will have identical
    # results. This makes experiments reproducible.
    "seed": None,

    # === Offline Datasets ===
    # Specify how to generate experiences:
    #  - "sampler": generate experiences via online simulation (default)
    #  - a local directory or file glob expression (e.g., "/tmp/*.json")
    #  - a list of individual file paths/URIs (e.g., ["/tmp/1.json",
    #    "s3://bucket/2.json"])
    #  - a dict with string keys and sampling probabilities as values (e.g.,
    #    {"sampler": 0.4, "/tmp/*.json": 0.4, "s3://bucket/expert.json": 0.2}).
    #  - a function that returns a rllib.offline.InputReader
    "input": "sampler",
    # Specify how to evaluate the current policy. This only has an effect when
    # reading offline experiences. Available options:
    #  - "wis": the weighted step-wise importance sampling estimator.
    #  - "is": the step-wise importance sampling estimator.
    #  - "simulation": run the environment in the background, but use
    #    this data for evaluation only and not for learning.
    "input_evaluation": ["is", "wis"],
    # Whether to run postprocess_trajectory() on the trajectory fragments from
    # offline inputs. Note that postprocessing will be done using the *current*
    # policy, not the *behaviour* policy, which is typically undesirable for
    # on-policy algorithms.
    "postprocess_inputs": False,
    # If positive, input batches will be shuffled via a sliding window buffer
    # of this number of batches. Use this if the input data is not in random
    # enough order. Input is delayed until the shuffle buffer is filled.
    "shuffle_buffer_size": 0,
    # Specify where experiences should be saved:
    #  - None: don't save any experiences
    #  - "logdir" to save to the agent log dir
    #  - a path/URI to save to a custom output directory (e.g., "s3://bucket/")
    #  - a function that returns a rllib.offline.OutputWriter
    "output": None,
    # What sample batch columns to LZ4 compress in the output data.
    "output_compress_columns": ["obs", "new_obs"],
    # Max output file size before rolling over to a new file.
    "output_max_file_size": 64 * 1024 * 1024,

    # === Multiagent ===
    "multiagent": {
        # Map from policy ids to tuples of (policy_cls, obs_space,
        # act_space, config). See rollout_worker.py for more info.
        "policies": {},
        # Function mapping agent ids to policy ids.
        "policy_mapping_fn": None,
        # Optional whitelist of policies to train, or None for all policies.
        "policies_to_train": None,
    },
}
# __sphinx_doc_end__
# yapf: enable
|
||||
|
||||
|
||||
@DeveloperAPI
def with_common_config(extra_config):
    """Return a copy of ``COMMON_CONFIG`` overlaid with ``extra_config``.

    Args:
        extra_config (dict): Settings that override or extend the common
            trainer defaults.

    Returns:
        dict: The merged config produced by ``with_base_config``.
    """
    merged = with_base_config(COMMON_CONFIG, extra_config)
    return merged
|
||||
|
||||
|
||||
def with_base_config(base_config, extra_config):
    """Return a deep copy of ``base_config`` updated with ``extra_config``.

    The update is a plain top-level ``dict.update``: keys present in
    ``extra_config`` replace the corresponding base entries wholesale.
    ``base_config`` itself is left untouched.
    """
    merged = copy.deepcopy(base_config)
    merged.update(extra_config)
    return merged
|
||||
|
||||
|
||||
@PublicAPI
|
||||
class Trainer(Trainable):
|
||||
"""A trainer coordinates the optimization of one or more RL policies.
|
||||
|
||||
All RLlib trainers extend this base class, e.g., the A3CTrainer implements
|
||||
the A3C algorithm for single and multi-agent training.
|
||||
|
||||
Trainer objects retain internal model state between calls to train(), so
|
||||
you should create a new trainer instance for each training session.
|
||||
|
||||
Attributes:
|
||||
env_creator (func): Function that creates a new training env.
|
||||
config (obj): Algorithm-specific configuration data.
|
||||
logdir (str): Directory in which training outputs should be placed.
|
||||
"""
|
||||
|
||||
_allow_unknown_configs = False
|
||||
_allow_unknown_subkeys = [
|
||||
"tf_session_args", "env_config", "model", "optimizer", "multiagent",
|
||||
"custom_resources_per_worker", "evaluation_config"
|
||||
]
|
||||
|
||||
@PublicAPI
|
||||
def __init__(self, config=None, env=None, logger_creator=None):
|
||||
"""Initialize an RLLib trainer.
|
||||
|
||||
Args:
|
||||
config (dict): Algorithm-specific configuration data.
|
||||
env (str): Name of the environment to use. Note that this can also
|
||||
be specified as the `env` key in config.
|
||||
logger_creator (func): Function that creates a ray.tune.Logger
|
||||
object. If unspecified, a default logger is created.
|
||||
"""
|
||||
|
||||
config = config or {}
|
||||
|
||||
# Vars to synchronize to workers on each train call
|
||||
self.global_vars = {"timestep": 0}
|
||||
|
||||
# Trainers allow env ids to be passed directly to the constructor.
|
||||
self._env_id = self._register_if_needed(env or config.get("env"))
|
||||
|
||||
# Create a default logger creator if no logger_creator is specified
|
||||
if logger_creator is None:
|
||||
timestr = datetime.today().strftime("%Y-%m-%d_%H-%M-%S")
|
||||
logdir_prefix = "{}_{}_{}".format(self._name, self._env_id,
|
||||
timestr)
|
||||
|
||||
def default_logger_creator(config):
|
||||
"""Creates a Unified logger with a default logdir prefix
|
||||
containing the agent name and the env id
|
||||
"""
|
||||
if not os.path.exists(DEFAULT_RESULTS_DIR):
|
||||
os.makedirs(DEFAULT_RESULTS_DIR)
|
||||
logdir = tempfile.mkdtemp(
|
||||
prefix=logdir_prefix, dir=DEFAULT_RESULTS_DIR)
|
||||
return UnifiedLogger(config, logdir, None)
|
||||
|
||||
logger_creator = default_logger_creator
|
||||
|
||||
Trainable.__init__(self, config, logger_creator)
|
||||
|
||||
@classmethod
|
||||
@override(Trainable)
|
||||
def default_resource_request(cls, config):
|
||||
cf = dict(cls._default_config, **config)
|
||||
Trainer._validate_config(cf)
|
||||
# TODO(ekl): add custom resources here once tune supports them
|
||||
return Resources(
|
||||
cpu=cf["num_cpus_for_driver"],
|
||||
gpu=cf["num_gpus"],
|
||||
extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"],
|
||||
extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])
|
||||
|
||||
@override(Trainable)
|
||||
@PublicAPI
|
||||
def train(self):
|
||||
"""Overrides super.train to synchronize global vars."""
|
||||
|
||||
if self._has_policy_optimizer():
|
||||
self.global_vars["timestep"] = self.optimizer.num_steps_sampled
|
||||
self.optimizer.workers.local_worker().set_global_vars(
|
||||
self.global_vars)
|
||||
for w in self.optimizer.workers.remote_workers():
|
||||
w.set_global_vars.remote(self.global_vars)
|
||||
logger.debug("updated global vars: {}".format(self.global_vars))
|
||||
|
||||
result = None
|
||||
for _ in range(1 + MAX_WORKER_FAILURE_RETRIES):
|
||||
try:
|
||||
result = Trainable.train(self)
|
||||
except RayError as e:
|
||||
if self.config["ignore_worker_failures"]:
|
||||
logger.exception(
|
||||
"Error in train call, attempting to recover")
|
||||
self._try_recover()
|
||||
else:
|
||||
logger.info(
|
||||
"Worker crashed during call to train(). To attempt to "
|
||||
"continue training without the failed worker, set "
|
||||
"`'ignore_worker_failures': True`.")
|
||||
raise e
|
||||
except Exception as e:
|
||||
time.sleep(0.5) # allow logs messages to propagate
|
||||
raise e
|
||||
else:
|
||||
break
|
||||
if result is None:
|
||||
raise RuntimeError("Failed to recover from worker crash")
|
||||
|
||||
if (self.config.get("observation_filter", "NoFilter") != "NoFilter"
|
||||
and hasattr(self, "workers")
|
||||
and isinstance(self.workers, WorkerSet)):
|
||||
FilterManager.synchronize(
|
||||
self.workers.local_worker().filters,
|
||||
self.workers.remote_workers(),
|
||||
update_remote=self.config["synchronize_filters"])
|
||||
logger.debug("synchronized filters: {}".format(
|
||||
self.workers.local_worker().filters))
|
||||
|
||||
if self._has_policy_optimizer():
|
||||
result["num_healthy_workers"] = len(
|
||||
self.optimizer.workers.remote_workers())
|
||||
|
||||
if self.config["evaluation_interval"]:
|
||||
if self._iteration % self.config["evaluation_interval"] == 0:
|
||||
evaluation_metrics = self._evaluate()
|
||||
assert isinstance(evaluation_metrics, dict), \
|
||||
"_evaluate() needs to return a dict."
|
||||
result.update(evaluation_metrics)
|
||||
|
||||
return result
|
||||
|
||||
@override(Trainable)
|
||||
def _log_result(self, result):
|
||||
if self.config["callbacks"].get("on_train_result"):
|
||||
self.config["callbacks"]["on_train_result"]({
|
||||
"trainer": self,
|
||||
"result": result,
|
||||
})
|
||||
# log after the callback is invoked, so that the user has a chance
|
||||
# to mutate the result
|
||||
Trainable._log_result(self, result)
|
||||
|
||||
@override(Trainable)
|
||||
def _setup(self, config):
|
||||
env = self._env_id
|
||||
if env:
|
||||
config["env"] = env
|
||||
if _global_registry.contains(ENV_CREATOR, env):
|
||||
self.env_creator = _global_registry.get(ENV_CREATOR, env)
|
||||
else:
|
||||
import gym # soft dependency
|
||||
self.env_creator = lambda env_config: gym.make(env)
|
||||
else:
|
||||
self.env_creator = lambda env_config: None
|
||||
|
||||
# Merge the supplied config with the class default
|
||||
merged_config = copy.deepcopy(self._default_config)
|
||||
merged_config = deep_update(merged_config, config,
|
||||
self._allow_unknown_configs,
|
||||
self._allow_unknown_subkeys)
|
||||
self.raw_user_config = config
|
||||
self.config = merged_config
|
||||
Trainer._validate_config(self.config)
|
||||
if self.config.get("log_level"):
|
||||
logging.getLogger("ray.rllib").setLevel(self.config["log_level"])
|
||||
|
||||
def get_scope():
|
||||
if tf:
|
||||
return tf.Graph().as_default()
|
||||
else:
|
||||
return open("/dev/null") # fake a no-op scope
|
||||
|
||||
with get_scope():
|
||||
self._init(self.config, self.env_creator)
|
||||
|
||||
# Evaluation related
|
||||
if self.config.get("evaluation_interval"):
|
||||
# Update env_config with evaluation settings:
|
||||
extra_config = copy.deepcopy(self.config["evaluation_config"])
|
||||
extra_config.update({
|
||||
"batch_mode": "complete_episodes",
|
||||
"batch_steps": 1,
|
||||
})
|
||||
logger.debug(
|
||||
"using evaluation_config: {}".format(extra_config))
|
||||
self.evaluation_workers = self._make_workers(
|
||||
self.env_creator,
|
||||
self._policy,
|
||||
merge_dicts(self.config, extra_config),
|
||||
num_workers=0)
|
||||
self.evaluation_metrics = self._evaluate()
|
||||
|
||||
@override(Trainable)
|
||||
def _stop(self):
|
||||
if hasattr(self, "workers"):
|
||||
self.workers.stop()
|
||||
if hasattr(self, "optimizer"):
|
||||
self.optimizer.stop()
|
||||
|
||||
@override(Trainable)
|
||||
def _save(self, checkpoint_dir):
|
||||
checkpoint_path = os.path.join(checkpoint_dir,
|
||||
"checkpoint-{}".format(self.iteration))
|
||||
pickle.dump(self.__getstate__(), open(checkpoint_path, "wb"))
|
||||
return checkpoint_path
|
||||
|
||||
@override(Trainable)
|
||||
def _restore(self, checkpoint_path):
|
||||
extra_data = pickle.load(open(checkpoint_path, "rb"))
|
||||
self.__setstate__(extra_data)
|
||||
|
||||
@DeveloperAPI
|
||||
def _make_workers(self, env_creator, policy, config, num_workers):
|
||||
return WorkerSet(
|
||||
env_creator,
|
||||
policy,
|
||||
config,
|
||||
num_workers=num_workers,
|
||||
logdir=self.logdir)
|
||||
|
||||
@DeveloperAPI
|
||||
def _init(self, config, env_creator):
|
||||
"""Subclasses should override this for custom initialization."""
|
||||
|
||||
raise NotImplementedError
|
||||
|
||||
@DeveloperAPI
|
||||
def _evaluate(self):
|
||||
"""Evaluates current policy under `evaluation_config` settings.
|
||||
|
||||
Note that this default implementation does not do anything beyond
|
||||
merging evaluation_config with the normal trainer config.
|
||||
"""
|
||||
|
||||
if not self.config["evaluation_config"]:
|
||||
raise ValueError(
|
||||
"No evaluation_config specified. It doesn't make sense "
|
||||
"to enable evaluation without specifying any config "
|
||||
"overrides, since the results will be the "
|
||||
"same as reported during normal policy evaluation.")
|
||||
|
||||
logger.info("Evaluating current policy for {} episodes".format(
|
||||
self.config["evaluation_num_episodes"]))
|
||||
self._before_evaluate()
|
||||
self.evaluation_workers.local_worker().restore(
|
||||
self.workers.local_worker().save())
|
||||
for _ in range(self.config["evaluation_num_episodes"]):
|
||||
self.evaluation_workers.local_worker().sample()
|
||||
|
||||
metrics = collect_metrics(self.evaluation_workers.local_worker())
|
||||
return {"evaluation": metrics}
|
||||
|
||||
@DeveloperAPI
|
||||
def _before_evaluate(self):
|
||||
"""Pre-evaluation callback."""
|
||||
pass
|
||||
|
||||
@PublicAPI
|
||||
def compute_action(self,
|
||||
observation,
|
||||
state=None,
|
||||
prev_action=None,
|
||||
prev_reward=None,
|
||||
info=None,
|
||||
policy_id=DEFAULT_POLICY_ID,
|
||||
full_fetch=False):
|
||||
"""Computes an action for the specified policy.
|
||||
|
||||
Note that you can also access the policy object through
|
||||
self.get_policy(policy_id) and call compute_actions() on it directly.
|
||||
|
||||
Arguments:
|
||||
observation (obj): observation from the environment.
|
||||
state (list): RNN hidden state, if any. If state is not None,
|
||||
then all of compute_single_action(...) is returned
|
||||
(computed action, rnn state, logits dictionary).
|
||||
Otherwise compute_single_action(...)[0] is
|
||||
returned (computed action).
|
||||
prev_action (obj): previous action value, if any
|
||||
prev_reward (int): previous reward, if any
|
||||
info (dict): info object, if any
|
||||
policy_id (str): policy to query (only applies to multi-agent).
|
||||
full_fetch (bool): whether to return extra action fetch results.
|
||||
This is always set to true if RNN state is specified.
|
||||
|
||||
Returns:
|
||||
Just the computed action if full_fetch=False, or the full output
|
||||
of policy.compute_actions() otherwise.
|
||||
"""
|
||||
|
||||
if state is None:
|
||||
state = []
|
||||
preprocessed = self.workers.local_worker().preprocessors[
|
||||
policy_id].transform(observation)
|
||||
filtered_obs = self.workers.local_worker().filters[policy_id](
|
||||
preprocessed, update=False)
|
||||
if state:
|
||||
return self.get_policy(policy_id).compute_single_action(
|
||||
filtered_obs,
|
||||
state,
|
||||
prev_action,
|
||||
prev_reward,
|
||||
info,
|
||||
clip_actions=self.config["clip_actions"])
|
||||
res = self.get_policy(policy_id).compute_single_action(
|
||||
filtered_obs,
|
||||
state,
|
||||
prev_action,
|
||||
prev_reward,
|
||||
info,
|
||||
clip_actions=self.config["clip_actions"])
|
||||
if full_fetch:
|
||||
return res
|
||||
else:
|
||||
return res[0] # backwards compatibility
|
||||
|
||||
@property
|
||||
def _name(self):
|
||||
"""Subclasses should override this to declare their name."""
|
||||
|
||||
raise NotImplementedError
|
||||
|
||||
@property
|
||||
def _default_config(self):
|
||||
"""Subclasses should override this to declare their default config."""
|
||||
|
||||
raise NotImplementedError
|
||||
|
||||
@PublicAPI
|
||||
def get_policy(self, policy_id=DEFAULT_POLICY_ID):
|
||||
"""Return policy for the specified id, or None.
|
||||
|
||||
Arguments:
|
||||
policy_id (str): id of policy to return.
|
||||
"""
|
||||
|
||||
return self.workers.local_worker().get_policy(policy_id)
|
||||
|
||||
@PublicAPI
|
||||
def get_weights(self, policies=None):
|
||||
"""Return a dictionary of policy ids to weights.
|
||||
|
||||
Arguments:
|
||||
policies (list): Optional list of policies to return weights for,
|
||||
or None for all policies.
|
||||
"""
|
||||
return self.workers.local_worker().get_weights(policies)
|
||||
|
||||
@PublicAPI
|
||||
def set_weights(self, weights):
|
||||
"""Set policy weights by policy id.
|
||||
|
||||
Arguments:
|
||||
weights (dict): Map of policy ids to weights to set.
|
||||
"""
|
||||
self.workers.local_worker().set_weights(weights)
|
||||
|
||||
@DeveloperAPI
|
||||
def export_policy_model(self, export_dir, policy_id=DEFAULT_POLICY_ID):
|
||||
"""Export policy model with given policy_id to local directory.
|
||||
|
||||
Arguments:
|
||||
export_dir (string): Writable local directory.
|
||||
policy_id (string): Optional policy id to export.
|
||||
|
||||
Example:
|
||||
>>> trainer = MyTrainer()
|
||||
>>> for _ in range(10):
|
||||
>>> trainer.train()
|
||||
>>> trainer.export_policy_model("/tmp/export_dir")
|
||||
"""
|
||||
self.workers.local_worker().export_policy_model(export_dir, policy_id)
|
||||
|
||||
@DeveloperAPI
|
||||
def export_policy_checkpoint(self,
|
||||
export_dir,
|
||||
filename_prefix="model",
|
||||
policy_id=DEFAULT_POLICY_ID):
|
||||
"""Export tensorflow policy model checkpoint to local directory.
|
||||
|
||||
Arguments:
|
||||
export_dir (string): Writable local directory.
|
||||
filename_prefix (string): file name prefix of checkpoint files.
|
||||
policy_id (string): Optional policy id to export.
|
||||
|
||||
Example:
|
||||
>>> trainer = MyTrainer()
|
||||
>>> for _ in range(10):
|
||||
>>> trainer.train()
|
||||
>>> trainer.export_policy_checkpoint("/tmp/export_dir")
|
||||
"""
|
||||
self.workers.local_worker().export_policy_checkpoint(
|
||||
export_dir, filename_prefix, policy_id)
|
||||
|
||||
@DeveloperAPI
|
||||
def collect_metrics(self, selected_workers=None):
|
||||
"""Collects metrics from the remote workers of this agent.
|
||||
|
||||
This is the same data as returned by a call to train().
|
||||
"""
|
||||
return self.optimizer.collect_metrics(
|
||||
self.config["collect_metrics_timeout"],
|
||||
min_history=self.config["metrics_smoothing_episodes"],
|
||||
selected_workers=selected_workers)
|
||||
|
||||
@classmethod
|
||||
def resource_help(cls, config):
|
||||
return ("\n\nYou can adjust the resource requests of RLlib agents by "
|
||||
"setting `num_workers`, `num_gpus`, and other configs. See "
|
||||
"the DEFAULT_CONFIG defined by each agent for more info.\n\n"
|
||||
"The config of this agent is: {}".format(config))
|
||||
|
||||
@staticmethod
|
||||
def _validate_config(config):
|
||||
if "policy_graphs" in config["multiagent"]:
|
||||
logger.warning(
|
||||
"The `policy_graphs` config has been renamed to `policies`.")
|
||||
# Backwards compatibility
|
||||
config["multiagent"]["policies"] = config["multiagent"][
|
||||
"policy_graphs"]
|
||||
del config["multiagent"]["policy_graphs"]
|
||||
if "gpu" in config:
|
||||
raise ValueError(
|
||||
"The `gpu` config is deprecated, please use `num_gpus=0|1` "
|
||||
"instead.")
|
||||
if "gpu_fraction" in config:
|
||||
raise ValueError(
|
||||
"The `gpu_fraction` config is deprecated, please use "
|
||||
"`num_gpus=<fraction>` instead.")
|
||||
if "use_gpu_for_workers" in config:
|
||||
raise ValueError(
|
||||
"The `use_gpu_for_workers` config is deprecated, please use "
|
||||
"`num_gpus_per_worker=1` instead.")
|
||||
if type(config["input_evaluation"]) != list:
|
||||
raise ValueError(
|
||||
"`input_evaluation` must be a list of strings, got {}".format(
|
||||
config["input_evaluation"]))
|
||||
|
||||
def _try_recover(self):
|
||||
"""Try to identify and blacklist any unhealthy workers.
|
||||
|
||||
This method is called after an unexpected remote error is encountered
|
||||
from a worker. It issues check requests to all current workers and
|
||||
blacklists any that respond with error. If no healthy workers remain,
|
||||
an error is raised.
|
||||
"""
|
||||
|
||||
if not self._has_policy_optimizer():
|
||||
raise NotImplementedError(
|
||||
"Recovery is not supported for this algorithm")
|
||||
|
||||
logger.info("Health checking all workers...")
|
||||
checks = []
|
||||
for ev in self.optimizer.workers.remote_workers():
|
||||
_, obj_id = ev.sample_with_count.remote()
|
||||
checks.append(obj_id)
|
||||
|
||||
healthy_workers = []
|
||||
for i, obj_id in enumerate(checks):
|
||||
w = self.optimizer.workers.remote_workers()[i]
|
||||
try:
|
||||
ray_get_and_free(obj_id)
|
||||
healthy_workers.append(w)
|
||||
logger.info("Worker {} looks healthy".format(i + 1))
|
||||
except RayError:
|
||||
logger.exception("Blacklisting worker {}".format(i + 1))
|
||||
try:
|
||||
w.__ray_terminate__.remote()
|
||||
except Exception:
|
||||
logger.exception("Error terminating unhealthy worker")
|
||||
|
||||
if len(healthy_workers) < 1:
|
||||
raise RuntimeError(
|
||||
"Not enough healthy workers remain to continue.")
|
||||
|
||||
self.optimizer.reset(healthy_workers)
|
||||
|
||||
def _has_policy_optimizer(self):
|
||||
return hasattr(self, "optimizer") and isinstance(
|
||||
self.optimizer, PolicyOptimizer)
|
||||
|
||||
@override(Trainable)
|
||||
def _export_model(self, export_formats, export_dir):
|
||||
ExportFormat.validate(export_formats)
|
||||
exported = {}
|
||||
if ExportFormat.CHECKPOINT in export_formats:
|
||||
path = os.path.join(export_dir, ExportFormat.CHECKPOINT)
|
||||
self.export_policy_checkpoint(path)
|
||||
exported[ExportFormat.CHECKPOINT] = path
|
||||
if ExportFormat.MODEL in export_formats:
|
||||
path = os.path.join(export_dir, ExportFormat.MODEL)
|
||||
self.export_policy_model(path)
|
||||
exported[ExportFormat.MODEL] = path
|
||||
return exported
|
||||
|
||||
def __getstate__(self):
|
||||
state = {}
|
||||
if hasattr(self, "workers"):
|
||||
state["worker"] = self.workers.local_worker().save()
|
||||
if hasattr(self, "optimizer") and hasattr(self.optimizer, "save"):
|
||||
state["optimizer"] = self.optimizer.save()
|
||||
return state
|
||||
|
||||
def __setstate__(self, state):
|
||||
if "worker" in state:
|
||||
self.workers.local_worker().restore(state["worker"])
|
||||
remote_state = ray.put(state["worker"])
|
||||
for r in self.workers.remote_workers():
|
||||
r.restore.remote(remote_state)
|
||||
if "optimizer" in state:
|
||||
self.optimizer.restore(state["optimizer"])
|
||||
|
||||
def _register_if_needed(self, env_object):
    """Turn an env specification into a registered env id.

    Arguments:
        env_object (str|type): a registered env id, returned unchanged,
            or an env class, which is registered under its class name.

    Returns:
        str: the env id to use.

    Raises:
        ValueError: if `env_object` is neither a string nor a class.
    """
    if isinstance(env_object, six.string_types):
        return env_object
    if not isinstance(env_object, type):
        raise ValueError(
            "{} is an invalid env specification. ".format(env_object) +
            "You can specify a custom env as either a class "
            "(e.g., YourEnvCls) or a registered env id (e.g., \"your_env\").")
    env_name = env_object.__name__
    register_env(env_name, lambda config: env_object(config))
    return env_name
|
||||
@@ -1,174 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import time
|
||||
|
||||
from ray.rllib.agents.trainer import Trainer, COMMON_CONFIG
|
||||
from ray.rllib.optimizers import SyncSamplesOptimizer
|
||||
from ray.rllib.utils import add_mixins
|
||||
from ray.rllib.utils.annotations import override, DeveloperAPI
|
||||
|
||||
|
||||
@DeveloperAPI
def build_trainer(name,
                  default_policy,
                  default_config=None,
                  validate_config=None,
                  get_initial_state=None,
                  get_policy_class=None,
                  before_init=None,
                  make_workers=None,
                  make_policy_optimizer=None,
                  after_init=None,
                  before_train_step=None,
                  after_optimizer_step=None,
                  after_train_result=None,
                  collect_metrics_fn=None,
                  before_evaluate_fn=None,
                  mixins=None):
    """Helper function for defining a custom trainer.

    Functions will be run in this order to initialize the trainer:
        1. Config setup: validate_config, get_initial_state,
            get_policy_class
        2. Worker setup: before_init, make_workers, make_policy_optimizer
        3. Post setup: after_init

    Arguments:
        name (str): name of the trainer (e.g., "PPO")
        default_policy (cls): the default Policy class to use
        default_config (dict): the default config dict of the algorithm,
            otherwise uses the Trainer default config
        validate_config (func): optional callback that checks a given config
            for correctness. It may mutate the config as needed.
        get_initial_state (func): optional function that returns the initial
            state dict given the trainer instance as an argument. The state
            dict must be serializable so that it can be checkpointed, and will
            be available as the `trainer.state` variable.
        get_policy_class (func): optional callback that takes a config and
            returns the policy class to override the default with
        before_init (func): optional function to run at the start of trainer
            init that takes the trainer instance as argument
        make_workers (func): override the method that creates rollout workers.
            This takes in (trainer, env_creator, policy, config) as args.
        make_policy_optimizer (func): optional function that returns a
            PolicyOptimizer instance given (WorkerSet, config)
        after_init (func): optional function to run at the end of trainer init
            that takes the trainer instance as argument
        before_train_step (func): optional callback to run before each train()
            call. It takes the trainer instance as an argument.
        after_optimizer_step (func): optional callback to run after each
            step() call to the policy optimizer. It takes the trainer instance
            and the policy gradient fetches as arguments.
        after_train_result (func): optional callback to run at the end of each
            train() call. It takes the trainer instance and result dict as
            arguments, and may mutate the result dict as needed.
        collect_metrics_fn (func): override the method used to collect metrics.
            It takes the trainer instance as argument.
        before_evaluate_fn (func): callback to run before evaluation. This
            takes the trainer instance as argument.
        mixins (list): list of any class mixins for the returned trainer class.
            These mixins will be applied in order and will have higher
            precedence than the Trainer class

    Returns:
        a Trainer subclass (not an instance) that uses the specified args.
        The class also exposes `with_updates(**overrides)` to derive a
        variant of itself.
    """

    # Snapshot the arguments before any other local is defined, so that
    # with_updates() can call build_trainer() again with overrides applied.
    original_kwargs = locals().copy()
    base = add_mixins(Trainer, mixins)

    class trainer_cls(base):
        _name = name
        _default_config = default_config or COMMON_CONFIG
        _policy = default_policy

        def __init__(self, config=None, env=None, logger_creator=None):
            Trainer.__init__(self, config, env, logger_creator)

        def _init(self, config, env_creator):
            # Runs the user callbacks in the order listed in the
            # build_trainer() docstring.
            if validate_config:
                validate_config(config)
            if get_initial_state:
                self.state = get_initial_state(self)
            else:
                self.state = {}
            if get_policy_class is None:
                policy = default_policy
            else:
                policy = get_policy_class(config)
            if before_init:
                before_init(self)
            if make_workers:
                self.workers = make_workers(self, env_creator, policy, config)
            else:
                self.workers = self._make_workers(env_creator, policy, config,
                                                  self.config["num_workers"])
            if make_policy_optimizer:
                self.optimizer = make_policy_optimizer(self.workers, config)
            else:
                # Default optimizer: config["optimizer"] plus the top-level
                # train_batch_size setting.
                optimizer_config = dict(
                    config["optimizer"],
                    **{"train_batch_size": config["train_batch_size"]})
                self.optimizer = SyncSamplesOptimizer(self.workers,
                                                      **optimizer_config)
            if after_init:
                after_init(self)

        @override(Trainer)
        def _train(self):
            if before_train_step:
                before_train_step(self)
            prev_steps = self.optimizer.num_steps_sampled

            start = time.time()
            # Always take at least one optimizer step, then keep stepping
            # until both the minimum wall time and the minimum number of
            # sampled timesteps for this iteration are reached.
            while True:
                fetches = self.optimizer.step()
                if after_optimizer_step:
                    after_optimizer_step(self, fetches)
                if (time.time() - start >= self.config["min_iter_time_s"]
                        and self.optimizer.num_steps_sampled - prev_steps >=
                        self.config["timesteps_per_iteration"]):
                    break

            if collect_metrics_fn:
                res = collect_metrics_fn(self)
            else:
                res = self.collect_metrics()
            # Guarantees an "info" key exists in the result.
            res.update(
                timesteps_this_iter=self.optimizer.num_steps_sampled -
                prev_steps,
                info=res.get("info", {}))

            if after_train_result:
                after_train_result(self, res)
            return res

        @override(Trainer)
        def _before_evaluate(self):
            if before_evaluate_fn:
                before_evaluate_fn(self)

        def __getstate__(self):
            # Adds a copy of the custom trainer state to the base state.
            state = Trainer.__getstate__(self)
            state["trainer_state"] = self.state.copy()
            return state

        def __setstate__(self, state):
            Trainer.__setstate__(self, state)
            self.state = state["trainer_state"].copy()

    @staticmethod
    def with_updates(**overrides):
        """Build a copy of this trainer with the specified overrides.

        Arguments:
            overrides (dict): use this to override any of the arguments
                originally passed to build_trainer() for this trainer.
        """
        return build_trainer(**dict(original_kwargs, **overrides))

    trainer_cls.with_updates = with_updates
    # Make the generated class report the user-supplied name.
    trainer_cls.__name__ = name
    trainer_cls.__qualname__ = name
    return trainer_cls
|
||||
@@ -1,141 +0,0 @@
|
||||
{
|
||||
// The version of the config file format. Do not change, unless
|
||||
// you know what you are doing.
|
||||
"version": 1,
|
||||
|
||||
// The name of the project being benchmarked
|
||||
"project": "rllib",
|
||||
|
||||
// The project's homepage
|
||||
"project_url": "http://rllib.io",
|
||||
|
||||
// The URL or local path of the source code repository for the
|
||||
// project being benchmarked
|
||||
"repo": "../../../",
|
||||
|
||||
// List of branches to benchmark. If not provided, defaults to "master"
|
||||
// (for git) or "default" (for mercurial).
|
||||
"branches": ["master"], // for git
|
||||
// "branches": ["default"], // for mercurial
|
||||
|
||||
// The DVCS being used. If not set, it will be automatically
|
||||
// determined from "repo" by looking at the protocol in the URL
|
||||
// (if remote), or by looking for special directories, such as
|
||||
// ".git" (if local).
|
||||
"dvcs": "git",
|
||||
|
||||
// The tool to use to create environments. May be "conda",
|
||||
// "virtualenv" or other value depending on the plugins in use.
|
||||
// If missing or the empty string, the tool will be automatically
|
||||
// determined by looking for tools on the PATH environment
|
||||
// variable.
|
||||
"environment_type": "conda",
|
||||
|
||||
// timeout in seconds for installing any dependencies in environment
|
||||
// defaults to 10 min
|
||||
//"install_timeout": 600,
|
||||
|
||||
// the base URL to show a commit for the project.
|
||||
"show_commit_url": "http://github.com/ray-project/ray/commit/",
|
||||
|
||||
// The Pythons you'd like to test against. If not provided, defaults
|
||||
// to the current version of Python used to run `asv`.
|
||||
"pythons": ["3.6"],
|
||||
|
||||
// The matrix of dependencies to test. Each key is the name of a
|
||||
// package (in PyPI) and the values are version numbers. An empty
|
||||
// list or empty string indicates to just test against the default
|
||||
// (latest) version. null indicates that the package is to not be
|
||||
// installed. If the package to be tested is only available from
|
||||
// PyPi, and the 'environment_type' is conda, then you can preface
|
||||
// the package name by 'pip+', and the package will be installed via
|
||||
// pip (with all the conda available packages installed first,
|
||||
// followed by the pip installed packages).
|
||||
//
|
||||
// "matrix": {
|
||||
// "numpy": ["1.6", "1.7"],
|
||||
// "six": ["", null], // test with and without six installed
|
||||
// "pip+emcee": [""], // emcee is only available for install with pip.
|
||||
// },
|
||||
|
||||
// Combinations of libraries/python versions can be excluded/included
|
||||
// from the set to test. Each entry is a dictionary containing additional
|
||||
// key-value pairs to include/exclude.
|
||||
//
|
||||
// An exclude entry excludes entries where all values match. The
|
||||
// values are regexps that should match the whole string.
|
||||
//
|
||||
// An include entry adds an environment. Only the packages listed
|
||||
// are installed. The 'python' key is required. The exclude rules
|
||||
// do not apply to includes.
|
||||
//
|
||||
// In addition to package names, the following keys are available:
|
||||
//
|
||||
// - python
|
||||
// Python version, as in the *pythons* variable above.
|
||||
// - environment_type
|
||||
// Environment type, as above.
|
||||
// - sys_platform
|
||||
// Platform, as in sys.platform. Possible values for the common
|
||||
// cases: 'linux2', 'win32', 'cygwin', 'darwin'.
|
||||
//
|
||||
// "exclude": [
|
||||
// {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows
|
||||
// {"environment_type": "conda", "six": null}, // don't run without six on conda
|
||||
// ],
|
||||
//
|
||||
// "include": [
|
||||
// // additional env for python2.7
|
||||
// {"python": "2.7", "numpy": "1.8"},
|
||||
// // additional env if run on windows+conda
|
||||
// {"platform": "win32", "environment_type": "conda", "python": "2.7", "libpython": ""},
|
||||
// ],
|
||||
|
||||
// The directory (relative to the current directory) that benchmarks are
|
||||
// stored in. If not provided, defaults to "benchmarks"
|
||||
"benchmark_dir": "tuned_examples/regression_tests",
|
||||
|
||||
// The directory (relative to the current directory) to cache the Python
|
||||
// environments in. If not provided, defaults to "env"
|
||||
// "env_dir": "env",
|
||||
|
||||
// The directory (relative to the current directory) that raw benchmark
|
||||
// results are stored in. If not provided, defaults to "results".
|
||||
"results_dir": "RLLIB_RESULTS",
|
||||
|
||||
// The directory (relative to the current directory) that the html tree
|
||||
// should be written to. If not provided, defaults to "html".
|
||||
// "html_dir": "html",
|
||||
|
||||
// The number of characters to retain in the commit hashes.
|
||||
// "hash_length": 8,
|
||||
|
||||
// `asv` will cache wheels of the recent builds in each
|
||||
// environment, making them faster to install next time. This is
|
||||
// number of builds to keep, per environment.
|
||||
// "wheel_cache_size": 0
|
||||
|
||||
// The commits after which the regression search in `asv publish`
|
||||
// should start looking for regressions. Dictionary whose keys are
|
||||
// regexps matching to benchmark names, and values corresponding to
|
||||
// the commit (exclusive) after which to start looking for
|
||||
// regressions. The default is to start from the first commit
|
||||
// with results. If the commit is `null`, regression detection is
|
||||
// skipped for the matching benchmark.
|
||||
//
|
||||
// "regressions_first_commits": {
|
||||
// "some_benchmark": "352cdf", // Consider regressions only after this commit
|
||||
// "another_benchmark": null, // Skip regression detection altogether
|
||||
// }
|
||||
|
||||
// The thresholds for relative change in results, after which `asv
|
||||
// publish` starts reporting regressions. Dictionary of the same
|
||||
// form as in ``regressions_first_commits``, with values
|
||||
// indicating the thresholds. If multiple entries match, the
|
||||
// maximum is taken. If no entry matches, the default is 5%.
|
||||
//
|
||||
// "regressions_thresholds": {
|
||||
// "some_benchmark": 0.01, // Threshold of 1%
|
||||
// "another_benchmark": 0.5, // Threshold of 50%
|
||||
// }
|
||||
}
|
||||
@@ -1,3 +0,0 @@
|
||||
Contributed algorithms, which can be run via ``rllib train --run=contrib/<alg_name>``
|
||||
|
||||
See https://ray.readthedocs.io/en/latest/rllib-dev.html for guidelines.
|
||||
@@ -1,52 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ray.rllib.agents.trainer import Trainer, with_common_config
|
||||
from ray.rllib.utils.annotations import override
|
||||
|
||||
|
||||
# yapf: disable
|
||||
# __sphinx_doc_begin__
|
||||
class RandomAgent(Trainer):
    """Trainer that samples random actions and never learns."""

    _name = "RandomAgent"
    _default_config = with_common_config({
        "rollouts_per_iteration": 10,
    })

    @override(Trainer)
    def _init(self, config, env_creator):
        """Create the single env instance that is rolled out in-process."""
        self.env = env_creator(config["env_config"])

    @override(Trainer)
    def _train(self):
        """Run the configured number of random rollouts.

        Returns:
            dict with the mean episode reward and the number of env steps
            taken in this iteration.
        """
        episode_returns = []
        total_steps = 0
        num_rollouts = self.config["rollouts_per_iteration"]
        for _ in range(num_rollouts):
            self.env.reset()
            episode_return = 0.0
            done = False
            while not done:
                random_action = self.env.action_space.sample()
                _, step_reward, done, _ = self.env.step(random_action)
                episode_return += step_reward
                total_steps += 1
            episode_returns.append(episode_return)
        return {
            "episode_reward_mean": np.mean(episode_returns),
            "timesteps_this_iter": total_steps,
        }
|
||||
# __sphinx_doc_end__
|
||||
# don't enable yapf after, it's buggy here
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Smoke test when run as a script: one training iteration of random
    # rollouts on CartPole-v0 must reach a mean episode reward above 10.
    trainer = RandomAgent(
        env="CartPole-v0", config={"rollouts_per_iteration": 10})
    result = trainer.train()
    assert result["episode_reward_mean"] > 10, result
    print("Test: OK")
|
||||
@@ -1,15 +0,0 @@
|
||||
"""Registry of algorithm names for `rllib train --run=<alg_name>`"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
|
||||
def _import_random_agent():
    """Import and return the RandomAgent trainer class.

    The import happens inside the function, so the random_agent module is
    only loaded when this function is called.
    """
    from ray.rllib.contrib.random_agent.random_agent import RandomAgent
    return RandomAgent
|
||||
|
||||
|
||||
# Maps each `--run` algorithm name to a zero-argument function that imports
# and returns the corresponding trainer class.
CONTRIBUTED_ALGORITHMS = {
    "contrib/RandomAgent": _import_random_agent,
}
|
||||
Vendored
-11
@@ -1,11 +0,0 @@
|
||||
from ray.rllib.env.base_env import BaseEnv
|
||||
from ray.rllib.env.multi_agent_env import MultiAgentEnv
|
||||
from ray.rllib.env.external_env import ExternalEnv
|
||||
from ray.rllib.env.serving_env import ServingEnv
|
||||
from ray.rllib.env.vector_env import VectorEnv
|
||||
from ray.rllib.env.env_context import EnvContext
|
||||
|
||||
__all__ = [
|
||||
"BaseEnv", "MultiAgentEnv", "ExternalEnv", "VectorEnv", "ServingEnv",
|
||||
"EnvContext"
|
||||
]
|
||||
-291
@@ -1,291 +0,0 @@
|
||||
import numpy as np
|
||||
from collections import deque
|
||||
import gym
|
||||
from gym import spaces
|
||||
import cv2
|
||||
cv2.ocl.setUseOpenCL(False)
|
||||
|
||||
|
||||
def is_atari(env):
    """Return True if `env` looks like an image-based Atari environment.

    An env whose observation space has a shape of rank 2 or lower is
    rejected outright. Otherwise the env qualifies when it has an
    `unwrapped` attribute that in turn exposes an `ale` attribute.
    """
    obs_shape = getattr(env.observation_space, "shape", None)
    if obs_shape is not None and len(obs_shape) <= 2:
        return False
    if not hasattr(env, "unwrapped"):
        return False
    return hasattr(env.unwrapped, "ale")
|
||||
|
||||
|
||||
def get_wrapper_by_cls(env, cls):
    """Returns the gym env wrapper of the given class, or None.

    Walks the wrapper chain from the outside in via each wrapper's `env`
    attribute and stops at the first object that is an instance of `cls`,
    or at the first object that is not a gym.Wrapper.
    """
    layer = env
    while not isinstance(layer, cls):
        if not isinstance(layer, gym.Wrapper):
            return None
        layer = layer.env
    return layer
|
||||
|
||||
|
||||
class MonitorEnv(gym.Wrapper):
    """Wrapper that records per-episode reward and length statistics."""

    def __init__(self, env=None):
        """Record episodes stats prior to EpisodicLifeEnv, etc."""
        gym.Wrapper.__init__(self, env)
        # Stats of the episode in progress; None until the first reset().
        self._current_reward = None
        self._num_steps = None
        self._total_steps = None
        # Stats of finished episodes, appended on the following reset().
        self._episode_rewards = []
        self._episode_lengths = []
        self._num_episodes = 0
        # Number of finished episodes already yielded by
        # next_episode_results().
        self._num_returned = 0

    def reset(self, **kwargs):
        """Reset the env, archiving the stats of the episode just ended."""
        first_obs = self.env.reset(**kwargs)

        if self._total_steps is None:
            self._total_steps = sum(self._episode_lengths)

        episode_in_progress = self._current_reward is not None
        if episode_in_progress:
            self._episode_rewards.append(self._current_reward)
            self._episode_lengths.append(self._num_steps)
            self._num_episodes += 1

        self._current_reward = 0
        self._num_steps = 0
        return first_obs

    def step(self, action):
        """Step the env and accumulate the reward and step counters."""
        result = self.env.step(action)
        obs, rew, done, info = result
        self._current_reward += rew
        self._num_steps += 1
        self._total_steps += 1
        return (obs, rew, done, info)

    def get_episode_rewards(self):
        """Return the list of total rewards of the recorded episodes."""
        return self._episode_rewards

    def get_episode_lengths(self):
        """Return the list of lengths of the recorded episodes."""
        return self._episode_lengths

    def get_total_steps(self):
        """Return the running step counter."""
        return self._total_steps

    def next_episode_results(self):
        """Yield (reward, length) for episodes not yet yielded before.

        The "already yielded" marker is only advanced once the generator
        has been iterated to the end.
        """
        first_new = self._num_returned
        for idx in range(first_new, len(self._episode_rewards)):
            yield (self._episode_rewards[idx], self._episode_lengths[idx])
        self._num_returned = len(self._episode_rewards)
|
||||
|
||||
|
||||
class NoopResetEnv(gym.Wrapper):
    """Wrapper that starts each episode with a number of no-op steps."""

    def __init__(self, env, noop_max=30):
        """Sample initial states by taking random number of no-ops on reset.

        No-op is assumed to be action 0.
        """
        gym.Wrapper.__init__(self, env)
        self.noop_max = noop_max
        # When set, used instead of a randomly drawn number of no-ops.
        self.override_num_noops = None
        self.noop_action = 0
        assert env.unwrapped.get_action_meanings()[0] == "NOOP"

    def reset(self, **kwargs):
        """Do no-op action for a number of steps in [1, noop_max]."""
        self.env.reset(**kwargs)
        if self.override_num_noops is None:
            num_noops = self.unwrapped.np_random.randint(
                1, self.noop_max + 1)
        else:
            num_noops = self.override_num_noops
        assert num_noops > 0

        last_obs = None
        for _ in range(num_noops):
            last_obs, _, done, _ = self.env.step(self.noop_action)
            if done:
                # Episode ended during the no-ops: start over.
                last_obs = self.env.reset(**kwargs)
        return last_obs

    def step(self, ac):
        """Pass the action straight through to the wrapped env."""
        return self.env.step(ac)
|
||||
|
||||
|
||||
class ClipRewardEnv(gym.RewardWrapper):
    """Reward wrapper that maps every reward to its sign."""

    def __init__(self, env):
        gym.RewardWrapper.__init__(self, env)

    def reward(self, reward):
        """Bin reward to {+1, 0, -1} by its sign."""
        clipped = np.sign(reward)
        return clipped
|
||||
|
||||
|
||||
class FireResetEnv(gym.Wrapper):
    """Wrapper that takes actions 1 (FIRE) and 2 right after each reset."""

    def __init__(self, env):
        """Take action on reset.

        For environments that are fixed until firing."""
        gym.Wrapper.__init__(self, env)
        action_meanings = env.unwrapped.get_action_meanings()
        assert action_meanings[1] == "FIRE"
        assert len(env.unwrapped.get_action_meanings()) >= 3

    def reset(self, **kwargs):
        """Reset the env, then step action 1 followed by action 2.

        If either step ends the episode, the env is reset again before
        continuing. Returns the observation of the second step.
        """
        self.env.reset(**kwargs)
        _, _, done, _ = self.env.step(1)
        if done:
            self.env.reset(**kwargs)
        obs, _, done, _ = self.env.step(2)
        if done:
            self.env.reset(**kwargs)
        return obs

    def step(self, ac):
        """Pass the action straight through to the wrapped env."""
        return self.env.step(ac)
|
||||
|
||||
|
||||
class EpisodicLifeEnv(gym.Wrapper):
    """Wrapper that reports the loss of a life as the end of an episode."""

    def __init__(self, env):
        """Make end-of-life == end-of-episode, but only reset on true game over.

        Done by DeepMind for the DQN and co. since it helps value estimation.
        """
        gym.Wrapper.__init__(self, env)
        self.lives = 0
        # Whether the wrapped env itself reported done on the last step.
        self.was_real_done = True

    def step(self, action):
        """Step the env; flag done when a life was lost but some remain."""
        obs, reward, done, info = self.env.step(action)
        self.was_real_done = done
        # Check current lives, make loss of life terminal, then update
        # lives to handle bonus lives.
        remaining = self.env.unwrapped.ale.lives()
        if 0 < remaining < self.lives:
            # For Qbert we sometimes stay in the lives == 0 condition for
            # a few frames, so it is important to require lives > 0: that
            # way we only reset once the environment advertises done.
            done = True
        self.lives = remaining
        return obs, reward, done, info

    def reset(self, **kwargs):
        """Reset only when lives are exhausted.

        This way all states are still reachable even though lives are
        episodic, and the learner need not know about any of this
        behind-the-scenes.
        """
        if not self.was_real_done:
            # No-op step to advance from terminal/lost life state.
            obs, _, _, _ = self.env.step(0)
        else:
            obs = self.env.reset(**kwargs)
        self.lives = self.env.unwrapped.ale.lives()
        return obs
|
||||
|
||||
|
||||
class MaxAndSkipEnv(gym.Wrapper):
    """Wrapper that repeats each action and max-pools the last two frames."""

    def __init__(self, env, skip=4):
        """Return only every `skip`-th frame"""
        gym.Wrapper.__init__(self, env)
        # Most recent raw observations (for max pooling across time steps).
        buffer_shape = (2, ) + env.observation_space.shape
        self._obs_buffer = np.zeros(buffer_shape, dtype=np.uint8)
        self._skip = skip

    def step(self, action):
        """Repeat action, sum reward, and max over last observations."""
        reward_sum = 0.0
        done = None
        first_kept = self._skip - 2
        for frame_idx in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            # Slot 0 holds the second-to-last frame, slot 1 the last one.
            slot = frame_idx - first_kept
            if slot in (0, 1):
                self._obs_buffer[slot] = obs
            reward_sum += reward
            if done:
                break
        # Note that the observation on the done=True frame doesn't matter.
        pooled = self._obs_buffer.max(axis=0)
        return pooled, reward_sum, done, info

    def reset(self, **kwargs):
        """Reset the wrapped env and return its observation."""
        return self.env.reset(**kwargs)
|
||||
|
||||
|
||||
class WarpFrame(gym.ObservationWrapper):
    """Observation wrapper producing square, single-channel gray frames."""

    def __init__(self, env, dim):
        """Warp frames to the specified size (dim x dim)."""
        gym.ObservationWrapper.__init__(self, env)
        self.width = dim
        self.height = dim
        warped_shape = (self.height, self.width, 1)
        self.observation_space = spaces.Box(
            low=0, high=255, shape=warped_shape, dtype=np.uint8)

    def observation(self, frame):
        """Convert `frame` from RGB to gray, resize it, add a channel axis."""
        gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        target_size = (self.width, self.height)
        resized = cv2.resize(
            gray, target_size, interpolation=cv2.INTER_AREA)
        return resized[:, :, None]
|
||||
|
||||
|
||||
class FrameStack(gym.Wrapper):
    """Wrapper whose observation is the k most recent frames, concatenated.

    Frames are concatenated along axis 2, so the wrapped env's observations
    must have three dimensions.
    """

    def __init__(self, env, k):
        """Stack k last frames.

        Arguments:
            env: env to wrap; its observation space shape is read as
                (shp[0], shp[1], shp[2]).
            k (int): number of frames to stack.
        """
        gym.Wrapper.__init__(self, env)
        self.k = k
        # The bounded deque drops the oldest frame on every append.
        self.frames = deque([], maxlen=k)
        shp = env.observation_space.shape
        self.observation_space = spaces.Box(
            low=0,
            high=255,
            shape=(shp[0], shp[1], shp[2] * k),
            dtype=env.observation_space.dtype)

    def reset(self, **kwargs):
        """Reset the env and fill the stack with k copies of the first frame.

        Keyword arguments are forwarded to the wrapped env's reset(), as
        the other wrappers in this module do; calling reset() without
        arguments behaves as before.
        """
        ob = self.env.reset(**kwargs)
        for _ in range(self.k):
            self.frames.append(ob)
        return self._get_ob()

    def step(self, action):
        """Step the env and return the updated frame stack."""
        ob, reward, done, info = self.env.step(action)
        self.frames.append(ob)
        return self._get_ob(), reward, done, info

    def _get_ob(self):
        """Concatenate the k stored frames along the last (third) axis."""
        assert len(self.frames) == self.k
        return np.concatenate(self.frames, axis=2)
|
||||
|
||||
|
||||
class ScaledFloatFrame(gym.ObservationWrapper):
    """Observation wrapper that converts frames to float32 divided by 255."""

    def __init__(self, env):
        gym.ObservationWrapper.__init__(self, env)
        frame_shape = env.observation_space.shape
        self.observation_space = gym.spaces.Box(
            low=0, high=1, shape=frame_shape, dtype=np.float32)

    def observation(self, observation):
        """Return `observation` as a float32 array divided by 255."""
        # Careful! This undoes the memory optimization, use with smaller
        # replay buffers only.
        as_float = np.array(observation).astype(np.float32)
        return as_float / 255.0
|
||||
|
||||
|
||||
def wrap_deepmind(env, dim=84, framestack=True):
    """Configure environment for DeepMind-style Atari.

    Note that we assume reward clipping is done outside the wrapper.

    Args:
        env: The env to wrap. Its spec id and action meanings are inspected
            to decide which wrappers apply.
        dim (int): Dimension to resize observations to (dim x dim).
        framestack (bool): Whether to framestack observations.

    Returns:
        The env wrapped, innermost first, in MonitorEnv, NoopResetEnv,
        optionally MaxAndSkipEnv, EpisodicLifeEnv, optionally FireResetEnv,
        WarpFrame and optionally FrameStack.
    """
    # MonitorEnv goes innermost so that it records stats before the outer
    # wrappers (e.g. EpisodicLifeEnv) alter what counts as an episode.
    env = MonitorEnv(env)
    env = NoopResetEnv(env, noop_max=30)
    # Frame skipping is only added for env ids containing "NoFrameskip".
    if "NoFrameskip" in env.spec.id:
        env = MaxAndSkipEnv(env, skip=4)
    env = EpisodicLifeEnv(env)
    if "FIRE" in env.unwrapped.get_action_meanings():
        env = FireResetEnv(env)
    env = WarpFrame(env, dim)
    # env = ScaledFloatFrame(env)  # TODO: use for dqn?
    # env = ClipRewardEnv(env)  # reward clipping is handled by policy eval
    if framestack:
        env = FrameStack(env, 4)
    return env
|
||||
Vendored
-451
@@ -1,451 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from ray.rllib.env.external_env import ExternalEnv
|
||||
from ray.rllib.env.external_multi_agent_env import ExternalMultiAgentEnv
|
||||
from ray.rllib.env.vector_env import VectorEnv
|
||||
from ray.rllib.env.multi_agent_env import MultiAgentEnv
|
||||
from ray.rllib.utils.annotations import override, PublicAPI
|
||||
|
||||
ASYNC_RESET_RETURN = "async_reset_return"
|
||||
|
||||
|
||||
@PublicAPI
|
||||
class BaseEnv(object):
|
||||
"""The lowest-level env interface used by RLlib for sampling.
|
||||
|
||||
BaseEnv models multiple agents executing asynchronously in multiple
|
||||
environments. A call to poll() returns observations from ready agents
|
||||
keyed by their environment and agent ids, and actions for those agents
|
||||
can be sent back via send_actions().
|
||||
|
||||
All other env types can be adapted to BaseEnv. RLlib handles these
|
||||
conversions internally in RolloutWorker, for example:
|
||||
|
||||
gym.Env => rllib.VectorEnv => rllib.BaseEnv
|
||||
rllib.MultiAgentEnv => rllib.BaseEnv
|
||||
rllib.ExternalEnv => rllib.BaseEnv
|
||||
|
||||
Attributes:
|
||||
action_space (gym.Space): Action space. This must be defined for
|
||||
single-agent envs. Multi-agent envs can set this to None.
|
||||
observation_space (gym.Space): Observation space. This must be defined
|
||||
for single-agent envs. Multi-agent envs can set this to None.
|
||||
|
||||
Examples:
|
||||
>>> env = MyBaseEnv()
|
||||
>>> obs, rewards, dones, infos, off_policy_actions = env.poll()
|
||||
>>> print(obs)
|
||||
{
|
||||
"env_0": {
|
||||
"car_0": [2.4, 1.6],
|
||||
"car_1": [3.4, -3.2],
|
||||
},
|
||||
"env_1": {
|
||||
"car_0": [8.0, 4.1],
|
||||
},
|
||||
"env_2": {
|
||||
"car_0": [2.3, 3.3],
|
||||
"car_1": [1.4, -0.2],
|
||||
"car_3": [1.2, 0.1],
|
||||
},
|
||||
}
|
||||
>>> env.send_actions(
|
||||
actions={
|
||||
"env_0": {
|
||||
"car_0": 0,
|
||||
"car_1": 1,
|
||||
}, ...
|
||||
})
|
||||
>>> obs, rewards, dones, infos, off_policy_actions = env.poll()
|
||||
>>> print(obs)
|
||||
{
|
||||
"env_0": {
|
||||
"car_0": [4.1, 1.7],
|
||||
"car_1": [3.2, -4.2],
|
||||
}, ...
|
||||
}
|
||||
>>> print(dones)
|
||||
{
|
||||
"env_0": {
|
||||
"__all__": False,
|
||||
"car_0": False,
|
||||
"car_1": True,
|
||||
}, ...
|
||||
}
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def to_base_env(env,
|
||||
make_env=None,
|
||||
num_envs=1,
|
||||
remote_envs=False,
|
||||
remote_env_batch_wait_ms=0):
|
||||
"""Wraps any env type as needed to expose the async interface."""
|
||||
|
||||
from ray.rllib.env.remote_vector_env import RemoteVectorEnv
|
||||
if remote_envs and num_envs == 1:
|
||||
raise ValueError(
|
||||
"Remote envs only make sense to use if num_envs > 1 "
|
||||
"(i.e. vectorization is enabled).")
|
||||
|
||||
if not isinstance(env, BaseEnv):
|
||||
if isinstance(env, MultiAgentEnv):
|
||||
if remote_envs:
|
||||
env = RemoteVectorEnv(
|
||||
make_env,
|
||||
num_envs,
|
||||
multiagent=True,
|
||||
remote_env_batch_wait_ms=remote_env_batch_wait_ms)
|
||||
else:
|
||||
env = _MultiAgentEnvToBaseEnv(
|
||||
make_env=make_env,
|
||||
existing_envs=[env],
|
||||
num_envs=num_envs)
|
||||
elif isinstance(env, ExternalMultiAgentEnv):
|
||||
if num_envs != 1:
|
||||
raise ValueError(
|
||||
"ExternalMultiAgentEnv does not currently support "
|
||||
"num_envs > 1.")
|
||||
env = _ExternalEnvToBaseEnv(env, multiagent=True)
|
||||
elif isinstance(env, ExternalEnv):
|
||||
if num_envs != 1:
|
||||
raise ValueError(
|
||||
"ExternalEnv does not currently support num_envs > 1.")
|
||||
env = _ExternalEnvToBaseEnv(env)
|
||||
elif isinstance(env, VectorEnv):
|
||||
env = _VectorEnvToBaseEnv(env)
|
||||
else:
|
||||
if remote_envs:
|
||||
env = RemoteVectorEnv(
|
||||
make_env,
|
||||
num_envs,
|
||||
multiagent=False,
|
||||
remote_env_batch_wait_ms=remote_env_batch_wait_ms)
|
||||
else:
|
||||
env = VectorEnv.wrap(
|
||||
make_env=make_env,
|
||||
existing_envs=[env],
|
||||
num_envs=num_envs,
|
||||
action_space=env.action_space,
|
||||
observation_space=env.observation_space)
|
||||
env = _VectorEnvToBaseEnv(env)
|
||||
assert isinstance(env, BaseEnv), env
|
||||
return env
|
||||
|
||||
@PublicAPI
|
||||
def poll(self):
|
||||
"""Returns observations from ready agents.
|
||||
|
||||
The returns are two-level dicts mapping from env_id to a dict of
|
||||
agent_id to values. The number of agents and envs can vary over time.
|
||||
|
||||
Returns
|
||||
-------
|
||||
obs (dict): New observations for each ready agent.
|
||||
rewards (dict): Reward values for each ready agent. If the
|
||||
episode is just started, the value will be None.
|
||||
dones (dict): Done values for each ready agent. The special key
|
||||
"__all__" is used to indicate env termination.
|
||||
infos (dict): Info values for each ready agent.
|
||||
off_policy_actions (dict): Agents may take off-policy actions. When
|
||||
that happens, there will be an entry in this dict that contains
|
||||
the taken action. There is no need to send_actions() for agents
|
||||
that have already chosen off-policy actions.
|
||||
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
@PublicAPI
|
||||
def send_actions(self, action_dict):
    """Deliver actions to the agents currently running in this env.

    An action is expected for every ready agent that produced an
    observation in the preceding poll() call.

    Arguments:
        action_dict (dict): Action values keyed first by env_id and then
            by agent_id.
    """
    raise NotImplementedError
|
||||
|
||||
@PublicAPI
|
||||
def try_reset(self, env_id):
    """Try to reset the env identified by env_id.

    Envs that cannot be reset synchronously may return None; this base
    implementation always does.

    Returns:
        obs (dict|None): Observation after the reset, or None if a
            synchronous reset is not supported.
    """
    return None
|
||||
|
||||
@PublicAPI
|
||||
def get_unwrapped(self):
    """Expose the underlying gym envs, when there are any.

    Returns:
        envs (list): The underlying gym envs; this base implementation
            has none and returns an empty list.
    """
    return list()
|
||||
|
||||
@PublicAPI
|
||||
def stop(self):
    """Release resources by closing every underlying env that can close.

    Envs returned by get_unwrapped() that have no ``close`` attribute are
    skipped.
    """
    closable = (e for e in self.get_unwrapped() if hasattr(e, "close"))
    for underlying in closable:
        underlying.close()
|
||||
|
||||
|
||||
# Placeholder agent id used when an env hosts only a single agent.
_DUMMY_AGENT_ID = "agent0"


def _with_dummy_agent_id(env_id_to_values, dummy_id=_DUMMY_AGENT_ID):
    """Nest every per-env value under a single placeholder agent id.

    Arguments:
        env_id_to_values (dict): Mapping from env_id to a value.
        dummy_id: Key to nest each value under; defaults to
            _DUMMY_AGENT_ID.

    Returns:
        dict: Mapping from env_id to ``{dummy_id: value}``.
    """
    nested = {}
    for env_id, value in env_id_to_values.items():
        nested[env_id] = {dummy_id: value}
    return nested
|
||||
|
||||
|
||||
class _ExternalEnvToBaseEnv(BaseEnv):
    """Internal adapter of ExternalEnv to BaseEnv.

    Episodes of the wrapped external env play the role of sub-envs: the
    episode id is used as the env_id in everything poll() returns and in
    the dict send_actions() receives.
    """

    def __init__(self, external_env, preprocessor=None, multiagent=False):
        """Wrap the external env and start its serving thread.

        Arguments:
            external_env: The env to adapt; its start() method is called
                here.
            preprocessor: Optional object whose transform() is applied to
                each observation and whose observation_space replaces the
                env's own.
            multiagent (bool): Whether episodes exchange per-agent dicts
                rather than single values.
        """
        self.external_env = external_env
        self.prep = preprocessor
        self.multiagent = multiagent
        self.action_space = external_env.action_space
        if preprocessor:
            self.observation_space = preprocessor.observation_space
        else:
            self.observation_space = external_env.observation_space
        external_env.start()

    @override(BaseEnv)
    def poll(self):
        """Block until at least one episode has data, then return it.

        Raises:
            Exception: If the serving thread is found dead after a wakeup
                that still produced no data.
        """
        with self.external_env._results_avail_condition:
            results = self._poll()
            while len(results[0]) == 0:
                self.external_env._results_avail_condition.wait()
                results = self._poll()
                # is_alive() replaces isAlive(), which Python 3.9 removed.
                if not self.external_env.is_alive():
                    raise Exception("Serving thread has stopped.")
        limit = self.external_env._max_concurrent_episodes
        assert len(results[0]) < limit, \
            ("Too many concurrent episodes, were some leaked? This "
             "ExternalEnv was created with max_concurrent={}".format(limit))
        return results

    @override(BaseEnv)
    def send_actions(self, action_dict):
        """Put each episode's action onto that episode's action queue.

        In single-agent mode the action is first unwrapped from under the
        dummy agent id.
        """
        for env_id, actions in action_dict.items():
            if not self.multiagent:
                actions = actions[_DUMMY_AGENT_ID]
            self.external_env._episodes[env_id].action_queue.put(actions)

    def _poll(self):
        """Collect at most one pending data item from every episode.

        Episodes whose done flag is set are removed from the external
        env's episode table as they are visited.

        Returns:
            tuple: (obs, rewards, dones, infos, off_policy_actions), each
                keyed by episode id.
        """
        all_obs, all_rewards, all_dones, all_infos = {}, {}, {}, {}
        off_policy_actions = {}
        # Iterate over a copy because finished episodes are deleted below.
        for eid, episode in self.external_env._episodes.copy().items():
            data = episode.get_data()
            if self.multiagent:
                cur_done = episode.cur_done_dict["__all__"]
            else:
                cur_done = episode.cur_done
            if cur_done:
                del self.external_env._episodes[eid]
            if data:
                if self.prep:
                    all_obs[eid] = self.prep.transform(data["obs"])
                else:
                    all_obs[eid] = data["obs"]
                all_rewards[eid] = data["reward"]
                all_dones[eid] = data["done"]
                all_infos[eid] = data["info"]
                if "off_policy_action" in data:
                    off_policy_actions[eid] = data["off_policy_action"]
        if self.multiagent:
            # ensure a consistent set of keys
            # rely on all_obs having all possible keys for now
            for eid, eid_dict in all_obs.items():
                for agent_id in eid_dict.keys():
                    all_rewards[eid].setdefault(agent_id, 0.0)
                    all_dones[eid].setdefault(agent_id, False)
                    all_infos[eid].setdefault(agent_id, {})
            return (all_obs, all_rewards, all_dones, all_infos,
                    off_policy_actions)
        return (_with_dummy_agent_id(all_obs),
                _with_dummy_agent_id(all_rewards),
                _with_dummy_agent_id(all_dones, "__all__"),
                _with_dummy_agent_id(all_infos),
                _with_dummy_agent_id(off_policy_actions))
|
||||
|
||||
|
||||
class _VectorEnvToBaseEnv(BaseEnv):
    """Internal adapter of VectorEnv to BaseEnv.

    The caller is assumed to pass the complete vector of actions on every
    send_actions() call, and to call reset_at() on each finished
    environment before calling send_actions() again.
    """

    def __init__(self, vector_env):
        """Wrap vector_env, copying its spaces and env count."""
        count = vector_env.num_envs
        self.vector_env = vector_env
        self.action_space = vector_env.action_space
        self.observation_space = vector_env.observation_space
        self.num_envs = count
        # Observations are fetched on the first poll() call.
        self.new_obs = None
        self.cur_rewards = [None] * count
        self.cur_dones = [False] * count
        self.cur_infos = [None] * count

    @override(BaseEnv)
    def poll(self):
        """Return the buffered step results keyed by env index.

        The first call resets the vector env to obtain observations. The
        buffers are emptied after being read.
        """
        if self.new_obs is None:
            self.new_obs = self.vector_env.vector_reset()
        pending = (self.new_obs, self.cur_rewards, self.cur_dones,
                   self.cur_infos)
        obs, rewards, dones, infos = [dict(enumerate(v)) for v in pending]
        self.new_obs, self.cur_rewards = [], []
        self.cur_dones, self.cur_infos = [], []
        return (_with_dummy_agent_id(obs),
                _with_dummy_agent_id(rewards),
                _with_dummy_agent_id(dones, "__all__"),
                _with_dummy_agent_id(infos),
                {})

    @override(BaseEnv)
    def send_actions(self, action_dict):
        """Step the vector env with one action per env index."""
        action_vector = [
            action_dict[idx][_DUMMY_AGENT_ID]
            for idx in range(self.num_envs)
        ]
        step_out = self.vector_env.vector_step(action_vector)
        self.new_obs, self.cur_rewards, self.cur_dones, self.cur_infos = \
            step_out

    @override(BaseEnv)
    def try_reset(self, env_id):
        """Reset one sub-env and return its obs under the dummy agent id."""
        fresh_obs = self.vector_env.reset_at(env_id)
        return {_DUMMY_AGENT_ID: fresh_obs}

    @override(BaseEnv)
    def get_unwrapped(self):
        """Delegate to the wrapped vector env's get_unwrapped()."""
        return self.vector_env.get_unwrapped()
|
||||
|
||||
|
||||
class _MultiAgentEnvToBaseEnv(BaseEnv):
    """Internal adapter of MultiAgentEnv to BaseEnv.

    Vectorization is supported as well when num_envs > 1.
    """

    def __init__(self, make_env, existing_envs, num_envs):
        """Wrap existing multi-agent envs.

        Arguments:
            make_env (func|None): Factory producing a new multiagent env
                from its index. Required whenever fewer than num_envs
                existing envs are supplied.
            existing_envs (list): Already created multiagent envs. New
                envs are appended to this same list.
            num_envs (int): Total number of multiagent envs to maintain.
        """
        self.make_env = make_env
        self.envs = existing_envs
        self.num_envs = num_envs
        self.dones = set()
        # Top up with freshly made envs until the requested count is met.
        while len(self.envs) < self.num_envs:
            next_index = len(self.envs)
            self.envs.append(self.make_env(next_index))
        for candidate in self.envs:
            assert isinstance(candidate, MultiAgentEnv)
        self.env_states = [_MultiAgentEnvState(e) for e in self.envs]

    @override(BaseEnv)
    def poll(self):
        """Poll every sub-env state and key the results by env index."""
        obs, rewards, dones, infos = {}, {}, {}, {}
        for idx, state in enumerate(self.env_states):
            polled = state.poll()
            obs[idx], rewards[idx], dones[idx], infos[idx] = polled
        return obs, rewards, dones, infos, {}

    @override(BaseEnv)
    def send_actions(self, action_dict):
        """Step each addressed sub-env and record what it returns.

        Raises:
            ValueError: If an addressed env is already done, if the obs
                and reward key sets differ, if infos has keys that obs
                lacks, or if "__all__" is missing from the done dict.
        """
        for env_id, agent_dict in action_dict.items():
            if env_id in self.dones:
                raise ValueError("Env {} is already done".format(env_id))
            step_out = self.envs[env_id].step(agent_dict)
            obs, rewards, dones, infos = step_out
            assert isinstance(obs, dict), "Not a multi-agent obs"
            assert isinstance(rewards, dict), "Not a multi-agent reward"
            assert isinstance(dones, dict), "Not a multi-agent return"
            assert isinstance(infos, dict), "Not a multi-agent info"
            if set(obs) != set(rewards):
                raise ValueError(
                    "Key set for obs and rewards must be the same: "
                    "{} vs {}".format(obs.keys(), rewards.keys()))
            if set(infos) - set(obs):
                raise ValueError(
                    "Key set for infos must be a subset of obs: "
                    "{} vs {}".format(infos.keys(), obs.keys()))
            if "__all__" not in dones:
                raise ValueError(
                    "In multi-agent environments, '__all__': True|False "
                    "must be included in the 'done' dict: "
                    "got {}.".format(dones))
            if dones["__all__"]:
                self.dones.add(env_id)
            self.env_states[env_id].observe(obs, rewards, dones, infos)

    @override(BaseEnv)
    def try_reset(self, env_id):
        """Reset one sub-env and clear its done marker."""
        obs = self.env_states[env_id].reset()
        assert isinstance(obs, dict), "Not a multi-agent obs"
        if obs is not None:
            # discard() is a no-op when the env was not marked done.
            self.dones.discard(env_id)
        return obs

    @override(BaseEnv)
    def get_unwrapped(self):
        """Return the env held by each sub-env state, in index order."""
        return [env_state.env for env_state in self.env_states]
|
||||
|
||||
|
||||
class _MultiAgentEnvState(object):
    """Buffers the most recent outputs of one multi-agent env."""

    def __init__(self, env):
        assert isinstance(env, MultiAgentEnv)
        self.env = env
        self.initialized = False

    def poll(self):
        """Hand out the buffered outputs and clear the buffers.

        The env is reset on the first call so there is something to
        return.

        Returns:
            tuple: (obs, rewards, dones, infos) as last stored.
        """
        if not self.initialized:
            self.reset()
            self.initialized = True
        snapshot = (self.last_obs, self.last_rewards, self.last_dones,
                    self.last_infos)
        self.observe({}, {}, {"__all__": False}, {})
        return snapshot

    def observe(self, obs, rewards, dones, infos):
        """Store the outputs of a step for the next poll()."""
        self.last_obs = obs
        self.last_rewards = rewards
        self.last_dones = dones
        self.last_infos = infos

    def reset(self):
        """Reset the env and seed the buffers from its observations.

        Every agent present in the reset observation gets a None reward,
        a False done flag and an empty info dict.

        Returns:
            dict: The observation dict returned by the env's reset().
        """
        first_obs = self.env.reset()
        self.last_obs = first_obs
        self.last_rewards = dict.fromkeys(first_obs.keys())
        done_flags = dict.fromkeys(first_obs.keys(), False)
        # Each agent needs its own info dict, so fromkeys() is not used.
        self.last_infos = dict((agent_id, {}) for agent_id in first_obs.keys())
        done_flags["__all__"] = False
        self.last_dones = done_flags
        return self.last_obs
|
||||
Vendored
-19
@@ -1,19 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
# info key for the individual rewards of an agent, for example:
|
||||
# info: {
|
||||
# group_1: {
|
||||
# _group_rewards: [5, -1, 1], # 3 agents in this group
|
||||
# }
|
||||
# }
|
||||
GROUP_REWARDS = "_group_rewards"
|
||||
|
||||
# info key for the individual infos of an agent, for example:
|
||||
# info: {
|
||||
# group_1: {
|
||||
# _group_info: [{"foo": ...}, {}], # 2 agents in this group
|
||||
# }
|
||||
# }
|
||||
GROUP_INFO = "_group_info"
|
||||
Vendored
-42
@@ -1,42 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from ray.rllib.utils.annotations import PublicAPI
|
||||
|
||||
|
||||
@PublicAPI
|
||||
class EnvContext(dict):
    """Env configuration dict carrying extra rllib metadata.

    The metadata attributes let an environment be parameterized per
    process. For instance, `worker_index` could select which data file an
    environment loads when it is initialized.

    RLlib auto-sets these attributes when constructing registered envs.

    Attributes:
        worker_index (int): Uniquely identifies the worker the env is
            created in when there are multiple workers.
        vector_index (int): Uniquely identifies the env within its worker
            when there are multiple envs per worker.
        remote (bool): Whether environment should be remote or not.
    """

    def __init__(self, env_config, worker_index, vector_index=0, remote=False):
        super(EnvContext, self).__init__(env_config)
        self.remote = remote
        self.vector_index = vector_index
        self.worker_index = worker_index

    def copy_with_overrides(self,
                            env_config=None,
                            worker_index=None,
                            vector_index=None,
                            remote=None):
        """Return a new EnvContext with the given fields replaced.

        Any argument left as None keeps this context's current value.
        """
        if env_config is None:
            env_config = self
        if worker_index is None:
            worker_index = self.worker_index
        if vector_index is None:
            vector_index = self.vector_index
        if remote is None:
            remote = self.remote
        return EnvContext(env_config, worker_index, vector_index, remote)
|
||||
Vendored
-272
@@ -1,272 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from six.moves import queue
|
||||
import threading
|
||||
import uuid
|
||||
|
||||
from ray.rllib.utils.annotations import PublicAPI
|
||||
|
||||
|
||||
@PublicAPI
|
||||
class ExternalEnv(threading.Thread):
    """Environment whose episodes are driven by external agents.

    Control is inverted compared to simulator envs such as gym.Env, where
    the algorithm drives the simulation through env.step() calls. Here the
    environment asks the policy for actions and logs observations and
    rewards for training.

    ExternalEnv can back policy serving (serve HTTP requests in the run
    loop), ingestion of offline logs data (read offline transitions in the
    run loop), or other custom use cases that gym.Env does not express
    easily.

    Both on-policy actions (through self.get_action()) and off-policy
    actions (through self.log_action()) are supported.

    This env is thread-safe, but individual episodes must be executed
    serially.

    Attributes:
        action_space (gym.Space): Action space.
        observation_space (gym.Space): Observation space.

    Examples:
        >>> register_env("my_env", lambda config: YourExternalEnv(config))
        >>> trainer = DQNTrainer(env="my_env")
        >>> while True:
                print(trainer.train())
    """

    @PublicAPI
    def __init__(self, action_space, observation_space, max_concurrent=100):
        """Initialize an external env.

        Subclasses of ExternalEnv must call this from their __init__.

        Arguments:
            action_space (gym.Space): Action space of the env.
            observation_space (gym.Space): Observation space of the env.
            max_concurrent (int): Max number of active episodes to allow
                at once. Exceeding this limit raises an error.
        """
        threading.Thread.__init__(self)
        self.daemon = True
        self.action_space = action_space
        self.observation_space = observation_space
        # Active episodes by id, and the ids of episodes that have ended.
        self._episodes = {}
        self._finished = set()
        self._results_avail_condition = threading.Condition()
        self._max_concurrent_episodes = max_concurrent

    @PublicAPI
    def run(self):
        """Override this to implement the run loop.

        Your loop should continuously:
            1. Call self.start_episode(episode_id)
            2. Call self.get_action(episode_id, obs)
                    -or-
                    self.log_action(episode_id, obs, action)
            3. Call self.log_returns(episode_id, reward)
            4. Call self.end_episode(episode_id, obs)
            5. Wait if nothing to do.

        Multiple episodes may be started at the same time.
        """
        raise NotImplementedError

    @PublicAPI
    def start_episode(self, episode_id=None, training_enabled=True):
        """Record the start of an episode.

        Arguments:
            episode_id (str): Unique string id for the episode, or None
                to have one auto-assigned.
            training_enabled (bool): Whether experiences from this episode
                should be used to improve the policy.

        Returns:
            episode_id (str): Unique string id for the episode.

        Raises:
            ValueError: If the id belongs to a completed or already
                started episode.
        """
        new_id = uuid.uuid4().hex if episode_id is None else episode_id

        if new_id in self._finished:
            raise ValueError(
                "Episode {} has already completed.".format(new_id))
        if new_id in self._episodes:
            raise ValueError(
                "Episode {} is already started".format(new_id))

        tracker = _ExternalEnvEpisode(
            new_id, self._results_avail_condition, training_enabled)
        self._episodes[new_id] = tracker
        return new_id

    @PublicAPI
    def get_action(self, episode_id, observation):
        """Record an observation and get the on-policy action.

        Arguments:
            episode_id (str): Episode id returned from start_episode().
            observation (obj): Current environment observation.

        Returns:
            action (obj): Action from the env action space.
        """
        return self._get(episode_id).wait_for_action(observation)

    @PublicAPI
    def log_action(self, episode_id, observation, action):
        """Record an observation and the (off-policy) action taken.

        Arguments:
            episode_id (str): Episode id returned from start_episode().
            observation (obj): Current environment observation.
            action (obj): Action for the observation.
        """
        self._get(episode_id).log_action(observation, action)

    @PublicAPI
    def log_returns(self, episode_id, reward, info=None):
        """Record returns from the environment.

        The reward is attributed to the previous action taken by the
        episode and accumulates until the next action. If no reward is
        logged before the next action, a reward of 0.0 is assumed.

        Arguments:
            episode_id (str): Episode id returned from start_episode().
            reward (float): Reward from the environment.
            info (dict): Optional info dict.
        """
        tracker = self._get(episode_id)
        tracker.cur_reward += reward
        # An empty or missing info leaves the previous info in place.
        if info:
            tracker.cur_info = info

    @PublicAPI
    def end_episode(self, episode_id, observation):
        """Record the end of an episode.

        Arguments:
            episode_id (str): Episode id returned from start_episode().
            observation (obj): Current environment observation.
        """
        tracker = self._get(episode_id)
        self._finished.add(tracker.episode_id)
        tracker.done(observation)

    def _get(self, episode_id):
        """Look up a started episode.

        Raises:
            ValueError: If the episode has completed or was never started.
        """
        if episode_id in self._finished:
            raise ValueError(
                "Episode {} has already completed.".format(episode_id))
        try:
            return self._episodes[episode_id]
        except KeyError:
            raise ValueError("Episode {} not found.".format(episode_id))
|
||||
|
||||
|
||||
class _ExternalEnvEpisode(object):
    """Tracked state for each active episode.

    Outgoing step data is placed on data_queue and incoming actions are
    read from action_queue. In multi-agent mode the observation, action,
    reward, done and info fields are per-agent dicts kept in the
    ``*_dict`` attributes; otherwise they are single values.
    """

    def __init__(self,
                 episode_id,
                 results_avail_condition,
                 training_enabled,
                 multiagent=False):
        """Set up the queues and the per-step accumulators.

        Arguments:
            episode_id: Identifier of the episode.
            results_avail_condition: Condition notified each time a data
                item is queued.
            training_enabled (bool): When False, every data item's info
                carries ``training_enabled: False``.
            multiagent (bool): Whether per-agent dicts are used.
        """
        self.episode_id = episode_id
        self.results_avail_condition = results_avail_condition
        self.training_enabled = training_enabled
        self.multiagent = multiagent
        self.data_queue = queue.Queue()
        self.action_queue = queue.Queue()
        if multiagent:
            self.new_observation_dict = None
            self.new_action_dict = None
            self.cur_reward_dict = {}
            self.cur_done_dict = {"__all__": False}
            self.cur_info_dict = {}
        else:
            self.new_observation = None
            self.new_action = None
            self.cur_reward = 0.0
            self.cur_done = False
            self.cur_info = {}

    def get_data(self):
        """Pop the oldest queued data item, or return None if none waits."""
        try:
            return self.data_queue.get_nowait()
        except queue.Empty:
            return None

    def log_action(self, observation, action):
        """Queue an observation with the off-policy action already taken.

        Waits up to 60 seconds for the response on the action queue and
        discards it.
        """
        if self.multiagent:
            self.new_observation_dict = observation
            self.new_action_dict = action
        else:
            self.new_observation = observation
            self.new_action = action
        self._send()
        self.action_queue.get(True, timeout=60.0)

    def wait_for_action(self, observation):
        """Queue an observation and return the action sent in response.

        Waits up to 60 seconds on the action queue.
        """
        if self.multiagent:
            self.new_observation_dict = observation
        else:
            self.new_observation = observation
        self._send()
        return self.action_queue.get(True, timeout=60.0)

    def done(self, observation):
        """Mark the episode as done and queue its final observation."""
        if self.multiagent:
            self.new_observation_dict = observation
            self.cur_done_dict = {"__all__": True}
        else:
            self.new_observation = observation
            self.cur_done = True
        self._send()

    def _send(self):
        """Queue the current step data and notify the results condition.

        The pending observation, action and reward are cleared afterwards;
        the done flag and the info are kept.
        """
        if self.multiagent:
            item = {
                "obs": self.new_observation_dict,
                "reward": self.cur_reward_dict,
                "done": self.cur_done_dict,
                "info": self.cur_info_dict,
            }
            if self.new_action_dict is not None:
                item["off_policy_action"] = self.new_action_dict
            self.new_observation_dict = None
            self.new_action_dict = None
            self.cur_reward_dict = {}
        else:
            item = {
                "obs": self.new_observation,
                "reward": self.cur_reward,
                "done": self.cur_done,
                "info": self.cur_info,
            }
            if self.new_action is not None:
                item["off_policy_action"] = self.new_action
            self.new_observation = None
            self.new_action = None
            self.cur_reward = 0.0
        if not self.training_enabled:
            # Tag a copy: the info dict may have been supplied by the
            # caller, and must not be modified behind its back.
            item["info"] = dict(item["info"], training_enabled=False)
        with self.results_avail_condition:
            self.data_queue.put_nowait(item)
            self.results_avail_condition.notify()
|
||||
-149
@@ -1,149 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import uuid
|
||||
|
||||
from ray.rllib.utils.annotations import override, PublicAPI
|
||||
from ray.rllib.env.external_env import ExternalEnv, _ExternalEnvEpisode
|
||||
|
||||
|
||||
@PublicAPI
|
||||
class ExternalMultiAgentEnv(ExternalEnv):
    """This is the multi-agent version of ExternalEnv."""

    @PublicAPI
    def __init__(self, action_space, observation_space, max_concurrent=100):
        """Initialize a multi-agent external env.

        ExternalMultiAgentEnv subclasses must call this during their
        __init__.

        Arguments:
            action_space (gym.Space): Action space of the env.
            observation_space (gym.Space): Observation space of the env.
            max_concurrent (int): Max number of active episodes to allow
                at once. Exceeding this limit raises an error.

        Raises:
            ValueError: If either space is a dict and the two spaces do
                not expose the same agent ids.
        """
        ExternalEnv.__init__(self, action_space, observation_space,
                             max_concurrent)

        # we require to know all agents' spaces
        if isinstance(self.action_space, dict) or isinstance(
                self.observation_space, dict):
            try:
                action_ids = self.action_space.keys()
                obs_ids = self.observation_space.keys()
            except AttributeError:
                # Only one of the two spaces is keyed by agent id; report
                # that as a mismatch instead of leaking AttributeError.
                raise ValueError(
                    "Action space and obs space must both be keyed by "
                    "agent id when either one is: {} {}".format(
                        self.action_space, self.observation_space))
            if not (action_ids == obs_ids):
                raise ValueError("Agent ids disagree for action space and obs "
                                 "space dict: {} {}".format(
                                     action_ids, obs_ids))

    @PublicAPI
    def run(self):
        """Override this to implement the multi-agent run loop.

        Your loop should continuously:
            1. Call self.start_episode(episode_id)
            2. Call self.get_action(episode_id, obs_dict)
                    -or-
                    self.log_action(episode_id, obs_dict, action_dict)
            3. Call self.log_returns(episode_id, reward_dict)
            4. Call self.end_episode(episode_id, obs_dict)
            5. Wait if nothing to do.

        Multiple episodes may be started at the same time.
        """
        raise NotImplementedError

    @PublicAPI
    @override(ExternalEnv)
    def start_episode(self, episode_id=None, training_enabled=True):
        """Record the start of a multi-agent episode.

        Arguments:
            episode_id (str): Unique string id for the episode or None for
                it to be auto-assigned.
            training_enabled (bool): Whether to use experiences for this
                episode to improve the policy.

        Returns:
            episode_id (str): Unique string id for the episode.

        Raises:
            ValueError: If the id belongs to a completed or already
                started episode.
        """
        if episode_id is None:
            episode_id = uuid.uuid4().hex

        if episode_id in self._finished:
            raise ValueError(
                "Episode {} has already completed.".format(episode_id))

        if episode_id in self._episodes:
            raise ValueError(
                "Episode {} is already started".format(episode_id))

        self._episodes[episode_id] = _ExternalEnvEpisode(
            episode_id,
            self._results_avail_condition,
            training_enabled,
            multiagent=True)

        return episode_id

    @PublicAPI
    @override(ExternalEnv)
    def get_action(self, episode_id, observation_dict):
        """Record an observation and get the on-policy action.

        observation_dict is expected to contain the observation
        of all agents acting in this episode step.

        Arguments:
            episode_id (str): Episode id returned from start_episode().
            observation_dict (dict): Current environment observation.

        Returns:
            action (dict): Action from the env action space.
        """
        episode = self._get(episode_id)
        return episode.wait_for_action(observation_dict)

    @PublicAPI
    @override(ExternalEnv)
    def log_action(self, episode_id, observation_dict, action_dict):
        """Record an observation and (off-policy) action taken.

        Arguments:
            episode_id (str): Episode id returned from start_episode().
            observation_dict (dict): Current environment observation.
            action_dict (dict): Action for the observation.
        """
        episode = self._get(episode_id)
        episode.log_action(observation_dict, action_dict)

    @PublicAPI
    @override(ExternalEnv)
    def log_returns(self, episode_id, reward_dict, info_dict=None):
        """Record returns from the environment.

        The reward will be attributed to the previous action taken by the
        episode. Rewards accumulate until the next action. If no reward is
        logged before the next action, a reward of 0.0 is assumed.

        Arguments:
            episode_id (str): Episode id returned from start_episode().
            reward_dict (dict): Reward from the environment agents.
            info_dict (dict): Optional info dict.
        """
        episode = self._get(episode_id)

        # Accumulate reward per agent: add to what an agent already has,
        # or start a new entry for an agent seen for the first time.
        for agent, rew in reward_dict.items():
            if agent in episode.cur_reward_dict:
                episode.cur_reward_dict[agent] += rew
            else:
                episode.cur_reward_dict[agent] = rew
        # An empty or missing info_dict leaves the previous info in place.
        if info_dict:
            episode.cur_info_dict = info_dict

    @PublicAPI
    @override(ExternalEnv)
    def end_episode(self, episode_id, observation_dict):
        """Record the end of an episode.

        Arguments:
            episode_id (str): Episode id returned from start_episode().
            observation_dict (dict): Current environment observation.
        """
        episode = self._get(episode_id)
        self._finished.add(episode.episode_id)
        episode.done(observation_dict)
|
||||
-107
@@ -1,107 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from collections import OrderedDict
|
||||
|
||||
from ray.rllib.env.constants import GROUP_REWARDS, GROUP_INFO
|
||||
from ray.rllib.env.multi_agent_env import MultiAgentEnv
|
||||
|
||||
|
||||
# TODO(ekl) we should add some unit tests for this
|
||||
class _GroupAgentsWrapper(MultiAgentEnv):
    """Wraps a MultiAgentEnv environment with agents grouped as specified.

    See multi_agent_env.py for the specification of groups.

    This API is experimental.
    """

    def __init__(self, env, groups, obs_space=None, act_space=None):
        """Wrap an existing multi-agent env to group agents together.

        See MultiAgentEnv.with_agent_groups() for usage info.

        Arguments:
            env (MultiAgentEnv): env to wrap
            groups (dict): Grouping spec as documented in MultiAgentEnv
            obs_space (Space): Optional observation space for the grouped
                env. Must be a tuple space.
            act_space (Space): Optional action space for the grouped env.
                Must be a tuple space.

        Raises:
            ValueError: If an agent id is listed in more than one group.
        """
        self.env = env
        self.groups = groups
        # Reverse index from each member agent id to its group id.
        self.agent_id_to_group = {}
        for group_id, agent_ids in groups.items():
            for agent_id in agent_ids:
                if agent_id in self.agent_id_to_group:
                    raise ValueError(
                        "Agent id {} is in multiple groups: {}".format(
                            agent_id, groups))
                self.agent_id_to_group[agent_id] = group_id
        if obs_space is not None:
            self.observation_space = obs_space
        if act_space is not None:
            self.action_space = act_space

    def reset(self):
        """Reset the wrapped env and return its observations grouped."""
        obs = self.env.reset()
        return self._group_items(obs)

    def step(self, action_dict):
        """Step the wrapped env with ungrouped actions, regroup outputs.

        Group rewards are summed; the individual values are preserved in
        the group's info under GROUP_REWARDS. A group is done only when
        all of its members are done. Member infos are collected in a list
        under GROUP_INFO.
        """
        # Ungroup and send actions
        action_dict = self._ungroup_items(action_dict)
        obs, rewards, dones, infos = self.env.step(action_dict)

        # Apply grouping transforms to the env outputs
        obs = self._group_items(obs)
        rewards = self._group_items(
            rewards, agg_fn=lambda gvals: list(gvals.values()))
        dones = self._group_items(
            dones, agg_fn=lambda gvals: all(gvals.values()))
        infos = self._group_items(
            infos, agg_fn=lambda gvals: {GROUP_INFO: list(gvals.values())})

        # Aggregate rewards, but preserve the original values in infos
        for agent_id, rew in rewards.items():
            if isinstance(rew, list):
                rewards[agent_id] = sum(rew)
                infos.setdefault(agent_id, {})[GROUP_REWARDS] = rew

        return obs, rewards, dones, infos

    def _ungroup_items(self, items):
        """Expand each group's value sequence back into per-agent entries.

        Entries whose key is not a group id are passed through unchanged.
        """
        out = {}
        for agent_id, value in items.items():
            if agent_id in self.groups:
                assert len(value) == len(self.groups[agent_id]), \
                    (agent_id, value, self.groups)
                for a, v in zip(self.groups[agent_id], value):
                    out[a] = v
            else:
                out[agent_id] = value
        return out

    def _group_items(self, items, agg_fn=lambda gvals: list(gvals.values())):
        """Collapse per-agent entries into one entry per group.

        Arguments:
            items (dict): Mapping from agent id to a value.
            agg_fn (func): Called with an OrderedDict of a group's member
                values, in group order; its result becomes the group's
                entry.

        Returns:
            dict: Grouped entries, plus the entries of ungrouped agents
                unchanged.

        Raises:
            ValueError: If some but not all members of a group appear in
                items.
        """
        grouped_items = {}
        for agent_id, item in items.items():
            if agent_id not in self.agent_id_to_group:
                grouped_items[agent_id] = item
                continue
            group_id = self.agent_id_to_group[agent_id]
            if group_id in grouped_items:
                continue  # already added
            group_out = OrderedDict()
            for a in self.groups[group_id]:
                if a not in items:
                    raise ValueError(
                        "Missing member of group {}: {}: {}".format(
                            group_id, a, items))
                group_out[a] = items[a]
            grouped_items[group_id] = agg_fn(group_out)
        return grouped_items
|
||||
-114
@@ -1,114 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from ray.rllib.utils.annotations import PublicAPI
|
||||
|
||||
|
||||
@PublicAPI
class MultiAgentEnv(object):
    """An environment that hosts multiple independent agents.

    Agents are identified by (string) agent ids. Note that these "agents" here
    are not to be confused with RLlib agents.

    Subclasses must implement reset() and step(); the base implementations
    raise NotImplementedError.

    Examples:
        >>> env = MyMultiAgentEnv()
        >>> obs = env.reset()
        >>> print(obs)
        {
            "car_0": [2.4, 1.6],
            "car_1": [3.4, -3.2],
            "traffic_light_1": [0, 3, 5, 1],
        }
        >>> obs, rewards, dones, infos = env.step(
                action_dict={
                    "car_0": 1, "car_1": 0, "traffic_light_1": 2,
                })
        >>> print(rewards)
        {
            "car_0": 3,
            "car_1": -1,
            "traffic_light_1": 0,
        }
        >>> print(dones)
        {
            "car_0": False,    # car_0 is still running
            "car_1": True,     # car_1 is done
            "__all__": False,  # the env is not done
        }
        >>> print(infos)
        {
            "car_0": {},  # info for car_0
            "car_1": {},  # info for car_1
        }
    """

    @PublicAPI
    def reset(self):
        """Resets the env and returns observations from ready agents.

        Returns:
            obs (dict): New observations for each ready agent.
        """
        raise NotImplementedError

    @PublicAPI
    def step(self, action_dict):
        """Returns observations from ready agents.

        The returns are dicts mapping from agent_id strings to values. The
        number of agents in the env can vary over time.

        Arguments:
            action_dict (dict): Actions to apply, keyed by agent id.

        Returns:
            obs (dict): New observations for each ready agent.
            rewards (dict): Reward values for each ready agent. If the
                episode is just started, the value will be None.
            dones (dict): Done values for each ready agent. The special key
                "__all__" (required) is used to indicate env termination.
            infos (dict): Optional info values for each agent id.
        """
        raise NotImplementedError

# yapf: disable
# __grouping_doc_begin__
    @PublicAPI
    def with_agent_groups(self, groups, obs_space=None, act_space=None):
        """Convenience method for grouping together agents in this env.

        An agent group is a list of agent ids that are mapped to a single
        logical agent. All agents of the group must act at the same time in the
        environment. The grouped agent exposes Tuple action and observation
        spaces that are the concatenated action and obs spaces of the
        individual agents.

        The rewards of all the agents in a group are summed. The individual
        agent rewards are available under the "individual_rewards" key of the
        group info return.

        Agent grouping is required to leverage algorithms such as Q-Mix.

        This API is experimental.

        Arguments:
            groups (dict): Mapping from group id to a list of the agent ids
                of group members. If an agent id is not present in any group
                value, it will be left ungrouped.
            obs_space (Space): Optional observation space for the grouped
                env. Must be a tuple space.
            act_space (Space): Optional action space for the grouped env.
                Must be a tuple space.

        Returns:
            A _GroupAgentsWrapper around this env.

        Examples:
            >>> env = YourMultiAgentEnv(...)
            >>> grouped_env = env.with_agent_groups({
            ...   "group1": ["agent1", "agent2", "agent3"],
            ...   "group2": ["agent4", "agent5"],
            ... })
        """

        # Imported at call time rather than at module import.
        from ray.rllib.env.group_agents_wrapper import _GroupAgentsWrapper
        return _GroupAgentsWrapper(self, groups, obs_space, act_space)

# __grouping_doc_end__
# yapf: enable
|
||||
-130
@@ -1,130 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import logging
|
||||
|
||||
import ray
|
||||
from ray.rllib.env.base_env import BaseEnv, _DUMMY_AGENT_ID, ASYNC_RESET_RETURN
|
||||
from ray.rllib.utils.memory import ray_get_and_free
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class RemoteVectorEnv(BaseEnv):
    """BaseEnv whose child envs each live in their own remote actor.

    Observations are handed back in whatever batches become ready, so
    inference is batched dynamically. Child envs may be single- or
    multi-agent (selected by ``multiagent``).
    """

    def __init__(self, make_env, num_envs, multiagent,
                 remote_env_batch_wait_ms):
        self.make_local_env = make_env
        self.num_envs = num_envs
        self.multiagent = multiagent
        # ray.wait() takes seconds; the config value is in milliseconds.
        self.poll_timeout = remote_env_batch_wait_ms / 1000

        # Both are created on the first poll() call.
        self.actors = None
        self.pending = None

    def poll(self):
        """Block until at least one env has a result, then return them.

        Returns:
            tuple: (obs, rewards, dones, infos, off_policy_actions), the
                first four keyed by env index; the last is always empty.
        """
        if self.actors is None:
            actor_cls = (_RemoteMultiAgentEnv
                         if self.multiagent else _RemoteSingleAgentEnv)
            launched = []
            for idx in range(self.num_envs):
                logger.info("Launching env {} in remote actor".format(idx))
                launched.append(actor_cls.remote(self.make_local_env, idx))
            self.actors = launched

        if self.pending is None:
            # Kick off an initial reset on every actor.
            self.pending = {
                actor.reset.remote(): actor
                for actor in self.actors
            }

        # Keep waiting until at least one outstanding call has finished.
        while True:
            ready, _ = ray.wait(
                list(self.pending),
                num_returns=len(self.pending),
                timeout=self.poll_timeout)
            if ready:
                break

        # Results below are keyed by env index in [0, num_envs).
        obs, rewards, dones, infos = {}, {}, {}, {}
        env_ids = set()
        for obj_id in ready:
            source_actor = self.pending.pop(obj_id)
            env_id = self.actors.index(source_actor)
            env_ids.add(env_id)
            (obs[env_id], rewards[env_id], dones[env_id],
             infos[env_id]) = ray_get_and_free(obj_id)

        logger.debug("Got obs batch for actors {}".format(env_ids))
        return obs, rewards, dones, infos, {}

    def send_actions(self, action_dict):
        """Start a remote step() on each env listed in ``action_dict``."""
        for env_id, actions in action_dict.items():
            target = self.actors[env_id]
            self.pending[target.step.remote(actions)] = target

    def try_reset(self, env_id):
        """Start a remote reset(); the result arrives via a later poll()."""
        target = self.actors[env_id]
        self.pending[target.reset.remote()] = target
        return ASYNC_RESET_RETURN

    def stop(self):
        """Ask every launched actor to terminate (no-op before first poll)."""
        if self.actors is None:
            return
        for actor in self.actors:
            actor.__ray_terminate__.remote()
|
||||
|
||||
|
||||
@ray.remote(num_cpus=0)
|
||||
class _RemoteMultiAgentEnv(object):
    """Thin actor-side holder for a multi-agent env built by ``make_env``."""

    def __init__(self, make_env, i):
        self.env = make_env(i)

    def reset(self):
        """Reset the env and return a step()-shaped 4-tuple.

        Rewards are 0 and infos are empty dicts for every agent present in
        the reset observation; only "__all__" is reported in dones.
        """
        first_obs = self.env.reset()
        agent_ids = list(first_obs.keys())
        zero_rewards = {aid: 0 for aid in agent_ids}
        # A fresh dict per agent so callers cannot alias one another's info.
        empty_infos = {aid: {} for aid in agent_ids}
        not_done = {"__all__": False}
        return first_obs, zero_rewards, not_done, empty_infos

    def step(self, action_dict):
        """Forward ``action_dict`` to the wrapped env unchanged."""
        return self.env.step(action_dict)
|
||||
|
||||
|
||||
@ray.remote(num_cpus=0)
class _RemoteSingleAgentEnv(object):
    """Actor-side holder that presents a gym env in multi-agent dict form."""

    def __init__(self, make_env, i):
        self.env = make_env(i)

    def reset(self):
        """Reset the env and return a step()-shaped 4-tuple of dicts."""
        first_obs = {_DUMMY_AGENT_ID: self.env.reset()}
        agent_ids = list(first_obs.keys())
        zero_rewards = {aid: 0 for aid in agent_ids}
        empty_infos = {aid: {} for aid in agent_ids}
        not_done = {"__all__": False}
        return first_obs, zero_rewards, not_done, empty_infos

    def step(self, action):
        """Step with the dummy agent's action; wrap each result in a dict."""
        ob, reward, finished, extra = self.env.step(action[_DUMMY_AGENT_ID])
        obs = {_DUMMY_AGENT_ID: ob}
        rew = {_DUMMY_AGENT_ID: reward}
        done = {_DUMMY_AGENT_ID: finished}
        info = {_DUMMY_AGENT_ID: extra}
        # With a single agent, env termination equals that agent's done flag.
        done["__all__"] = done[_DUMMY_AGENT_ID]
        return obs, rew, done, info
|
||||
Vendored
-8
@@ -1,8 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from ray.rllib.env.external_env import ExternalEnv
|
||||
|
||||
# Backwards-compatibility alias: this class was renamed to ExternalEnv in 0.6,
# so the old name simply points at the same class object.
ServingEnv = ExternalEnv
|
||||
Vendored
-126
@@ -1,126 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import logging
|
||||
import numpy as np
|
||||
|
||||
from ray.rllib.utils.annotations import override, PublicAPI
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@PublicAPI
class VectorEnv(object):
    """An environment that supports batch evaluation.

    Subclasses must define the following attributes:

    Attributes:
        action_space (gym.Space): Action space of individual envs.
        observation_space (gym.Space): Observation space of individual envs.
        num_envs (int): Number of envs in this vector env.
    """

    @staticmethod
    def wrap(make_env=None,
             existing_envs=None,
             num_envs=1,
             action_space=None,
             observation_space=None):
        """Builds a _VectorizedGymEnv from a factory and/or existing envs.

        Arguments:
            make_env (func|None): Factory for additional envs.
            existing_envs (list|None): Already-created envs; a missing or
                empty value is replaced by a new empty list.
            num_envs (int): Desired total number of envs.
            action_space: Optional action space, forwarded unchanged.
            observation_space: Optional observation space, forwarded
                unchanged.
        """
        return _VectorizedGymEnv(make_env, existing_envs or [], num_envs,
                                 action_space, observation_space)

    @PublicAPI
    def vector_reset(self):
        """Resets all environments.

        Returns:
            obs (list): Vector of observations from each environment.
        """
        raise NotImplementedError

    @PublicAPI
    def reset_at(self, index):
        """Resets a single environment.

        Arguments:
            index (int): Position of the environment to reset.

        Returns:
            obs (obj): Observations from the reset environment.
        """
        raise NotImplementedError

    @PublicAPI
    def vector_step(self, actions):
        """Vectorized step.

        Arguments:
            actions (list): Actions for each env.

        Returns:
            obs (list): New observations for each env.
            rewards (list): Reward values for each env.
            dones (list): Done values for each env.
            infos (list): Info values for each env.
        """
        raise NotImplementedError

    @PublicAPI
    def get_unwrapped(self):
        """Returns the underlying env instances."""
        raise NotImplementedError
|
||||
|
||||
|
||||
class _VectorizedGymEnv(VectorEnv):
    """Internal wrapper for gym envs to implement VectorEnv.

    Arguments:
        make_env (func|None): Factory that produces a new gym env. Must be
            defined if the number of existing envs is less than num_envs.
        existing_envs (list): List of existing gym envs.
        num_envs (int): Desired num gym envs to keep total.
        action_space: Optional override; defaults to the first env's space.
        observation_space: Optional override; defaults to the first env's
            space.
    """

    def __init__(self,
                 make_env,
                 existing_envs,
                 num_envs,
                 action_space=None,
                 observation_space=None):
        self.make_env = make_env
        # NOTE: the caller's list is kept (not copied) and extended in place.
        self.envs = existing_envs
        self.num_envs = num_envs
        # Top up with factory-made envs; the factory receives the env index.
        while len(self.envs) < self.num_envs:
            self.envs.append(self.make_env(len(self.envs)))
        self.action_space = action_space or self.envs[0].action_space
        self.observation_space = observation_space or \
            self.envs[0].observation_space

    @override(VectorEnv)
    def vector_reset(self):
        return [e.reset() for e in self.envs]

    @override(VectorEnv)
    def reset_at(self, index):
        return self.envs[index].reset()

    @override(VectorEnv)
    def vector_step(self, actions):
        """Steps the first ``num_envs`` envs and validates their outputs.

        Raises:
            ValueError: If a reward is not a finite real scalar, or an info
                is not a dict (dict subclasses are accepted).
        """
        obs_batch, rew_batch, done_batch, info_batch = [], [], [], []
        for i in range(self.num_envs):
            obs, r, done, info = self.envs[i].step(actions[i])
            if not np.isscalar(r) or not np.isreal(r) or not np.isfinite(r):
                raise ValueError(
                    "Reward should be finite scalar, got {} ({})".format(
                        r, type(r)))
            # isinstance rather than an exact type match, so that dict
            # subclasses such as OrderedDict are not rejected.
            if not isinstance(info, dict):
                raise ValueError("Info should be a dict, got {} ({})".format(
                    info, type(info)))
            obs_batch.append(obs)
            rew_batch.append(r)
            done_batch.append(done)
            info_batch.append(info)
        return obs_batch, rew_batch, done_batch, info_batch

    @override(VectorEnv)
    def get_unwrapped(self):
        return self.envs
|
||||
@@ -1,31 +0,0 @@
|
||||
from ray.rllib.evaluation.episode import MultiAgentEpisode
|
||||
from ray.rllib.evaluation.rollout_worker import RolloutWorker
|
||||
from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator
|
||||
from ray.rllib.evaluation.interface import EvaluatorInterface
|
||||
from ray.rllib.evaluation.policy_graph import PolicyGraph
|
||||
from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph
|
||||
from ray.rllib.evaluation.torch_policy_graph import TorchPolicyGraph
|
||||
from ray.rllib.evaluation.sample_batch import SampleBatch, MultiAgentBatch
|
||||
from ray.rllib.evaluation.sample_batch_builder import (
|
||||
SampleBatchBuilder, MultiAgentSampleBatchBuilder)
|
||||
from ray.rllib.evaluation.sampler import SyncSampler, AsyncSampler
|
||||
from ray.rllib.evaluation.postprocessing import compute_advantages
|
||||
from ray.rllib.evaluation.metrics import collect_metrics
|
||||
|
||||
# Names exported by `from <this package> import *`; each one is imported at
# the top of this module.
__all__ = [
    "EvaluatorInterface",
    "RolloutWorker",
    "PolicyGraph",
    "TFPolicyGraph",
    "TorchPolicyGraph",
    "SampleBatch",
    "MultiAgentBatch",
    "SampleBatchBuilder",
    "MultiAgentSampleBatchBuilder",
    "SyncSampler",
    "AsyncSampler",
    "compute_advantages",
    "collect_metrics",
    "MultiAgentEpisode",
    "PolicyEvaluator",
]
|
||||
@@ -1,201 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from collections import defaultdict
|
||||
import random
|
||||
|
||||
import numpy as np
|
||||
|
||||
from ray.rllib.env.base_env import _DUMMY_AGENT_ID
|
||||
from ray.rllib.utils.annotations import DeveloperAPI
|
||||
|
||||
|
||||
@DeveloperAPI
class MultiAgentEpisode(object):
    """Tracks the current state of a (possibly multi-agent) episode.

    Attributes:
        new_batch_builder (func): Create a new MultiAgentSampleBatchBuilder.
        add_extra_batch (func): Return a built MultiAgentBatch to the sampler.
        batch_builder (obj): Batch builder for the current episode.
        total_reward (float): Summed reward across all agents in this episode.
        length (int): Length of this episode.
        episode_id (int): Unique id identifying this trajectory.
        agent_rewards (dict): Summed rewards broken down by agent. Keys are
            (agent_id, policy_id) tuples.
        custom_metrics (dict): Dict where you can add custom metrics.
        user_data (dict): Dict that you can use for temporary storage.

    Use case 1: Model-based rollouts in multi-agent:
        A custom compute_actions() function in a policy can inspect the
        current episode state and perform a number of rollouts based on the
        policies and state of other agents in the environment.

    Use case 2: Returning extra rollouts data.
        The model rollouts can be returned back to the sampler by calling:

        >>> batch = episode.new_batch_builder()
        >>> for each transition:
               batch.add_values(...)  # see sampler for usage
        >>> episode.add_extra_batch(batch.build_and_reset())
    """

    def __init__(self, policies, policy_mapping_fn, batch_builder_factory,
                 extra_batch_callback):
        self.new_batch_builder = batch_builder_factory
        self.add_extra_batch = extra_batch_callback
        self.batch_builder = batch_builder_factory()
        self.total_reward = 0.0
        self.length = 0
        # randrange() requires an int bound: float bounds are deprecated
        # since Python 3.10 and raise TypeError from 3.12 on.
        self.episode_id = random.randrange(int(2e9))
        self.agent_rewards = defaultdict(float)
        self.custom_metrics = {}
        self.user_data = {}
        self._policies = policies
        self._policy_mapping_fn = policy_mapping_fn
        self._next_agent_index = 0
        self._agent_to_index = {}
        self._agent_to_policy = {}
        self._agent_to_rnn_state = {}
        self._agent_to_last_obs = {}
        self._agent_to_last_raw_obs = {}
        self._agent_to_last_info = {}
        self._agent_to_last_action = {}
        self._agent_to_last_pi_info = {}
        self._agent_to_prev_action = {}
        self._agent_reward_history = defaultdict(list)

    @DeveloperAPI
    def soft_reset(self):
        """Clears rewards and metrics, but retains RNN and other state.

        This is used to carry state across multiple logical episodes in the
        same env (i.e., if `soft_horizon` is set).
        """
        self.length = 0
        self.episode_id = random.randrange(int(2e9))
        self.total_reward = 0.0
        self.agent_rewards = defaultdict(float)
        self._agent_reward_history = defaultdict(list)

    @DeveloperAPI
    def policy_for(self, agent_id=_DUMMY_AGENT_ID):
        """Returns the policy for the specified agent.

        If the agent is new, the policy mapping fn will be called to bind the
        agent to a policy for the duration of the episode.
        """

        if agent_id not in self._agent_to_policy:
            self._agent_to_policy[agent_id] = self._policy_mapping_fn(agent_id)
        return self._agent_to_policy[agent_id]

    @DeveloperAPI
    def last_observation_for(self, agent_id=_DUMMY_AGENT_ID):
        """Returns the last observation for the specified agent, or None."""

        return self._agent_to_last_obs.get(agent_id)

    @DeveloperAPI
    def last_raw_obs_for(self, agent_id=_DUMMY_AGENT_ID):
        """Returns the last un-preprocessed obs for the agent, or None."""

        return self._agent_to_last_raw_obs.get(agent_id)

    @DeveloperAPI
    def last_info_for(self, agent_id=_DUMMY_AGENT_ID):
        """Returns the last info for the specified agent, or None."""

        return self._agent_to_last_info.get(agent_id)

    @DeveloperAPI
    def last_action_for(self, agent_id=_DUMMY_AGENT_ID):
        """Returns the last action for the specified agent, or zeros."""

        if agent_id in self._agent_to_last_action:
            return _flatten_action(self._agent_to_last_action[agent_id])
        else:
            # No action yet: sample one only to learn the flattened shape
            # and dtype, then return zeros of that form.
            policy = self._policies[self.policy_for(agent_id)]
            flat = _flatten_action(policy.action_space.sample())
            return np.zeros_like(flat)

    @DeveloperAPI
    def prev_action_for(self, agent_id=_DUMMY_AGENT_ID):
        """Returns the previous action for the specified agent, or zeros."""

        if agent_id in self._agent_to_prev_action:
            return _flatten_action(self._agent_to_prev_action[agent_id])
        else:
            # We're at t=0, so return all zeros.
            return np.zeros_like(self.last_action_for(agent_id))

    @DeveloperAPI
    def prev_reward_for(self, agent_id=_DUMMY_AGENT_ID):
        """Returns the previous reward for the specified agent."""

        history = self._agent_reward_history[agent_id]
        if len(history) >= 2:
            return history[-2]
        else:
            # We're at t=0, so there is no previous reward, just return zero.
            return 0.0

    @DeveloperAPI
    def rnn_state_for(self, agent_id=_DUMMY_AGENT_ID):
        """Returns the last RNN state for the specified agent.

        If none was recorded yet, the policy's initial state is stored and
        returned.
        """

        if agent_id not in self._agent_to_rnn_state:
            policy = self._policies[self.policy_for(agent_id)]
            self._agent_to_rnn_state[agent_id] = policy.get_initial_state()
        return self._agent_to_rnn_state[agent_id]

    @DeveloperAPI
    def last_pi_info_for(self, agent_id=_DUMMY_AGENT_ID):
        """Returns the last policy info object for the specified agent.

        Raises:
            KeyError: If no policy info was recorded for the agent.
        """

        return self._agent_to_last_pi_info[agent_id]

    def _add_agent_rewards(self, reward_dict):
        """Accumulates per-agent rewards; None rewards are skipped."""
        for agent_id, reward in reward_dict.items():
            if reward is not None:
                self.agent_rewards[agent_id,
                                   self.policy_for(agent_id)] += reward
                self.total_reward += reward
                self._agent_reward_history[agent_id].append(reward)

    def _set_rnn_state(self, agent_id, rnn_state):
        self._agent_to_rnn_state[agent_id] = rnn_state

    def _set_last_observation(self, agent_id, obs):
        self._agent_to_last_obs[agent_id] = obs

    def _set_last_raw_obs(self, agent_id, obs):
        self._agent_to_last_raw_obs[agent_id] = obs

    def _set_last_info(self, agent_id, info):
        self._agent_to_last_info[agent_id] = info

    def _set_last_action(self, agent_id, action):
        # Shift the current "last" action into the "prev" slot first.
        if agent_id in self._agent_to_last_action:
            self._agent_to_prev_action[agent_id] = \
                self._agent_to_last_action[agent_id]
        self._agent_to_last_action[agent_id] = action

    def _set_last_pi_info(self, agent_id, pi_info):
        self._agent_to_last_pi_info[agent_id] = pi_info

    def _agent_index(self, agent_id):
        """Returns a stable index for the agent, assigned on first sight."""
        if agent_id not in self._agent_to_index:
            self._agent_to_index[agent_id] = self._next_agent_index
            self._next_agent_index += 1
        return self._agent_to_index[agent_id]
|
||||
|
||||
|
||||
def _flatten_action(action):
    """Concatenate a list/tuple action into one flat array.

    Anything that is not a list or tuple is returned untouched.
    """
    if not isinstance(action, (list, tuple)):
        return action
    # Flatten each component to 1-D before joining them end to end.
    pieces = [np.reshape(component, [-1]) for component in action]
    return np.concatenate(pieces, axis=0).flatten()
|
||||
@@ -1,128 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import os
|
||||
|
||||
from ray.rllib.utils.annotations import DeveloperAPI
|
||||
|
||||
|
||||
@DeveloperAPI
class EvaluatorInterface(object):
    """This is the interface between policy optimizers and policy evaluation.

    See also: RolloutWorker
    """

    @DeveloperAPI
    def sample(self):
        """Returns a batch of experience sampled from this evaluator.

        This method must be implemented by subclasses.

        Returns:
            SampleBatch|MultiAgentBatch: A columnar batch of experiences
            (e.g., tensors), or a multi-agent batch.

        Examples:
            >>> print(ev.sample())
            SampleBatch({"obs": [1, 2, 3], "action": [0, 1, 0], ...})
        """

        raise NotImplementedError

    @DeveloperAPI
    def learn_on_batch(self, samples):
        """Update policies based on the given batch.

        This is the equivalent to apply_gradients(compute_gradients(samples)),
        but can be optimized to avoid pulling gradients into CPU memory.

        Either this or the combination of compute/apply grads must be
        implemented by subclasses.

        Arguments:
            samples: Batch to learn from, passed to compute_gradients().

        Returns:
            info: dictionary of extra metadata from compute_gradients().

        Examples:
            >>> samples = ev.sample()
            >>> ev.learn_on_batch(samples)
        """

        # Default implementation: compute the gradients, then apply them.
        grads, info = self.compute_gradients(samples)
        self.apply_gradients(grads)
        return info

    @DeveloperAPI
    def compute_gradients(self, samples):
        """Returns a gradient computed w.r.t the specified samples.

        Either this or learn_on_batch() must be implemented by subclasses.

        Returns:
            (grads, info): A list of gradients that can be applied on a
            compatible evaluator. In the multi-agent case, returns a dict
            of gradients keyed by policy ids. An info dictionary of
            extra metadata is also returned.

        Examples:
            >>> samples = ev1.sample()
            >>> grads, info = ev2.compute_gradients(samples)
        """

        raise NotImplementedError

    @DeveloperAPI
    def apply_gradients(self, grads):
        """Applies the given gradients to this evaluator's weights.

        Either this or learn_on_batch() must be implemented by subclasses.

        Examples:
            >>> samples = ev1.sample()
            >>> grads, info = ev2.compute_gradients(samples)
            >>> ev1.apply_gradients(grads)
        """

        raise NotImplementedError

    @DeveloperAPI
    def get_weights(self):
        """Returns the model weights of this Evaluator.

        This method must be implemented by subclasses.

        Returns:
            object: weights that can be set on a compatible evaluator.
            info: dictionary of extra metadata.

        Examples:
            >>> weights = ev1.get_weights()
        """

        raise NotImplementedError

    @DeveloperAPI
    def set_weights(self, weights):
        """Sets the model weights of this Evaluator.

        This method must be implemented by subclasses.

        Examples:
            >>> weights = ev1.get_weights()
            >>> ev2.set_weights(weights)
        """

        raise NotImplementedError

    @DeveloperAPI
    def get_host(self):
        """Returns the hostname of the process running this evaluator."""

        # NOTE: os.uname() is only available on Unix-like platforms.
        return os.uname()[1]

    @DeveloperAPI
    def apply(self, func, *args):
        """Apply the given function to this evaluator instance.

        Arguments:
            func (func): Called as func(self, *args).
            args: Extra positional arguments forwarded to func.

        Returns:
            Whatever func returns.
        """

        return func(self, *args)
|
||||
@@ -1,173 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import logging
|
||||
import numpy as np
|
||||
import collections
|
||||
|
||||
import ray
|
||||
from ray.rllib.evaluation.rollout_metrics import RolloutMetrics
|
||||
from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID
|
||||
from ray.rllib.offline.off_policy_estimator import OffPolicyEstimate
|
||||
from ray.rllib.policy.policy import LEARNER_STATS_KEY
|
||||
from ray.rllib.utils.annotations import DeveloperAPI
|
||||
from ray.rllib.utils.memory import ray_get_and_free
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@DeveloperAPI
def get_learner_stats(grad_info):
    """Return optimization stats reported from the policy.

    Arguments:
        grad_info (dict): Either a single-policy info dict containing
            LEARNER_STATS_KEY, or a dict of such info dicts.

    Returns:
        The value under LEARNER_STATS_KEY if present at the top level;
        otherwise a dict mapping each key whose value is a dict containing
        LEARNER_STATS_KEY to that nested stats value (possibly empty).

    Example:
        >>> grad_info = evaluator.learn_on_batch(samples)
        >>> print(get_learner_stats(grad_info))
        {"vf_loss": ..., "policy_loss": ...}
    """

    if LEARNER_STATS_KEY in grad_info:
        return grad_info[LEARNER_STATS_KEY]

    multiagent_stats = {}
    for k, v in grad_info.items():
        # isinstance (not an exact type match) so dict subclasses count too.
        if isinstance(v, dict) and LEARNER_STATS_KEY in v:
            multiagent_stats[k] = v[LEARNER_STATS_KEY]

    return multiagent_stats
|
||||
|
||||
|
||||
@DeveloperAPI
def collect_metrics(local_worker=None,
                    remote_workers=None,
                    to_be_collected=None,
                    timeout_seconds=180):
    """Gathers episode metrics from RolloutWorker instances.

    Arguments:
        local_worker: Optional local worker to query directly.
        remote_workers (list|None): Remote workers to query; None means
            no remote workers.
        to_be_collected (list|None): Pending results from earlier calls;
            None means nothing is pending.
        timeout_seconds (int): Max time to wait for remote results.

    Returns:
        dict: Summary produced by summarize_episodes().
    """

    # Fresh lists per call instead of shared mutable default arguments.
    remote_workers = [] if remote_workers is None else remote_workers
    to_be_collected = [] if to_be_collected is None else to_be_collected

    episodes, _ = collect_episodes(
        local_worker,
        remote_workers,
        to_be_collected,
        timeout_seconds=timeout_seconds)
    # The collected episodes are used both as history and as new episodes.
    metrics = summarize_episodes(episodes, episodes)
    return metrics
|
||||
|
||||
|
||||
@DeveloperAPI
|
||||
def collect_episodes(local_worker=None,
                     remote_workers=None,
                     to_be_collected=None,
                     timeout_seconds=180):
    """Gathers new episodes metrics tuples from the given evaluators.

    Arguments:
        local_worker: Optional local worker; its get_metrics() result is
            included.
        remote_workers (list|None): Remote workers to query; None means
            no remote workers.
        to_be_collected (list|None): Pending results from earlier calls.
            They are only waited on when remote_workers is non-empty.
        timeout_seconds (int): Max time to wait for remote results.

    Returns:
        (episodes, to_be_collected): The flattened list of collected
            metrics, and the results that were not ready in time.
    """

    # Fresh list per call: a shared default would be handed back to the
    # caller below and could be mutated across calls.
    if to_be_collected is None:
        to_be_collected = []

    if remote_workers:
        pending = [
            a.apply.remote(lambda ev: ev.get_metrics()) for a in remote_workers
        ] + to_be_collected
        collected, to_be_collected = ray.wait(
            pending, num_returns=len(pending), timeout=timeout_seconds * 1.0)
        if pending and not collected:
            logger.warning(
                "WARNING: collected no metrics in {} seconds".format(
                    timeout_seconds))
        metric_lists = ray_get_and_free(collected)
    else:
        metric_lists = []

    if local_worker:
        metric_lists.append(local_worker.get_metrics())
    episodes = []
    for metrics in metric_lists:
        episodes.extend(metrics)
    return episodes, to_be_collected
|
||||
|
||||
|
||||
@DeveloperAPI
def summarize_episodes(episodes, new_episodes):
    """Summarizes a set of episode metrics tuples.

    Arguments:
        episodes: smoothed set of episodes including historical ones
        new_episodes: just the new episodes in this iteration

    Returns:
        dict: Reward/length statistics, per-policy mean rewards, custom
            metric mean/min/max, sampler perf means and off-policy
            estimator means. Reward and length statistics are NaN when
            there are no rollout episodes.
    """

    episodes, estimates = _partition(episodes)
    new_episodes, _ = _partition(new_episodes)

    episode_rewards = []
    episode_lengths = []
    policy_rewards = collections.defaultdict(list)
    custom_metrics = collections.defaultdict(list)
    perf_stats = collections.defaultdict(list)
    for episode in episodes:
        episode_lengths.append(episode.episode_length)
        episode_rewards.append(episode.episode_reward)
        for k, v in episode.custom_metrics.items():
            custom_metrics[k].append(v)
        for k, v in episode.perf_stats.items():
            perf_stats[k].append(v)
        for (_, policy_id), reward in episode.agent_rewards.items():
            if policy_id != DEFAULT_POLICY_ID:
                policy_rewards[policy_id].append(reward)

    if episode_rewards:
        min_reward = min(episode_rewards)
        max_reward = max(episode_rewards)
        avg_reward = np.mean(episode_rewards)
        avg_length = np.mean(episode_lengths)
    else:
        # No rollouts: report NaN directly instead of calling np.mean([]),
        # which returns NaN but also emits a RuntimeWarning.
        min_reward = max_reward = float("nan")
        avg_reward = avg_length = float("nan")

    policy_reward_mean = {
        policy_id: np.mean(rewards)
        for policy_id, rewards in policy_rewards.items()
    }

    # Built as a separate dict so that a metric literally named
    # "<k>_mean"/"_min"/"_max" cannot clobber another metric's summary.
    custom_summary = {}
    for k, v_list in custom_metrics.items():
        custom_summary[k + "_mean"] = np.mean(v_list)
        # Min/max ignore NaN entries; the mean above does not.
        filt = [v for v in v_list if not np.isnan(v)]
        if filt:
            custom_summary[k + "_min"] = np.min(filt)
            custom_summary[k + "_max"] = np.max(filt)
        else:
            custom_summary[k + "_min"] = float("nan")
            custom_summary[k + "_max"] = float("nan")

    sampler_perf = {k: np.mean(v_list) for k, v_list in perf_stats.items()}

    # estimator name -> metric name -> list of reported values
    estimators = collections.defaultdict(lambda: collections.defaultdict(list))
    for e in estimates:
        acc = estimators[e.estimator_name]
        for k, v in e.metrics.items():
            acc[k].append(v)
    off_policy_estimator = {
        name: {k: np.mean(v_list)
               for k, v_list in metrics.items()}
        for name, metrics in estimators.items()
    }

    return dict(
        episode_reward_max=max_reward,
        episode_reward_min=min_reward,
        episode_reward_mean=avg_reward,
        episode_len_mean=avg_length,
        episodes_this_iter=len(new_episodes),
        policy_reward_mean=policy_reward_mean,
        custom_metrics=custom_summary,
        sampler_perf=sampler_perf,
        off_policy_estimator=off_policy_estimator)
|
||||
|
||||
|
||||
def _partition(episodes):
    """Split metrics into (rollout metrics, off-policy estimates).

    Raises:
        ValueError: If an item is neither a RolloutMetrics nor an
            OffPolicyEstimate.
    """

    rollouts, estimates = [], []
    for item in episodes:
        if isinstance(item, RolloutMetrics):
            bucket = rollouts
        elif isinstance(item, OffPolicyEstimate):
            bucket = estimates
        else:
            raise ValueError("Unknown metric type: {}".format(item))
        bucket.append(item)
    return rollouts, estimates
|
||||
@@ -1,9 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from ray.rllib.utils import renamed_class
|
||||
from ray.rllib.evaluation import RolloutWorker
|
||||
|
||||
# Legacy name for RolloutWorker, produced via renamed_class().
# NOTE(review): presumably renamed_class() flags use of the old name as
# deprecated -- confirm against its definition in ray.rllib.utils.
PolicyEvaluator = renamed_class(
    RolloutWorker, old_name="rllib.evaluation.PolicyEvaluator")
|
||||
@@ -1,8 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from ray.rllib.policy.policy import Policy
|
||||
from ray.rllib.utils import renamed_class
|
||||
|
||||
# Backward-compatibility alias: PolicyGraph is the old name of Policy.
# NOTE(review): renamed_class presumably flags use of the old name -- confirm
# in ray.rllib.utils.
PolicyGraph = renamed_class(Policy, old_name="PolicyGraph")
|
||||
@@ -1,70 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import numpy as np
|
||||
import scipy.signal
|
||||
from ray.rllib.policy.sample_batch import SampleBatch
|
||||
from ray.rllib.utils.annotations import DeveloperAPI
|
||||
|
||||
|
||||
def discount(x, gamma):
    """Return the discounted cumulative sum of x along axis 0.

    Each output element satisfies out[t] = x[t] + gamma * out[t + 1]. This is
    computed by running a first-order linear filter over the reversed
    sequence and reversing the result back.

    Args:
        x (array-like): Sequence of values with time on axis 0.
        gamma (float): Discount factor.

    Returns:
        np.ndarray: Discounted sums, same length as x along axis 0.
    """
    reversed_x = x[::-1]
    filtered = scipy.signal.lfilter([1], [1, -gamma], reversed_x, axis=0)
    return filtered[::-1]
|
||||
|
||||
|
||||
class Postprocessing(object):
    """Names of the batch columns added during trajectory postprocessing."""

    # Column holding the per-step advantage estimates.
    ADVANTAGES = "advantages"
    # Column holding the per-step value-function targets.
    VALUE_TARGETS = "value_targets"
|
||||
|
||||
|
||||
@DeveloperAPI
def compute_advantages(rollout, last_r, gamma=0.9, lambda_=1.0, use_gae=True):
    """Compute advantages and value targets for a single trajectory.

    Args:
        rollout (SampleBatch): SampleBatch of a single trajectory.
        last_r (float): Value estimate for the last observation; it is
            appended to the sequence as the bootstrap value.
        gamma (float): Discount factor.
        lambda_ (float): Parameter for GAE.
        use_gae (bool): Whether to use Generalized Advantage Estimation.

    Returns:
        SampleBatch (SampleBatch): The stacked rollout columns plus the
            Postprocessing.ADVANTAGES and Postprocessing.VALUE_TARGETS
            columns.
    """
    num_steps = len(rollout[SampleBatch.ACTIONS])
    traj = {key: np.stack(rollout[key]) for key in rollout}

    if not use_gae:
        padded_rewards = np.concatenate(
            [rollout[SampleBatch.REWARDS],
             np.array([last_r])])
        # Discounted return, dropping the bootstrap entry at the end.
        advantages = discount(padded_rewards, gamma)[:-1]
        # TODO(ekl): support using a critic without GAE
        value_targets = np.zeros_like(advantages)
    else:
        assert SampleBatch.VF_PREDS in rollout, "Values not found!"
        values = np.concatenate(
            [rollout[SampleBatch.VF_PREDS],
             np.array([last_r])])
        td_residuals = (
            traj[SampleBatch.REWARDS] + gamma * values[1:] - values[:-1])
        # This formula for the advantage comes from
        # "Generalized Advantage Estimation": https://arxiv.org/abs/1506.02438
        advantages = discount(td_residuals, gamma * lambda_)
        # Targets are built from the advantages before the float32 cast.
        value_targets = (
            advantages + traj[SampleBatch.VF_PREDS]).copy().astype(np.float32)

    traj[Postprocessing.ADVANTAGES] = advantages.copy().astype(np.float32)
    traj[Postprocessing.VALUE_TARGETS] = value_targets

    assert all(col.shape[0] == num_steps for col in traj.values()), \
        "Rollout stacked incorrectly!"
    return SampleBatch(traj)
|
||||
@@ -1,11 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import collections
|
||||
|
||||
# Define this in its own file, see #5125
# Record of the metrics reported for one episode.
RolloutMetrics = collections.namedtuple(
    "RolloutMetrics",
    [
        "episode_length",
        "episode_reward",
        "agent_rewards",
        "custom_metrics",
        "perf_stats",
    ])
|
||||
@@ -1,819 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import random
|
||||
import numpy as np
|
||||
import gym
|
||||
import logging
|
||||
import pickle
|
||||
|
||||
import ray
|
||||
from ray.rllib.env.atari_wrappers import wrap_deepmind, is_atari
|
||||
from ray.rllib.env.base_env import BaseEnv
|
||||
from ray.rllib.env.env_context import EnvContext
|
||||
from ray.rllib.env.external_env import ExternalEnv
|
||||
from ray.rllib.env.multi_agent_env import MultiAgentEnv
|
||||
from ray.rllib.env.external_multi_agent_env import ExternalMultiAgentEnv
|
||||
from ray.rllib.env.vector_env import VectorEnv
|
||||
from ray.rllib.evaluation.interface import EvaluatorInterface
|
||||
from ray.rllib.evaluation.sampler import AsyncSampler, SyncSampler
|
||||
from ray.rllib.policy.sample_batch import MultiAgentBatch, DEFAULT_POLICY_ID
|
||||
from ray.rllib.policy.policy import Policy
|
||||
from ray.rllib.policy.tf_policy import TFPolicy
|
||||
from ray.rllib.offline import NoopOutput, IOContext, OutputWriter, InputReader
|
||||
from ray.rllib.offline.is_estimator import ImportanceSamplingEstimator
|
||||
from ray.rllib.offline.wis_estimator import WeightedImportanceSamplingEstimator
|
||||
from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.models.preprocessors import NoPreprocessor
|
||||
from ray.rllib.utils import merge_dicts
|
||||
from ray.rllib.utils.annotations import override, DeveloperAPI
|
||||
from ray.rllib.utils.debug import disable_log_once_globally, log_once, \
|
||||
summarize, enable_periodic_logging
|
||||
from ray.rllib.utils.filter import get_filter
|
||||
from ray.rllib.utils.tf_run_builder import TFRunBuilder
|
||||
from ray.rllib.utils import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Handle to the current rollout worker, which will be set to the most recently
|
||||
# created RolloutWorker in this process. This can be helpful to access in
|
||||
# custom env or policy classes for debugging or advanced use cases.
|
||||
_global_worker = None
|
||||
|
||||
|
||||
@DeveloperAPI
def get_global_worker():
    """Return the most recently created RolloutWorker in this process.

    Returns:
        RolloutWorker|None: The module-level worker handle, or None if no
            worker has been created yet.
    """
    # Reading a module global does not need a `global` declaration.
    return _global_worker
|
||||
|
||||
|
||||
@DeveloperAPI
|
||||
class RolloutWorker(EvaluatorInterface):
|
||||
"""Common experience collection class.
|
||||
|
||||
This class wraps a policy instance and an environment class to
|
||||
collect experiences from the environment. You can create many replicas of
|
||||
this class as Ray actors to scale RL training.
|
||||
|
||||
This class supports vectorized and multi-agent policy evaluation (e.g.,
|
||||
VectorEnv, MultiAgentEnv, etc.)
|
||||
|
||||
Examples:
|
||||
>>> # Create a rollout worker and using it to collect experiences.
|
||||
>>> worker = RolloutWorker(
|
||||
... env_creator=lambda _: gym.make("CartPole-v0"),
|
||||
... policy=PGTFPolicy)
|
||||
>>> print(worker.sample())
|
||||
SampleBatch({
|
||||
"obs": [[...]], "actions": [[...]], "rewards": [[...]],
|
||||
"dones": [[...]], "new_obs": [[...]]})
|
||||
|
||||
>>> # Creating a multi-agent rollout worker
|
||||
>>> worker = RolloutWorker(
|
||||
... env_creator=lambda _: MultiAgentTrafficGrid(num_cars=25),
|
||||
... policies={
|
||||
... # Use an ensemble of two policies for car agents
|
||||
... "car_policy1":
|
||||
... (PGTFPolicy, Box(...), Discrete(...), {"gamma": 0.99}),
|
||||
... "car_policy2":
|
||||
... (PGTFPolicy, Box(...), Discrete(...), {"gamma": 0.95}),
|
||||
... # Use a single shared policy for all traffic lights
|
||||
... "traffic_light_policy":
|
||||
... (PGTFPolicy, Box(...), Discrete(...), {}),
|
||||
... },
|
||||
... policy_mapping_fn=lambda agent_id:
|
||||
... random.choice(["car_policy1", "car_policy2"])
|
||||
... if agent_id.startswith("car_") else "traffic_light_policy")
|
||||
>>> print(worker.sample())
|
||||
MultiAgentBatch({
|
||||
"car_policy1": SampleBatch(...),
|
||||
"car_policy2": SampleBatch(...),
|
||||
"traffic_light_policy": SampleBatch(...)})
|
||||
"""
|
||||
|
||||
@DeveloperAPI
@classmethod
def as_remote(cls, num_cpus=None, num_gpus=None, resources=None):
    """Return this class wrapped by ray.remote with the given resources.

    Args:
        num_cpus: Forwarded to ray.remote as num_cpus.
        num_gpus: Forwarded to ray.remote as num_gpus.
        resources: Forwarded to ray.remote as resources.
    """
    make_remote = ray.remote(
        num_cpus=num_cpus, num_gpus=num_gpus, resources=resources)
    return make_remote(cls)
|
||||
|
||||
@DeveloperAPI
def __init__(self,
             env_creator,
             policy,
             policy_mapping_fn=None,
             policies_to_train=None,
             tf_session_creator=None,
             batch_steps=100,
             batch_mode="truncate_episodes",
             episode_horizon=None,
             preprocessor_pref="deepmind",
             sample_async=False,
             compress_observations=False,
             num_envs=1,
             observation_filter="NoFilter",
             clip_rewards=None,
             clip_actions=True,
             env_config=None,
             model_config=None,
             policy_config=None,
             worker_index=0,
             monitor_path=None,
             log_dir=None,
             log_level=None,
             callbacks=None,
             input_creator=lambda ioctx: ioctx.default_sampler_input(),
             input_evaluation=frozenset([]),
             output_creator=lambda ioctx: NoopOutput(),
             remote_worker_envs=False,
             remote_env_batch_wait_ms=0,
             soft_horizon=False,
             no_done_at_end=False,
             seed=None,
             _fake_sampler=False):
    """Initialize a rollout worker.

    Arguments:
        env_creator (func): Function that returns a gym.Env given an
            EnvContext wrapped configuration.
        policy (class|dict): Either a class implementing
            Policy, or a dictionary of policy id strings to
            (Policy, obs_space, action_space, config) tuples. If a
            dict is specified, then we are in multi-agent mode and a
            policy_mapping_fn should also be set.
        policy_mapping_fn (func): A function that maps agent ids to
            policy ids in multi-agent mode. This function will be called
            each time a new agent appears in an episode, to bind that agent
            to a policy for the duration of the episode.
        policies_to_train (list): Optional whitelist of policies to train,
            or None for all policies.
        tf_session_creator (func): A function that returns a TF session.
            This is optional and only useful with TFPolicy.
        batch_steps (int): The target number of env transitions to include
            in each sample batch returned from this worker.
        batch_mode (str): One of the following batch modes:
            "truncate_episodes": Each call to sample() will return a batch
                of at most `batch_steps * num_envs` in size. The batch will
                be exactly `batch_steps * num_envs` in size if
                postprocessing does not change batch sizes. Episodes may be
                truncated in order to meet this size requirement.
            "complete_episodes": Each call to sample() will return a batch
                of at least `batch_steps * num_envs` in size. Episodes will
                not be truncated, but multiple episodes may be packed
                within one batch to meet the batch size. Note that when
                `num_envs > 1`, episode steps will be buffered until the
                episode completes, and hence batches may contain
                significant amounts of off-policy data.
        episode_horizon (int): Whether to stop episodes at this horizon.
        preprocessor_pref (str): Whether to prefer RLlib preprocessors
            ("rllib") or deepmind ("deepmind") when applicable.
        sample_async (bool): Whether to compute samples asynchronously in
            the background, which improves throughput but can cause samples
            to be slightly off-policy.
        compress_observations (bool|str): If true, compress the
            observations. If set to "bulk", sample() calls
            batch.compress(bulk=True) instead. They can be decompressed
            with rllib/utils/compression.
        num_envs (int): If more than one, will create multiple envs
            and vectorize the computation of actions. This has no effect
            if the env already implements VectorEnv.
        observation_filter (str): Name of observation filter to use.
        clip_rewards (bool): Whether to clip rewards to [-1, 1] prior to
            experience postprocessing. Setting to None means clip for Atari
            only.
        clip_actions (bool): Whether to clip action values to the range
            specified by the policy action space.
        env_config (dict): Config to pass to the env creator.
        model_config (dict): Config to use when creating the policy model.
        policy_config (dict): Config to pass to the policy. In the
            multi-agent case, this config will be merged with the
            per-policy configs specified by `policy`.
        worker_index (int): For remote workers, this should be set to a
            non-zero and unique value. This index is passed to created envs
            through EnvContext so that envs can be configured per worker.
        monitor_path (str): Write out episode stats and videos to this
            directory if specified.
        log_dir (str): Directory where logs can be placed.
        log_level (str): Set the "ray.rllib" log level on creation.
        callbacks (dict): Dict of custom debug callbacks.
        input_creator (func): Function that returns an InputReader object
            for loading previous generated experiences.
        input_evaluation (list): How to evaluate the policy performance.
            This only makes sense to set when the input is reading offline
            data. The possible values include:
              - "is": the step-wise importance sampling estimator.
              - "wis": the weighted step-wise is estimator.
              - "simulation": run the environment in the background, but
                use this data for evaluation only and never for learning.
        output_creator (func): Function that returns an OutputWriter object
            for saving generated experiences.
        remote_worker_envs (bool): If using num_envs > 1, whether to create
            those new envs in remote processes instead of in the current
            process. This adds overheads, but can make sense if your envs
            can take much time to step / reset.
        remote_env_batch_wait_ms (float): Timeout that remote workers
            are waiting when polling environments. 0 (continue when at
            least one env is ready) is a reasonable default, but optimal
            value could be obtained by measuring your environment
            step / reset and model inference perf.
        soft_horizon (bool): Calculate rewards but don't reset the
            environment when the horizon is hit.
        no_done_at_end (bool): Ignore the done=True at the end of the
            episode and instead record done=False.
        seed (int): Set the seed of both np and tf to this value to
            ensure each remote worker has unique exploration behavior.
        _fake_sampler (bool): Use a fake (inf speed) sampler for testing.

    Raises:
        ValueError: If policy_mapping_fn is not callable, if a seed is
            given but the env has no seed() method, if several policies
            are configured for an env that is not a BaseEnv, MultiAgentEnv
            or ExternalMultiAgentEnv, or if batch_mode or an
            input_evaluation method is not recognized.
        ImportError: If a TF policy is configured but tensorflow could not
            be imported.
    """

    # Publish this worker as the process-wide handle returned by
    # get_global_worker().
    global _global_worker
    _global_worker = self

    if log_level:
        logging.getLogger("ray.rllib").setLevel(log_level)

    if worker_index > 1:
        disable_log_once_globally()  # only need 1 worker to log
    elif log_level == "DEBUG":
        enable_periodic_logging()

    env_context = EnvContext(env_config or {}, worker_index)
    policy_config = policy_config or {}
    self.policy_config = policy_config
    self.callbacks = callbacks or {}
    self.worker_index = worker_index
    model_config = model_config or {}
    # Without an explicit mapping, every agent uses the default policy.
    policy_mapping_fn = (policy_mapping_fn
                         or (lambda agent_id: DEFAULT_POLICY_ID))
    if not callable(policy_mapping_fn):
        raise ValueError(
            "Policy mapping function not callable. If you're using Tune, "
            "make sure to escape the function with tune.function() "
            "to prevent it from being evaluated as an expression.")
    self.env_creator = env_creator
    self.sample_batch_size = batch_steps * num_envs
    self.batch_mode = batch_mode
    self.compress_observations = compress_observations
    self.preprocessing_enabled = True
    self.last_batch = None
    self._fake_sampler = _fake_sampler

    self.env = _validate_env(env_creator(env_context))
    # Choose the wrap() function applied to this env and to every env
    # later created through make_env().
    if isinstance(self.env, MultiAgentEnv) or \
            isinstance(self.env, BaseEnv):

        def wrap(env):
            return env  # we can't auto-wrap these env types
    elif is_atari(self.env) and \
            not model_config.get("custom_preprocessor") and \
            preprocessor_pref == "deepmind":

        # Deepmind wrappers already handle all preprocessing
        self.preprocessing_enabled = False

        # clip_rewards=None means "clip for Atari only"; this is the
        # Atari branch, so turn clipping on.
        if clip_rewards is None:
            clip_rewards = True

        def wrap(env):
            env = wrap_deepmind(
                env,
                dim=model_config.get("dim"),
                framestack=model_config.get("framestack"))
            if monitor_path:
                env = gym.wrappers.Monitor(env, monitor_path, resume=True)
            return env
    else:

        def wrap(env):
            if monitor_path:
                env = gym.wrappers.Monitor(env, monitor_path, resume=True)
            return env

    self.env = wrap(self.env)

    # Factory handed to BaseEnv.to_base_env() for creating additional envs.
    def make_env(vector_index):
        return wrap(
            env_creator(
                env_context.copy_with_overrides(
                    vector_index=vector_index, remote=remote_worker_envs)))

    self.tf_sess = None
    policy_dict = _validate_and_canonicalize(policy, self.env)
    self.policies_to_train = policies_to_train or list(policy_dict.keys())
    # set numpy and python seed
    if seed is not None:
        np.random.seed(seed)
        random.seed(seed)
        if not hasattr(self.env, "seed"):
            raise ValueError("Env doesn't support env.seed(): {}".format(
                self.env))
        self.env.seed(seed)
        # torch is optional; seed it only when it can be imported.
        try:
            import torch
            torch.manual_seed(seed)
        except ImportError:
            logger.info("Could not seed torch")
    if _has_tensorflow_graph(policy_dict):
        if (ray.is_initialized()
                and ray.worker._mode() != ray.worker.LOCAL_MODE
                and not ray.get_gpu_ids()):
            logger.debug("Creating policy evaluation worker {}".format(
                worker_index) +
                         " on CPU (please ignore any CUDA init errors)")
        if not tf:
            raise ImportError("Could not import tensorflow")
        # Build all policies inside a fresh graph with the session set as
        # default.
        with tf.Graph().as_default():
            if tf_session_creator:
                self.tf_sess = tf_session_creator()
            else:
                self.tf_sess = tf.Session(
                    config=tf.ConfigProto(
                        gpu_options=tf.GPUOptions(allow_growth=True)))
            with self.tf_sess.as_default():
                # set graph-level seed
                if seed is not None:
                    tf.set_random_seed(seed)
                self.policy_map, self.preprocessors = \
                    self._build_policy_map(policy_dict, policy_config)
    else:
        self.policy_map, self.preprocessors = self._build_policy_map(
            policy_dict, policy_config)

    # Anything other than exactly the default policy id counts as
    # multi-agent.
    self.multiagent = set(self.policy_map.keys()) != {DEFAULT_POLICY_ID}
    if self.multiagent:
        if not ((isinstance(self.env, MultiAgentEnv)
                 or isinstance(self.env, ExternalMultiAgentEnv))
                or isinstance(self.env, BaseEnv)):
            raise ValueError(
                "Have multiple policies {}, but the env ".format(
                    self.policy_map) +
                "{} is not a subclass of BaseEnv, MultiAgentEnv or "
                "ExternalMultiAgentEnv?".format(self.env))

    # One observation filter per policy, sized by its observation space.
    self.filters = {
        policy_id: get_filter(observation_filter,
                              policy.observation_space.shape)
        for (policy_id, policy) in self.policy_map.items()
    }
    if self.worker_index == 0:
        logger.info("Built filter map: {}".format(self.filters))

    # Always use vector env for consistency even if num_envs = 1
    self.async_env = BaseEnv.to_base_env(
        self.env,
        make_env=make_env,
        num_envs=num_envs,
        remote_envs=remote_worker_envs,
        remote_env_batch_wait_ms=remote_env_batch_wait_ms)
    self.num_envs = num_envs

    if self.batch_mode == "truncate_episodes":
        unroll_length = batch_steps
        pack_episodes = True
    elif self.batch_mode == "complete_episodes":
        unroll_length = float("inf")  # never cut episodes
        pack_episodes = False  # sampler will return 1 episode per poll
    else:
        raise ValueError("Unsupported batch mode: {}".format(
            self.batch_mode))

    self.io_context = IOContext(log_dir, policy_config, worker_index, self)
    self.reward_estimators = []
    for method in input_evaluation:
        if method == "simulation":
            logger.warning(
                "Requested 'simulation' input evaluation method: "
                "will discard all sampler outputs and keep only metrics.")
            # "simulation" forces the asynchronous sampler.
            sample_async = True
        elif method == "is":
            ise = ImportanceSamplingEstimator.create(self.io_context)
            self.reward_estimators.append(ise)
        elif method == "wis":
            wise = WeightedImportanceSamplingEstimator.create(
                self.io_context)
            self.reward_estimators.append(wise)
        else:
            raise ValueError(
                "Unknown evaluation method: {}".format(method))

    if sample_async:
        self.sampler = AsyncSampler(
            self.async_env,
            self.policy_map,
            policy_mapping_fn,
            self.preprocessors,
            self.filters,
            clip_rewards,
            unroll_length,
            self.callbacks,
            horizon=episode_horizon,
            pack=pack_episodes,
            tf_sess=self.tf_sess,
            clip_actions=clip_actions,
            blackhole_outputs="simulation" in input_evaluation,
            soft_horizon=soft_horizon,
            no_done_at_end=no_done_at_end)
        self.sampler.start()
    else:
        self.sampler = SyncSampler(
            self.async_env,
            self.policy_map,
            policy_mapping_fn,
            self.preprocessors,
            self.filters,
            clip_rewards,
            unroll_length,
            self.callbacks,
            horizon=episode_horizon,
            pack=pack_episodes,
            tf_sess=self.tf_sess,
            clip_actions=clip_actions,
            soft_horizon=soft_horizon,
            no_done_at_end=no_done_at_end)

    # The reader and writer are created after the sampler exists.
    self.input_reader = input_creator(self.io_context)
    assert isinstance(self.input_reader, InputReader), self.input_reader
    self.output_writer = output_creator(self.io_context)
    assert isinstance(self.output_writer, OutputWriter), self.output_writer

    logger.debug(
        "Created rollout worker with env {} ({}), policies {}".format(
            self.async_env, self.env, self.policy_map))
|
||||
|
||||
@override(EvaluatorInterface)
def sample(self):
    """Read batches from the input reader and return them as one batch.

    Batches are pulled until `self.sample_batch_size` steps are collected
    or the per-mode batch limit is reached. The combined batch is passed
    to the "on_sample_end" callback, the output writer and any reward
    estimators, and is then optionally compressed.

    Return:
        SampleBatch|MultiAgentBatch from evaluating the current policies.
    """
    # The fake sampler keeps returning the first batch it produced.
    if self._fake_sampler and self.last_batch is not None:
        return self.last_batch

    if log_once("sample_start"):
        logger.info("Generating sample batch of size {}".format(
            self.sample_batch_size))

    first = self.input_reader.next()
    collected = [first]
    num_steps = first.count

    # In truncate_episodes mode, never pull more than 1 batch per env.
    # This avoids over-running the target batch size.
    if self.batch_mode == "truncate_episodes":
        batch_limit = self.num_envs
    else:
        batch_limit = float("inf")

    while True:
        if num_steps >= self.sample_batch_size:
            break
        if len(collected) >= batch_limit:
            break
        extra = self.input_reader.next()
        num_steps += extra.count
        collected.append(extra)
    batch = first.concat_samples(collected)

    on_sample_end = self.callbacks.get("on_sample_end")
    if on_sample_end:
        on_sample_end({"worker": self, "samples": batch})

    # Always do writes prior to compression for consistency and to allow
    # for better compression inside the writer.
    self.output_writer.write(batch)

    # Do off-policy estimation if needed
    if self.reward_estimators:
        for episode_batch in batch.split_by_episode():
            for estimator in self.reward_estimators:
                estimator.process(episode_batch)

    if log_once("sample_end"):
        logger.info("Completed sample batch:\n\n{}\n".format(
            summarize(batch)))

    compression = self.compress_observations
    if compression == "bulk":
        batch.compress(bulk=True)
    elif compression:
        batch.compress()

    if self._fake_sampler:
        self.last_batch = batch
    return batch
|
||||
|
||||
@DeveloperAPI
@ray.method(num_return_vals=2)
def sample_with_count(self):
    """Like sample(), but also return the batch's count as a second value.

    Returns:
        tuple: (batch, batch.count).
    """
    samples = self.sample()
    num_steps = samples.count
    return samples, num_steps
|
||||
|
||||
@override(EvaluatorInterface)
def get_weights(self, policies=None):
    """Return the weights of the selected policies, keyed by policy id.

    Args:
        policies (container|None): Policy ids to include, or None for all
            policies in `self.policy_map`.

    Returns:
        dict: Policy id -> result of that policy's get_weights().
    """
    weights = {}
    for pid, policy in self.policy_map.items():
        # None selects every policy in the map.
        if policies is None or pid in policies:
            weights[pid] = policy.get_weights()
    return weights
|
||||
|
||||
@override(EvaluatorInterface)
def set_weights(self, weights):
    """Set policy weights from a dict of policy id -> weights.

    Raises:
        KeyError: If a policy id is not in `self.policy_map`.
    """
    for policy_id, policy_weights in weights.items():
        target = self.policy_map[policy_id]
        target.set_weights(policy_weights)
|
||||
|
||||
@override(EvaluatorInterface)
def compute_gradients(self, samples):
    """Compute gradients on the given samples without applying them.

    Args:
        samples (SampleBatch|MultiAgentBatch): Batch to differentiate on.
            For a MultiAgentBatch only policies listed in
            `self.policies_to_train` are processed.

    Returns:
        tuple: (grad_out, info_out). For a MultiAgentBatch both are dicts
            keyed by policy id. `info_out["batch_count"]` is set to
            `samples.count`.
    """
    if log_once("compute_gradients"):
        logger.info("Compute gradients on:\n\n{}\n".format(
            summarize(samples)))

    if not isinstance(samples, MultiAgentBatch):
        default_policy = self.policy_map[DEFAULT_POLICY_ID]
        grad_out, info_out = default_policy.compute_gradients(samples)
    else:
        trainable = [(pid, batch)
                     for pid, batch in samples.policy_batches.items()
                     if pid in self.policies_to_train]
        grad_out, info_out = {}, {}
        if self.tf_sess is None:
            for pid, batch in trainable:
                grad_out[pid], info_out[pid] = (
                    self.policy_map[pid].compute_gradients(batch))
        else:
            # Register every policy's fetches with one builder first, then
            # resolve them all through it.
            builder = TFRunBuilder(self.tf_sess, "compute_gradients")
            for pid, batch in trainable:
                grad_out[pid], info_out[pid] = (
                    self.policy_map[pid]._build_compute_gradients(
                        builder, batch))
            grad_out = {
                pid: builder.get(fetch)
                for pid, fetch in grad_out.items()
            }
            info_out = {
                pid: builder.get(fetch)
                for pid, fetch in info_out.items()
            }

    info_out["batch_count"] = samples.count
    if log_once("grad_out"):
        logger.info("Compute grad info:\n\n{}\n".format(
            summarize(info_out)))
    return grad_out, info_out
|
||||
|
||||
@override(EvaluatorInterface)
def apply_gradients(self, grads):
    """Apply previously computed gradients to the policies.

    Args:
        grads: Either a dict of policy id -> gradients, or the gradients
            for the default policy.

    Returns:
        For a dict input, a dict of policy id -> apply result; otherwise
        the default policy's apply_gradients() result.
    """
    if log_once("apply_gradients"):
        logger.info("Apply gradients:\n\n{}\n".format(summarize(grads)))

    # Non-dict input is applied to the default policy.
    if not isinstance(grads, dict):
        return self.policy_map[DEFAULT_POLICY_ID].apply_gradients(grads)

    if self.tf_sess is None:
        return {
            pid: self.policy_map[pid].apply_gradients(grad)
            for pid, grad in grads.items()
        }

    # Register every policy's update with one builder, then resolve them.
    builder = TFRunBuilder(self.tf_sess, "apply_gradients")
    pending = {
        pid: self.policy_map[pid]._build_apply_gradients(builder, grad)
        for pid, grad in grads.items()
    }
    return {pid: builder.get(fetch) for pid, fetch in pending.items()}
|
||||
|
||||
@override(EvaluatorInterface)
def learn_on_batch(self, samples):
    """Update the policies from the given samples.

    Args:
        samples (SampleBatch|MultiAgentBatch): Batch to learn on. For a
            MultiAgentBatch only policies listed in
            `self.policies_to_train` are updated.

    Returns:
        The default policy's learn_on_batch() result, or for a
        MultiAgentBatch a dict of policy id -> that policy's result.
    """
    if log_once("learn_on_batch"):
        logger.info(
            "Training on concatenated sample batches:\n\n{}\n".format(
                summarize(samples)))

    if not isinstance(samples, MultiAgentBatch):
        info_out = self.policy_map[DEFAULT_POLICY_ID].learn_on_batch(
            samples)
    else:
        builder = None
        if self.tf_sess is not None:
            builder = TFRunBuilder(self.tf_sess, "learn_on_batch")
        info_out = {}
        pending = {}
        for pid, batch in samples.policy_batches.items():
            if pid not in self.policies_to_train:
                continue
            policy = self.policy_map[pid]
            # Policies without _build_learn_on_batch learn immediately;
            # the others are resolved through the builder below.
            if builder and hasattr(policy, "_build_learn_on_batch"):
                pending[pid] = policy._build_learn_on_batch(builder, batch)
            else:
                info_out[pid] = policy.learn_on_batch(batch)
        # `pending` is only non-empty when a builder exists.
        for pid, fetch in pending.items():
            info_out[pid] = builder.get(fetch)

    if log_once("learn_out"):
        logger.info("Training output:\n\n{}\n".format(summarize(info_out)))
    return info_out
|
||||
|
||||
@DeveloperAPI
def get_metrics(self):
    """Return the sampler's new metrics plus those of each reward estimator.

    Returns:
        list: The list returned by `self.sampler.get_metrics()`, extended
            in place with each estimator's metrics.
    """
    metrics = self.sampler.get_metrics()
    for estimator in self.reward_estimators:
        estimator_metrics = estimator.get_metrics()
        metrics.extend(estimator_metrics)
    return metrics
|
||||
|
||||
@DeveloperAPI
def foreach_env(self, func):
    """Apply func to each underlying env and return the results as a list.

    If `self.async_env.get_unwrapped()` returns nothing, func is applied
    to `self.async_env` itself.
    """
    unwrapped = self.async_env.get_unwrapped()
    if unwrapped:
        return [func(env) for env in unwrapped]
    return [func(self.async_env)]
|
||||
|
||||
@DeveloperAPI
def get_policy(self, policy_id=DEFAULT_POLICY_ID):
    """Look up a policy by id.

    Arguments:
        policy_id (str): id of policy to return.

    Returns:
        The policy registered under policy_id, or None if there is none.
    """
    policy = self.policy_map.get(policy_id)
    return policy
|
||||
|
||||
@DeveloperAPI
def for_policy(self, func, policy_id=DEFAULT_POLICY_ID):
    """Call func on the policy with the given id and return its result.

    Raises:
        KeyError: If policy_id is not in `self.policy_map`.
    """
    policy = self.policy_map[policy_id]
    return func(policy)
|
||||
|
||||
@DeveloperAPI
def foreach_policy(self, func):
    """Call func(policy, policy_id) for every policy; return the results."""
    results = []
    for policy_id, policy in self.policy_map.items():
        results.append(func(policy, policy_id))
    return results
|
||||
|
||||
@DeveloperAPI
def foreach_trainable_policy(self, func):
    """Call func(policy, policy_id) for every trainable policy.

    Only policies whose id is in `self.policies_to_train` are visited.

    Returns:
        list: The results of func, in `self.policy_map` iteration order.
    """
    results = []
    for policy_id, policy in self.policy_map.items():
        if policy_id not in self.policies_to_train:
            continue
        results.append(func(policy, policy_id))
    return results
|
||||
|
||||
@DeveloperAPI
def sync_filters(self, new_filters):
    """Sync each local filter with the matching entry of new_filters.

    Args:
        new_filters (dict): Filters with new state to update local copy.
            Must contain a key for every filter in `self.filters`.
    """
    assert all(name in new_filters for name in self.filters)
    for name, local_filter in self.filters.items():
        local_filter.sync(new_filters[name])
|
||||
|
||||
@DeveloperAPI
def get_filters(self, flush_after=False):
    """Return a serializable snapshot of every filter.

    Args:
        flush_after (bool): If true, call clear_buffer() on each filter
            after taking its snapshot.

    Returns:
        return_filters (dict): Filter key -> as_serializable() result.
    """
    snapshot = {}
    for name, local_filter in self.filters.items():
        snapshot[name] = local_filter.as_serializable()
        if flush_after:
            local_filter.clear_buffer()
    return snapshot
|
||||
|
||||
@DeveloperAPI
def save(self):
    """Pickle the filters and every policy's state.

    The filter buffers are cleared as a side effect, because the snapshot
    is taken with flush_after=True.

    Returns:
        bytes: Pickled dict with the keys "filters" and "state".
    """
    filters = self.get_filters(flush_after=True)
    state = {}
    for pid, policy in self.policy_map.items():
        state[pid] = policy.get_state()
    checkpoint = {"filters": filters, "state": state}
    return pickle.dumps(checkpoint)
|
||||
|
||||
@DeveloperAPI
def restore(self, objs):
    """Restore filters and policy states from the bytes made by save().

    Args:
        objs (bytes): Pickled dict with the keys "filters" and "state".
    """
    # NOTE: pickle.loads can execute arbitrary code; only pass data that
    # came from a trusted save() call.
    checkpoint = pickle.loads(objs)
    self.sync_filters(checkpoint["filters"])
    for pid, policy_state in checkpoint["state"].items():
        self.policy_map[pid].set_state(policy_state)
|
||||
|
||||
@DeveloperAPI
def set_global_vars(self, global_vars):
    """Pass global_vars to every policy's on_global_var_update()."""

    def _notify(policy, _policy_id):
        return policy.on_global_var_update(global_vars)

    self.foreach_policy(_notify)
|
||||
|
||||
@DeveloperAPI
def export_policy_model(self, export_dir, policy_id=DEFAULT_POLICY_ID):
    """Call export_model(export_dir) on the policy with the given id."""
    policy = self.policy_map[policy_id]
    policy.export_model(export_dir)
|
||||
|
||||
@DeveloperAPI
def export_policy_checkpoint(self,
                             export_dir,
                             filename_prefix="model",
                             policy_id=DEFAULT_POLICY_ID):
    """Call export_checkpoint() on the policy with the given id.

    Args:
        export_dir: Forwarded to the policy's export_checkpoint().
        filename_prefix (str): Forwarded to export_checkpoint().
        policy_id (str): id of the policy to export.
    """
    policy = self.policy_map[policy_id]
    policy.export_checkpoint(export_dir, filename_prefix)
|
||||
|
||||
@DeveloperAPI
def stop(self):
    """Stop the worker's env by calling stop() on `self.async_env`."""
    env = self.async_env
    env.stop()
|
||||
|
||||
def _build_policy_map(self, policy_dict, policy_config):
|
||||
policy_map = {}
|
||||
preprocessors = {}
|
||||
for name, (cls, obs_space, act_space,
|
||||
conf) in sorted(policy_dict.items()):
|
||||
logger.debug("Creating policy for {}".format(name))
|
||||
merged_conf = merge_dicts(policy_config, conf)
|
||||
if self.preprocessing_enabled:
|
||||
preprocessor = ModelCatalog.get_preprocessor_for_space(
|
||||
obs_space, merged_conf.get("model"))
|
||||
preprocessors[name] = preprocessor
|
||||
obs_space = preprocessor.observation_space
|
||||
else:
|
||||
preprocessors[name] = NoPreprocessor(obs_space)
|
||||
if isinstance(obs_space, gym.spaces.Dict) or \
|
||||
isinstance(obs_space, gym.spaces.Tuple):
|
||||
raise ValueError(
|
||||
"Found raw Tuple|Dict space as input to policy. "
|
||||
"Please preprocess these observations with a "
|
||||
"Tuple|DictFlatteningPreprocessor.")
|
||||
if tf:
|
||||
with tf.variable_scope(name):
|
||||
policy_map[name] = cls(obs_space, act_space, merged_conf)
|
||||
else:
|
||||
policy_map[name] = cls(obs_space, act_space, merged_conf)
|
||||
if self.worker_index == 0:
|
||||
logger.info("Built policy map: {}".format(policy_map))
|
||||
logger.info("Built preprocessor map: {}".format(preprocessors))
|
||||
return policy_map, preprocessors
|
||||
|
||||
def __del__(self):
    """Set ``shutdown = True`` on ``self.sampler`` if it is an AsyncSampler.

    Does nothing when the instance has no ``sampler`` attribute or when
    the sampler is of another type.
    """
    if not hasattr(self, "sampler"):
        return
    if isinstance(self.sampler, AsyncSampler):
        self.sampler.shutdown = True
|
||||
|
||||
|
||||
def _validate_and_canonicalize(policy, env):
    """Return a policy-spec dict for ``policy``.

    Args:
        policy: Either a dict of policy specs (validated with
            ``_validate_multiagent_config`` and returned unchanged) or a
            ``Policy`` subclass.
        env: Environment whose ``observation_space`` and ``action_space``
            are used when ``policy`` is a single class.

    Returns:
        dict: ``policy`` itself when it is a dict; otherwise a one-entry
        dict mapping ``DEFAULT_POLICY_ID`` to
        ``(policy, env.observation_space, env.action_space, {})``.

    Raises:
        ValueError: If ``policy`` is neither a dict nor a ``Policy``
            subclass, or if ``env`` is a ``MultiAgentEnv`` that has no
            ``observation_space`` attribute.
    """
    if isinstance(policy, dict):
        _validate_multiagent_config(policy)
        return policy

    # issubclass() raises TypeError when its first argument is not a
    # class, so confirm we have a class first; otherwise instances, None
    # or strings would bypass the intended ValueError below.
    if not (isinstance(policy, type) and issubclass(policy, Policy)):
        raise ValueError("policy must be a rllib.Policy class")

    if (isinstance(env, MultiAgentEnv)
            and not hasattr(env, "observation_space")):
        raise ValueError(
            "MultiAgentEnv must have observation_space defined if run "
            "in a single-agent configuration.")

    return {
        DEFAULT_POLICY_ID: (policy, env.observation_space,
                            env.action_space, {})
    }
|
||||
|
||||
|
||||
def _validate_multiagent_config(policy, allow_none_graph=False):
    """Check that ``policy`` is a well-formed multi-agent policy dict.

    Args:
        policy: Dict mapping str policy ids to 4-item tuples/lists of
            ``(cls or None, obs_space, action_space, config)``.
        allow_none_graph: When true, ``None`` is accepted in place of the
            policy class.

    Raises:
        ValueError: If a key is not a str, a value is not a 4-item
            tuple/list, item 0 is not a ``Policy`` subclass (or ``None``
            when allowed), item 1 or 2 is not a ``gym.Space``, or item 3
            is not a dict.
    """
    for k, v in policy.items():
        if not isinstance(k, str):
            raise ValueError("policy keys must be strs, got {}".format(
                type(k)))
        if not isinstance(v, (tuple, list)) or len(v) != 4:
            raise ValueError(
                "policy values must be tuples/lists of "
                "(cls or None, obs_space, action_space, config), got {}".
                format(v))

        policy_cls, obs_space, action_space, config = v

        if allow_none_graph and policy_cls is None:
            pass
        # issubclass() raises TypeError for non-class arguments (including
        # None when allow_none_graph is false), so require a class first
        # to make sure the documented ValueError is what callers see.
        elif not (isinstance(policy_cls, type)
                  and issubclass(policy_cls, Policy)):
            raise ValueError("policy tuple value 0 must be a rllib.Policy "
                             "class or None, got {}".format(policy_cls))

        if not isinstance(obs_space, gym.Space):
            raise ValueError(
                "policy tuple value 1 (observation_space) must be a "
                "gym.Space, got {}".format(type(obs_space)))
        if not isinstance(action_space, gym.Space):
            raise ValueError("policy tuple value 2 (action_space) must be a "
                             "gym.Space, got {}".format(type(action_space)))
        if not isinstance(config, dict):
            raise ValueError("policy tuple value 3 (config) must be a dict, "
                             "got {}".format(type(config)))
|
||||
|
||||
|
||||
def _validate_env(env):
    """Return ``env`` unchanged after checking it is a supported env type.

    Raises:
        ValueError: If ``env`` lacks ``observation_space`` /
            ``action_space`` attributes and is not an instance of
            gym.Env, MultiAgentEnv, ExternalEnv, VectorEnv, or BaseEnv.
    """
    # allow this as a special case (assumed gym.Env)
    has_both_spaces = (hasattr(env, "observation_space")
                       and hasattr(env, "action_space"))
    if not has_both_spaces:
        supported = (gym.Env, MultiAgentEnv, ExternalEnv, VectorEnv,
                     BaseEnv)
        if not isinstance(env, supported):
            raise ValueError(
                "Returned env should be an instance of gym.Env, "
                "MultiAgentEnv, "
                "ExternalEnv, VectorEnv, or BaseEnv. The provided env "
                "creator "
                "function returned {} ({}).".format(env, type(env)))
    return env
|
||||
|
||||
|
||||
def _has_tensorflow_graph(policy_dict):
    """Return True if any policy class in ``policy_dict`` is a TFPolicy.

    Each value of ``policy_dict`` is unpacked as a 4-item spec whose first
    item is the policy class; the remaining items are ignored.
    """
    return any(
        issubclass(policy_cls, TFPolicy)
        for policy_cls, _, _, _ in policy_dict.values())
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user