mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 20:53:14 +08:00
b948405532
* patch up pbt * Sat Jan 27 01:00:03 PST 2018 * Sat Jan 27 01:04:14 PST 2018 * Sat Jan 27 01:04:21 PST 2018 * Sat Jan 27 01:15:15 PST 2018 * Sat Jan 27 01:15:42 PST 2018 * Sat Jan 27 01:16:14 PST 2018 * Sat Jan 27 01:38:42 PST 2018 * Sat Jan 27 01:39:21 PST 2018 * add pbt * Sat Jan 27 01:41:19 PST 2018 * Sat Jan 27 01:44:21 PST 2018 * Sat Jan 27 01:45:46 PST 2018 * Sat Jan 27 16:54:42 PST 2018 * Sat Jan 27 16:57:53 PST 2018 * clean up test * Sat Jan 27 18:01:15 PST 2018 * Sat Jan 27 18:02:54 PST 2018 * Sat Jan 27 18:11:18 PST 2018 * Sat Jan 27 18:11:55 PST 2018 * Sat Jan 27 18:14:09 PST 2018 * review * try out a ppo example * some tweaks to ppo example * add postprocess hook * Sun Jan 28 15:00:40 PST 2018 * clean up custom explore fn * Sun Jan 28 15:10:21 PST 2018 * Sun Jan 28 15:14:53 PST 2018 * Sun Jan 28 15:17:04 PST 2018 * Sun Jan 28 15:33:13 PST 2018 * Sun Jan 28 15:56:40 PST 2018 * Sun Jan 28 15:57:36 PST 2018 * Sun Jan 28 16:00:35 PST 2018 * Sun Jan 28 16:02:58 PST 2018 * Sun Jan 28 16:29:50 PST 2018 * Sun Jan 28 16:30:36 PST 2018 * Sun Jan 28 16:31:44 PST 2018 * improve tune doc * concepts * update humanoid * Fri Feb 2 18:03:33 PST 2018 * fix example * show error file
232 lines
8.9 KiB
Python
232 lines
8.9 KiB
Python
from __future__ import absolute_import
|
|
from __future__ import division
|
|
from __future__ import print_function
|
|
|
|
import pickle
|
|
import tensorflow as tf
|
|
import os
|
|
|
|
from tensorflow.python import debug as tf_debug
|
|
|
|
import numpy as np
|
|
|
|
import ray
|
|
from ray.rllib.optimizers import Evaluator, SampleBatch
|
|
from ray.rllib.optimizers.multi_gpu_impl import LocalSyncParallelOptimizer
|
|
from ray.rllib.models import ModelCatalog
|
|
from ray.rllib.utils.sampler import SyncSampler
|
|
from ray.rllib.utils.filter import get_filter, MeanStdFilter
|
|
from ray.rllib.utils.process_rollout import process_rollout
|
|
from ray.rllib.ppo.loss import ProximalPolicyLoss
|
|
|
|
|
|
# TODO(rliaw): Move this onto LocalMultiGPUOptimizer
|
|
class PPOEvaluator(Evaluator):
    """Runner class that holds the simulator environment and the policy.

    Initializes the tensorflow graphs for both training and evaluation.
    One common policy graph is initialized on '/cpu:0' and holds all the
    shared network weights. When run as a remote agent, only this graph is
    used.
    """

    def __init__(self, registry, env_creator, config, logdir, is_remote):
        """Builds the environment, TF session, loss graphs and sampler.

        Args:
            registry: Forwarded to ModelCatalog and ProximalPolicyLoss.
            env_creator (callable): Called with config["env_config"] to
                create the environment.
            config (dict): PPO configuration. Keys read by this class:
                "devices", "env_config", "model", "tf_session_args",
                "tf_debug_inf_or_nan", "rollout_batchsize",
                "sgd_batchsize", "sgd_stepsize", "observation_filter",
                "horizon", "use_gae", "min_steps_per_task", "gamma" and
                "lambda".
            logdir: Forwarded to LocalSyncParallelOptimizer.
            is_remote (bool): If true, the evaluator is restricted to
                '/cpu:0', uses a default session config and sizes its
                batches from "rollout_batchsize".

        Raises:
            ValueError: If a local (non-remote) evaluator is given no
                devices, or "sgd_batchsize" is smaller than the number of
                devices.
        """
        self.registry = registry
        self.is_remote = is_remote
        if is_remote:
            # Remote evaluators only use the CPU: blank out the CUDA device
            # list for this process and build on '/cpu:0'.
            os.environ["CUDA_VISIBLE_DEVICES"] = ""
            devices = ["/cpu:0"]
        else:
            devices = config["devices"]
        self.devices = devices
        self.config = config
        self.logdir = logdir
        self.env = ModelCatalog.get_preprocessor_as_wrapper(
            registry, env_creator(config["env_config"]), config["model"])
        if is_remote:
            config_proto = tf.ConfigProto()
        else:
            config_proto = tf.ConfigProto(**config["tf_session_args"])
        self.sess = tf.Session(config=config_proto)
        if config["tf_debug_inf_or_nan"] and not is_remote:
            # Wrap the local session in the TF CLI debugger with the
            # inf/nan tensor filter registered.
            self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
            self.sess.add_tensor_filter(
                "has_inf_or_nan", tf_debug.has_inf_or_nan)

        # Defines the training inputs:
        # The coefficient of the KL penalty.
        self.kl_coeff = tf.placeholder(
            name="newkl", shape=(), dtype=tf.float32)

        # The input observations.
        self.observations = tf.placeholder(
            tf.float32, shape=(None,) + self.env.observation_space.shape)
        # Targets of the value function.
        self.value_targets = tf.placeholder(tf.float32, shape=(None,))
        # Advantage values in the policy gradient estimator.
        self.advantages = tf.placeholder(tf.float32, shape=(None,))

        action_space = self.env.action_space
        self.actions = ModelCatalog.get_action_placeholder(action_space)
        self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist(
            action_space)
        # Action distribution logits from the policy before the policy
        # update. Note that load_data() feeds this placeholder from the
        # "logprobs" column of the trajectories.
        self.prev_logits = tf.placeholder(
            tf.float32, shape=(None, self.logit_dim))
        # Value function predictions before the policy update.
        self.prev_vf_preds = tf.placeholder(tf.float32, shape=(None,))

        if is_remote:
            self.batch_size = config["rollout_batchsize"]
            self.per_device_batch_size = config["rollout_batchsize"]
        else:
            num_devices = len(devices)
            if num_devices == 0:
                raise ValueError(
                    "config['devices'] must list at least one device")
            # Round the SGD batch size down to a multiple of the device
            # count so that every device receives an equally sized slice.
            self.per_device_batch_size = (
                int(config["sgd_batchsize"]) // num_devices)
            if self.per_device_batch_size < 1:
                raise ValueError(
                    "sgd_batchsize ({}) must be at least the number of "
                    "devices ({})".format(
                        config["sgd_batchsize"], num_devices))
            self.batch_size = self.per_device_batch_size * num_devices

        def build_loss(obs, vtargets, advs, acts, plog, pvf_preds):
            # Invoked by the optimizer with its own input tensors, in the
            # same order as the placeholder list passed below.
            return ProximalPolicyLoss(
                self.env.observation_space, self.env.action_space,
                obs, vtargets, advs, acts, plog, pvf_preds, self.logit_dim,
                self.kl_coeff, self.distribution_class, self.config,
                self.sess, self.registry)

        self.par_opt = LocalSyncParallelOptimizer(
            tf.train.AdamOptimizer(self.config["sgd_stepsize"]),
            self.devices,
            [self.observations, self.value_targets, self.advantages,
             self.actions, self.prev_logits, self.prev_vf_preds],
            self.per_device_batch_size,
            build_loss,
            self.logdir)

        # Metric ops: each metric is averaged over the per-device losses.
        with tf.name_scope("test_outputs"):
            policies = self.par_opt.get_device_losses()
            self.mean_loss = self._mean_over_devices(policies, "loss")
            self.mean_policy_loss = self._mean_over_devices(
                policies, "mean_policy_loss")
            self.mean_vf_loss = self._mean_over_devices(
                policies, "mean_vf_loss")
            self.mean_kl = self._mean_over_devices(policies, "mean_kl")
            self.mean_entropy = self._mean_over_devices(
                policies, "mean_entropy")

        # References to the model weights
        self.common_policy = self.par_opt.get_common_loss()
        self.variables = ray.experimental.TensorFlowVariables(
            self.common_policy.loss, self.sess)
        self.obs_filter = get_filter(
            config["observation_filter"], self.env.observation_space.shape)
        self.rew_filter = MeanStdFilter((), clip=5.0)
        self.filters = {"obs_filter": self.obs_filter,
                        "rew_filter": self.rew_filter}
        # NOTE(review): config["horizon"] is passed for both trailing
        # positional arguments -- confirm against SyncSampler's signature.
        self.sampler = SyncSampler(
            self.env, self.common_policy, self.obs_filter,
            self.config["horizon"], self.config["horizon"])
        self.sess.run(tf.global_variables_initializer())

    @staticmethod
    def _mean_over_devices(policies, attr_name):
        """Returns an op averaging attribute `attr_name` over `policies`."""
        return tf.reduce_mean(
            tf.stack(values=[
                getattr(policy, attr_name) for policy in policies]), 0)

    def load_data(self, trajectories, full_trace):
        """Loads a batch of trajectories into the parallel optimizer.

        Args:
            trajectories: Columnar batch indexed by "observations",
                "advantages", "actions" and "logprobs", plus
                "value_targets" and "vf_preds" when config["use_gae"] is
                set.
            full_trace: Forwarded to the optimizer's load_data().

        Returns:
            Whatever LocalSyncParallelOptimizer.load_data() returns.
        """
        if self.config["use_gae"]:
            value_targets = trajectories["value_targets"]
            vf_preds = trajectories["vf_preds"]
        else:
            # Without GAE the value-function columns are not read from the
            # batch; feed zeros shaped like the advantages instead.
            value_targets = vf_preds = np.zeros_like(
                trajectories["advantages"])
        return self.par_opt.load_data(
            self.sess,
            [trajectories["observations"],
             value_targets,
             trajectories["advantages"],
             trajectories["actions"],
             trajectories["logprobs"],
             vf_preds],
            full_trace=full_trace)

    def run_sgd_minibatch(
            self, batch_index, kl_coeff, full_trace, file_writer):
        """Runs one optimizer step on the minibatch at `batch_index`.

        Args:
            batch_index: Forwarded to the optimizer's optimize().
            kl_coeff: Value fed into the KL penalty coefficient placeholder.
            full_trace: If falsy, `file_writer` is not passed on.
            file_writer: Forwarded to optimize() only when `full_trace` is
                truthy.

        Returns:
            Whatever optimize() returns; the mean loss, policy loss, value
            function loss, KL and entropy ops are requested as extra ops.
        """
        return self.par_opt.optimize(
            self.sess,
            batch_index,
            extra_ops=[
                self.mean_loss, self.mean_policy_loss, self.mean_vf_loss,
                self.mean_kl, self.mean_entropy],
            extra_feed_dict={self.kl_coeff: kl_coeff},
            file_writer=file_writer if full_trace else None)

    def compute_gradients(self, samples):
        """Not supported by this evaluator."""
        raise NotImplementedError

    def apply_gradients(self, grads):
        """Not supported by this evaluator."""
        raise NotImplementedError

    def save(self):
        """Returns the pickled filter state, clearing the filter buffers."""
        filters = self.get_filters(flush_after=True)
        return pickle.dumps({"filters": filters})

    def restore(self, objs):
        """Restores filter state from bytes produced by save().

        Args:
            objs (bytes): Pickled dict with a "filters" entry.
        """
        # SECURITY: pickle.loads can execute arbitrary code. Only pass
        # checkpoint data that comes from a trusted source.
        objs = pickle.loads(objs)
        self.sync_filters(objs["filters"])

    def get_weights(self):
        """Returns the weights of the common policy's variables."""
        return self.variables.get_weights()

    def set_weights(self, weights):
        """Sets the weights of the common policy's variables."""
        self.variables.set_weights(weights)

    def sample(self):
        """Returns experience samples from this Evaluator.

        Rollouts are collected and post-processed until at least
        config["min_steps_per_task"] steps have been gathered. Filter
        buffers are not cleared here; use get_filters(flush_after=True)
        for that.

        Returns:
            SampleBatch: A columnar batch of experiences.
        """
        num_steps_so_far = 0
        all_samples = []

        while num_steps_so_far < self.config["min_steps_per_task"]:
            rollout = self.sampler.get_data()
            samples = process_rollout(
                rollout, self.rew_filter, self.config["gamma"],
                self.config["lambda"], use_gae=self.config["use_gae"])
            num_steps_so_far += samples.count
            all_samples.append(samples)
        return SampleBatch.concat_samples(all_samples)

    def get_completed_rollout_metrics(self):
        """Returns metrics on previously completed rollouts.

        Calling this clears the queue of completed rollout metrics.
        """
        return self.sampler.get_metrics()

    def sync_filters(self, new_filters):
        """Changes self's filter to given and rebases any accumulated delta.

        Args:
            new_filters (dict): Filters with new state to update local copy.

        Raises:
            ValueError: If `new_filters` lacks an entry for one of this
                evaluator's filters. Nothing is synced in that case.
        """
        # Validate up front (and not with an assert, which is stripped
        # under -O) so a bad argument never leaves filters half-synced.
        missing = [k for k in self.filters if k not in new_filters]
        if missing:
            raise ValueError(
                "new_filters is missing entries for: {}".format(missing))
        for k, local_filter in self.filters.items():
            local_filter.sync(new_filters[k])

    def get_filters(self, flush_after=False):
        """Returns a snapshot of filters.

        Args:
            flush_after (bool): Clears the filter buffer state.

        Returns:
            return_filters (dict): Dict for serializable filters
        """
        return_filters = {}
        for k, f in self.filters.items():
            # Snapshot first, then optionally clear this filter's buffer.
            return_filters[k] = f.as_serializable()
            if flush_after:
                f.clear_buffer()
        return return_filters
|