mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 03:34:48 +08:00
@@ -17,9 +17,10 @@ from ray.rllib.evaluation.sample_batch import SampleBatch
|
||||
|
||||
|
||||
def _register_all():
|
||||
for key in ["PPO", "ES", "DQN", "APEX", "A3C", "BC", "PG", "DDPG",
|
||||
"APEX_DDPG", "__fake", "__sigmoid_fake_data",
|
||||
"__parameter_tuning"]:
|
||||
for key in [
|
||||
"PPO", "ES", "DQN", "APEX", "A3C", "BC", "PG", "DDPG", "APEX_DDPG",
|
||||
"__fake", "__sigmoid_fake_data", "__parameter_tuning"
|
||||
]:
|
||||
from ray.rllib.agents.agent import get_agent_class
|
||||
register_trainable(key, get_agent_class(key))
|
||||
|
||||
@@ -27,6 +28,12 @@ def _register_all():
|
||||
_register_all()
|
||||
|
||||
__all__ = [
|
||||
"PolicyGraph", "TFPolicyGraph", "PolicyEvaluator", "SampleBatch",
|
||||
"AsyncVectorEnv", "MultiAgentEnv", "VectorEnv", "ServingEnv",
|
||||
"PolicyGraph",
|
||||
"TFPolicyGraph",
|
||||
"PolicyEvaluator",
|
||||
"SampleBatch",
|
||||
"AsyncVectorEnv",
|
||||
"MultiAgentEnv",
|
||||
"VectorEnv",
|
||||
"ServingEnv",
|
||||
]
|
||||
|
||||
@@ -92,15 +92,15 @@ class A3CAgent(Agent):
|
||||
self.remote_evaluators = self.make_remote_evaluators(
|
||||
self.env_creator, policy_cls, self.config["num_workers"],
|
||||
{"num_gpus": 1 if self.config["use_gpu_for_workers"] else 0})
|
||||
self.optimizer = AsyncGradientsOptimizer(
|
||||
self.local_evaluator, self.remote_evaluators,
|
||||
self.config["optimizer"])
|
||||
self.optimizer = AsyncGradientsOptimizer(self.local_evaluator,
|
||||
self.remote_evaluators,
|
||||
self.config["optimizer"])
|
||||
|
||||
def _train(self):
|
||||
prev_steps = self.optimizer.num_steps_sampled
|
||||
self.optimizer.step()
|
||||
FilterManager.synchronize(
|
||||
self.local_evaluator.filters, self.remote_evaluators)
|
||||
FilterManager.synchronize(self.local_evaluator.filters,
|
||||
self.remote_evaluators)
|
||||
result = self.optimizer.collect_metrics()
|
||||
result = result._replace(
|
||||
timesteps_this_iter=self.optimizer.num_steps_sampled - prev_steps)
|
||||
|
||||
@@ -14,19 +14,23 @@ from ray.rllib.models.catalog import ModelCatalog
|
||||
|
||||
|
||||
class A3CLoss(object):
|
||||
def __init__(
|
||||
self, action_dist, actions, advantages, v_target, vf,
|
||||
vf_loss_coeff=0.5, entropy_coeff=-0.01):
|
||||
def __init__(self,
|
||||
action_dist,
|
||||
actions,
|
||||
advantages,
|
||||
v_target,
|
||||
vf,
|
||||
vf_loss_coeff=0.5,
|
||||
entropy_coeff=-0.01):
|
||||
log_prob = action_dist.logp(actions)
|
||||
|
||||
# The "policy gradients" loss
|
||||
self.pi_loss = - tf.reduce_sum(log_prob * advantages)
|
||||
self.pi_loss = -tf.reduce_sum(log_prob * advantages)
|
||||
|
||||
delta = vf - v_target
|
||||
self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta))
|
||||
self.entropy = tf.reduce_sum(action_dist.entropy())
|
||||
self.total_loss = (self.pi_loss +
|
||||
self.vf_loss * vf_loss_coeff +
|
||||
self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff +
|
||||
self.entropy * entropy_coeff)
|
||||
|
||||
|
||||
@@ -41,8 +45,8 @@ class A3CPolicyGraph(TFPolicyGraph):
|
||||
tf.float32, [None] + list(observation_space.shape))
|
||||
dist_class, logit_dim = ModelCatalog.get_action_dist(
|
||||
action_space, self.config["model"])
|
||||
self.model = ModelCatalog.get_model(
|
||||
self.observations, logit_dim, self.config["model"])
|
||||
self.model = ModelCatalog.get_model(self.observations, logit_dim,
|
||||
self.config["model"])
|
||||
action_dist = dist_class(self.model.outputs)
|
||||
self.vf = tf.reshape(
|
||||
linear(self.model.last_layer, 1, "value", normc_initializer(1.0)),
|
||||
@@ -62,9 +66,9 @@ class A3CPolicyGraph(TFPolicyGraph):
|
||||
action_space))
|
||||
advantages = tf.placeholder(tf.float32, [None], name="advantages")
|
||||
v_target = tf.placeholder(tf.float32, [None], name="v_target")
|
||||
self.loss = A3CLoss(
|
||||
action_dist, actions, advantages, v_target, self.vf,
|
||||
self.config["vf_loss_coeff"], self.config["entropy_coeff"])
|
||||
self.loss = A3CLoss(action_dist, actions, advantages, v_target,
|
||||
self.vf, self.config["vf_loss_coeff"],
|
||||
self.config["entropy_coeff"])
|
||||
|
||||
# Initialize TFPolicyGraph
|
||||
loss_in = [
|
||||
@@ -76,10 +80,16 @@ class A3CPolicyGraph(TFPolicyGraph):
|
||||
self.state_in = self.model.state_in
|
||||
self.state_out = self.model.state_out
|
||||
TFPolicyGraph.__init__(
|
||||
self, observation_space, action_space, self.sess,
|
||||
obs_input=self.observations, action_sampler=action_dist.sample(),
|
||||
loss=self.loss.total_loss, loss_inputs=loss_in,
|
||||
state_inputs=self.state_in, state_outputs=self.state_out,
|
||||
self,
|
||||
observation_space,
|
||||
action_space,
|
||||
self.sess,
|
||||
obs_input=self.observations,
|
||||
action_sampler=action_dist.sample(),
|
||||
loss=self.loss.total_loss,
|
||||
loss_inputs=loss_in,
|
||||
state_inputs=self.state_in,
|
||||
state_outputs=self.state_out,
|
||||
seq_lens=self.model.seq_lens,
|
||||
max_seq_len=self.config["model"]["max_seq_len"])
|
||||
|
||||
@@ -132,5 +142,5 @@ class A3CPolicyGraph(TFPolicyGraph):
|
||||
for i in range(len(self.state_in)):
|
||||
next_state.append([sample_batch["state_out_{}".format(i)][-1]])
|
||||
last_r = self.value(sample_batch["new_obs"][-1], *next_state)
|
||||
return compute_advantages(
|
||||
sample_batch, last_r, self.config["gamma"], self.config["lambda"])
|
||||
return compute_advantages(sample_batch, last_r, self.config["gamma"],
|
||||
self.config["lambda"])
|
||||
|
||||
@@ -46,20 +46,21 @@ class A3CTorchPolicyGraph(TorchPolicyGraph):
|
||||
action_space, self.config["model"])
|
||||
self.model = ModelCatalog.get_torch_model(
|
||||
obs_space.shape, self.logit_dim, self.config["model"])
|
||||
loss = A3CLoss(
|
||||
self.model, self.config["vf_loss_coeff"],
|
||||
self.config["entropy_coeff"])
|
||||
loss = A3CLoss(self.model, self.config["vf_loss_coeff"],
|
||||
self.config["entropy_coeff"])
|
||||
TorchPolicyGraph.__init__(
|
||||
self, obs_space, action_space, self.model, loss,
|
||||
loss_inputs=[
|
||||
"obs", "actions", "advantages", "value_targets"])
|
||||
self,
|
||||
obs_space,
|
||||
action_space,
|
||||
self.model,
|
||||
loss,
|
||||
loss_inputs=["obs", "actions", "advantages", "value_targets"])
|
||||
|
||||
def extra_action_out(self, model_out):
|
||||
return {"vf_preds": var_to_np(model_out[1])}
|
||||
|
||||
def optimizer(self):
|
||||
return torch.optim.Adam(
|
||||
self.model.parameters(), lr=self.config["lr"])
|
||||
return torch.optim.Adam(self.model.parameters(), lr=self.config["lr"])
|
||||
|
||||
def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
|
||||
completed = sample_batch["dones"][-1]
|
||||
@@ -67,8 +68,8 @@ class A3CTorchPolicyGraph(TorchPolicyGraph):
|
||||
last_r = 0.0
|
||||
else:
|
||||
last_r = self._value(sample_batch["new_obs"][-1])
|
||||
return compute_advantages(
|
||||
sample_batch, last_r, self.config["gamma"], self.config["lambda"])
|
||||
return compute_advantages(sample_batch, last_r, self.config["gamma"],
|
||||
self.config["lambda"])
|
||||
|
||||
def _value(self, obs):
|
||||
with self.lock:
|
||||
|
||||
@@ -47,7 +47,9 @@ COMMON_CONFIG = {
|
||||
"allow_growth": True,
|
||||
},
|
||||
"log_device_placement": False,
|
||||
"device_count": {"CPU": 1},
|
||||
"device_count": {
|
||||
"CPU": 1
|
||||
},
|
||||
"allow_soft_placement": True, # required by PPO multi-gpu
|
||||
},
|
||||
# Whether to LZ4 compress observations
|
||||
@@ -86,8 +88,7 @@ def _deep_update(original, new_dict, new_keys_allowed, whitelist):
|
||||
for k, value in new_dict.items():
|
||||
if k not in original and k != "env":
|
||||
if not new_keys_allowed:
|
||||
raise Exception(
|
||||
"Unknown config parameter `{}` ".format(k))
|
||||
raise Exception("Unknown config parameter `{}` ".format(k))
|
||||
if type(original.get(k)) is dict:
|
||||
if k in whitelist:
|
||||
_deep_update(original[k], value, True, [])
|
||||
@@ -112,22 +113,24 @@ class Agent(Trainable):
|
||||
|
||||
_allow_unknown_configs = False
|
||||
_allow_unknown_subkeys = [
|
||||
"tf_session_args", "env_config", "model", "optimizer", "multiagent"]
|
||||
"tf_session_args", "env_config", "model", "optimizer", "multiagent"
|
||||
]
|
||||
|
||||
def make_local_evaluator(self, env_creator, policy_graph):
|
||||
"""Convenience method to return configured local evaluator."""
|
||||
|
||||
return self._make_evaluator(
|
||||
PolicyEvaluator, env_creator, policy_graph, 0)
|
||||
return self._make_evaluator(PolicyEvaluator, env_creator, policy_graph,
|
||||
0)
|
||||
|
||||
def make_remote_evaluators(
|
||||
self, env_creator, policy_graph, count, remote_args):
|
||||
def make_remote_evaluators(self, env_creator, policy_graph, count,
|
||||
remote_args):
|
||||
"""Convenience method to return a number of remote evaluators."""
|
||||
|
||||
cls = PolicyEvaluator.as_remote(**remote_args).remote
|
||||
return [
|
||||
self._make_evaluator(cls, env_creator, policy_graph, i+1)
|
||||
for i in range(count)]
|
||||
self._make_evaluator(cls, env_creator, policy_graph, i + 1)
|
||||
for i in range(count)
|
||||
]
|
||||
|
||||
def _make_evaluator(self, cls, env_creator, policy_graph, worker_index):
|
||||
config = self.config
|
||||
@@ -140,8 +143,8 @@ class Agent(Trainable):
|
||||
env_creator,
|
||||
self.config["multiagent"]["policy_graphs"] or policy_graph,
|
||||
policy_mapping_fn=self.config["multiagent"]["policy_mapping_fn"],
|
||||
tf_session_creator=(
|
||||
session_creator if config["tf_session_args"] else None),
|
||||
tf_session_creator=(session_creator
|
||||
if config["tf_session_args"] else None),
|
||||
batch_steps=config["sample_batch_size"],
|
||||
batch_mode=config["batch_mode"],
|
||||
episode_horizon=config["horizon"],
|
||||
@@ -157,14 +160,12 @@ class Agent(Trainable):
|
||||
|
||||
@classmethod
|
||||
def resource_help(cls, config):
|
||||
return (
|
||||
"\n\nYou can adjust the resource requests of RLlib agents by "
|
||||
"setting `num_workers` and other configs. See the "
|
||||
"DEFAULT_CONFIG defined by each agent for more info.\n\n"
|
||||
"The config of this agent is: " + json.dumps(config))
|
||||
return ("\n\nYou can adjust the resource requests of RLlib agents by "
|
||||
"setting `num_workers` and other configs. See the "
|
||||
"DEFAULT_CONFIG defined by each agent for more info.\n\n"
|
||||
"The config of this agent is: " + json.dumps(config))
|
||||
|
||||
def __init__(
|
||||
self, config=None, env=None, logger_creator=None):
|
||||
def __init__(self, config=None, env=None, logger_creator=None):
|
||||
"""Initialize an RLLib agent.
|
||||
|
||||
Args:
|
||||
@@ -235,8 +236,8 @@ class Agent(Trainable):
|
||||
obs = self.local_evaluator.filters["default"](
|
||||
observation, update=False)
|
||||
return self.local_evaluator.for_policy(
|
||||
lambda p: p.compute_single_action(
|
||||
obs, state, is_training=False)[0])
|
||||
lambda p: p.compute_single_action(obs, state, is_training=False)[0]
|
||||
)
|
||||
|
||||
|
||||
class _MockAgent(Agent):
|
||||
@@ -257,8 +258,10 @@ class _MockAgent(Agent):
|
||||
and (self.config["persistent_error"] or not self.restored):
|
||||
raise Exception("mock error")
|
||||
return TrainingResult(
|
||||
episode_reward_mean=10, episode_len_mean=10,
|
||||
timesteps_this_iter=10, info={})
|
||||
episode_reward_mean=10,
|
||||
episode_len_mean=10,
|
||||
timesteps_this_iter=10,
|
||||
info={})
|
||||
|
||||
def _save(self, checkpoint_dir):
|
||||
path = os.path.join(checkpoint_dir, "mock_agent.pkl")
|
||||
@@ -299,9 +302,11 @@ class _SigmoidFakeData(_MockAgent):
|
||||
v = np.tanh(float(i) / self.config["width"])
|
||||
v *= self.config["height"]
|
||||
return TrainingResult(
|
||||
episode_reward_mean=v, episode_len_mean=v,
|
||||
episode_reward_mean=v,
|
||||
episode_len_mean=v,
|
||||
timesteps_this_iter=self.config["iter_timesteps"],
|
||||
time_this_iter_s=self.config["iter_time"], info={})
|
||||
time_this_iter_s=self.config["iter_time"],
|
||||
info={})
|
||||
|
||||
|
||||
class _ParameterTuningAgent(_MockAgent):
|
||||
@@ -320,7 +325,8 @@ class _ParameterTuningAgent(_MockAgent):
|
||||
episode_reward_mean=self.config["reward_amt"] * self.iteration,
|
||||
episode_len_mean=self.config["reward_amt"],
|
||||
timesteps_this_iter=self.config["iter_timesteps"],
|
||||
time_this_iter_s=self.config["iter_time"], info={})
|
||||
time_this_iter_s=self.config["iter_time"],
|
||||
info={})
|
||||
|
||||
|
||||
def get_agent_class(alg):
|
||||
@@ -363,5 +369,4 @@ def get_agent_class(alg):
|
||||
elif alg == "__parameter_tuning":
|
||||
return _ParameterTuningAgent
|
||||
else:
|
||||
raise Exception(
|
||||
("Unknown algorithm {}.").format(alg))
|
||||
raise Exception(("Unknown algorithm {}.").format(alg))
|
||||
|
||||
@@ -57,28 +57,31 @@ class BCAgent(Agent):
|
||||
else:
|
||||
num_gpus_per_worker = 0
|
||||
return Resources(
|
||||
cpu=1, gpu=cf["gpu"] and 1 or 0,
|
||||
cpu=1,
|
||||
gpu=cf["gpu"] and 1 or 0,
|
||||
extra_cpu=cf["num_workers"],
|
||||
extra_gpu=num_gpus_per_worker * cf["num_workers"])
|
||||
|
||||
def _init(self):
|
||||
self.local_evaluator = BCEvaluator(
|
||||
self.env_creator, self.config, self.logdir)
|
||||
self.local_evaluator = BCEvaluator(self.env_creator, self.config,
|
||||
self.logdir)
|
||||
if self.config["use_gpu_for_workers"]:
|
||||
remote_cls = GPURemoteBCEvaluator
|
||||
else:
|
||||
remote_cls = RemoteBCEvaluator
|
||||
self.remote_evaluators = [
|
||||
remote_cls.remote(self.env_creator, self.config, self.logdir)
|
||||
for _ in range(self.config["num_workers"])]
|
||||
self.optimizer = AsyncGradientsOptimizer(
|
||||
self.local_evaluator, self.remote_evaluators,
|
||||
self.config["optimizer"])
|
||||
for _ in range(self.config["num_workers"])
|
||||
]
|
||||
self.optimizer = AsyncGradientsOptimizer(self.local_evaluator,
|
||||
self.remote_evaluators,
|
||||
self.config["optimizer"])
|
||||
|
||||
def _train(self):
|
||||
self.optimizer.step()
|
||||
metric_lists = [re.get_metrics.remote() for re in
|
||||
self.remote_evaluators]
|
||||
metric_lists = [
|
||||
re.get_metrics.remote() for re in self.remote_evaluators
|
||||
]
|
||||
total_samples = 0
|
||||
total_loss = 0
|
||||
for metrics in metric_lists:
|
||||
|
||||
@@ -14,8 +14,8 @@ from ray.rllib.models import ModelCatalog
|
||||
|
||||
class BCEvaluator(EvaluatorInterface):
|
||||
def __init__(self, env_creator, config, logdir):
|
||||
env = ModelCatalog.get_preprocessor_as_wrapper(env_creator(
|
||||
config["env_config"]), config["model"])
|
||||
env = ModelCatalog.get_preprocessor_as_wrapper(
|
||||
env_creator(config["env_config"]), config["model"])
|
||||
self.dataset = ExperienceDataset(config["dataset_path"])
|
||||
self.policy = BCPolicy(env.observation_space, env.action_space, config)
|
||||
self.config = config
|
||||
@@ -27,8 +27,10 @@ class BCEvaluator(EvaluatorInterface):
|
||||
|
||||
def compute_gradients(self, samples):
|
||||
gradient, info = self.policy.compute_gradients(samples)
|
||||
self.metrics_queue.put(
|
||||
{"num_samples": info["num_samples"], "loss": info["loss"]})
|
||||
self.metrics_queue.put({
|
||||
"num_samples": info["num_samples"],
|
||||
"loss": info["loss"]
|
||||
})
|
||||
return gradient, {}
|
||||
|
||||
def apply_gradients(self, grads):
|
||||
@@ -42,8 +44,7 @@ class BCEvaluator(EvaluatorInterface):
|
||||
|
||||
def save(self):
|
||||
weights = self.get_weights()
|
||||
return pickle.dumps({
|
||||
"weights": weights})
|
||||
return pickle.dumps({"weights": weights})
|
||||
|
||||
def restore(self, objs):
|
||||
objs = pickle.loads(objs)
|
||||
|
||||
@@ -21,8 +21,9 @@ class ExperienceDataset(object):
|
||||
elements.
|
||||
The file must be available on each machine used by a BCEvaluator.
|
||||
"""
|
||||
self._dataset = list(itertools.chain.from_iterable(
|
||||
pickle.load(open(dataset_path, "rb"))))
|
||||
self._dataset = list(
|
||||
itertools.chain.from_iterable(
|
||||
pickle.load(open(dataset_path, "rb"))))
|
||||
|
||||
def sample(self, batch_size):
|
||||
indexes = np.random.choice(len(self._dataset), batch_size)
|
||||
|
||||
@@ -23,8 +23,8 @@ class BCPolicy(object):
|
||||
self.x = tf.placeholder(tf.float32, [None] + list(obs_space.shape))
|
||||
dist_class, self.logit_dim = ModelCatalog.get_action_dist(
|
||||
ac_space, self.config["model"])
|
||||
self._model = ModelCatalog.get_model(
|
||||
self.x, self.logit_dim, self.config["model"])
|
||||
self._model = ModelCatalog.get_model(self.x, self.logit_dim,
|
||||
self.config["model"])
|
||||
self.logits = self._model.outputs
|
||||
self.curr_dist = dist_class(self.logits)
|
||||
self.sample = self.curr_dist.sample()
|
||||
@@ -33,17 +33,16 @@ class BCPolicy(object):
|
||||
|
||||
def setup_loss(self, action_space):
|
||||
if isinstance(action_space, gym.spaces.Box):
|
||||
self.ac = tf.placeholder(tf.float32,
|
||||
[None] + list(action_space.shape),
|
||||
name="ac")
|
||||
self.ac = tf.placeholder(
|
||||
tf.float32, [None] + list(action_space.shape), name="ac")
|
||||
elif isinstance(action_space, gym.spaces.Discrete):
|
||||
self.ac = tf.placeholder(tf.int64, [None], name="ac")
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
"action space" + str(type(action_space)) +
|
||||
"currently not supported")
|
||||
raise NotImplementedError("action space" +
|
||||
str(type(action_space)) +
|
||||
"currently not supported")
|
||||
log_prob = self.curr_dist.logp(self.ac)
|
||||
self.pi_loss = - tf.reduce_sum(log_prob)
|
||||
self.pi_loss = -tf.reduce_sum(log_prob)
|
||||
self.loss = self.pi_loss
|
||||
|
||||
def setup_gradients(self):
|
||||
@@ -62,11 +61,14 @@ class BCPolicy(object):
|
||||
self.summary_op = tf.summary.merge_all()
|
||||
|
||||
# TODO(rliaw): Can consider exposing these parameters
|
||||
self.sess = tf.Session(graph=self.g, config=tf.ConfigProto(
|
||||
intra_op_parallelism_threads=1, inter_op_parallelism_threads=2,
|
||||
gpu_options=tf.GPUOptions(allow_growth=True)))
|
||||
self.variables = ray.experimental.TensorFlowVariables(self.loss,
|
||||
self.sess)
|
||||
self.sess = tf.Session(
|
||||
graph=self.g,
|
||||
config=tf.ConfigProto(
|
||||
intra_op_parallelism_threads=1,
|
||||
inter_op_parallelism_threads=2,
|
||||
gpu_options=tf.GPUOptions(allow_growth=True)))
|
||||
self.variables = ray.experimental.TensorFlowVariables(
|
||||
self.loss, self.sess)
|
||||
self.sess.run(tf.global_variables_initializer())
|
||||
|
||||
def compute_gradients(self, samples):
|
||||
@@ -82,15 +84,14 @@ class BCPolicy(object):
|
||||
[self.loss, self.grads, self.summary_op], feed_dict=feed_dict)
|
||||
info["summary"] = summ
|
||||
else:
|
||||
loss, grad = self.sess.run([self.loss, self.grads],
|
||||
feed_dict=feed_dict)
|
||||
loss, grad = self.sess.run(
|
||||
[self.loss, self.grads], feed_dict=feed_dict)
|
||||
info["num_samples"] = len(samples)
|
||||
info["loss"] = loss
|
||||
return grad, info
|
||||
|
||||
def apply_gradients(self, grads):
|
||||
feed_dict = {self.grads[i]: grads[i]
|
||||
for i in range(len(grads))}
|
||||
feed_dict = {self.grads[i]: grads[i] for i in range(len(grads))}
|
||||
self.sess.run(self._apply_gradients, feed_dict=feed_dict)
|
||||
|
||||
def get_weights(self):
|
||||
|
||||
@@ -9,13 +9,12 @@ APEX_DDPG_DEFAULT_CONFIG = merge_dicts(
|
||||
DDPG_CONFIG,
|
||||
{
|
||||
"optimizer_class": "AsyncSamplesOptimizer",
|
||||
"optimizer":
|
||||
merge_dicts(
|
||||
DDPG_CONFIG["optimizer"], {
|
||||
"max_weight_sync_delay": 400,
|
||||
"num_replay_buffer_shards": 4,
|
||||
"debug": False
|
||||
}),
|
||||
"optimizer": merge_dicts(
|
||||
DDPG_CONFIG["optimizer"], {
|
||||
"max_weight_sync_delay": 400,
|
||||
"num_replay_buffer_shards": 4,
|
||||
"debug": False
|
||||
}),
|
||||
"n_step": 3,
|
||||
"num_workers": 32,
|
||||
"buffer_size": 2000000,
|
||||
|
||||
@@ -118,9 +118,9 @@ class DDPGAgent(DQNAgent):
|
||||
if self.config["per_worker_exploration"]:
|
||||
assert self.config["num_workers"] > 1, \
|
||||
"This requires multiple workers"
|
||||
return ConstantSchedule(
|
||||
self.config["noise_scale"] * 0.4 **
|
||||
(1 + worker_index / float(self.config["num_workers"] - 1) * 7))
|
||||
exponent = (
|
||||
1 + worker_index / float(self.config["num_workers"] - 1) * 7)
|
||||
return ConstantSchedule(self.config["noise_scale"] * 0.4**exponent)
|
||||
else:
|
||||
return LinearSchedule(
|
||||
schedule_timesteps=int(self.config["exploration_fraction"] *
|
||||
|
||||
@@ -14,7 +14,6 @@ from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.utils.error import UnsupportedSpaceException
|
||||
from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph
|
||||
|
||||
|
||||
A_SCOPE = "a_func"
|
||||
P_SCOPE = "p_func"
|
||||
P_TARGET_SCOPE = "target_p_func"
|
||||
@@ -26,8 +25,8 @@ class PNetwork(object):
|
||||
"""Maps an observations (i.e., state) to an action where each entry takes
|
||||
value from (0, 1) due to the sigmoid function."""
|
||||
|
||||
def __init__(
|
||||
self, model, dim_actions, hiddens=[64, 64], activation="relu"):
|
||||
def __init__(self, model, dim_actions, hiddens=[64, 64],
|
||||
activation="relu"):
|
||||
action_out = model.last_layer
|
||||
activation = tf.nn.__dict__[activation]
|
||||
for hidden in hiddens:
|
||||
@@ -44,9 +43,14 @@ class ActionNetwork(object):
|
||||
for training, thus ignoring the batch_size issue when constructing a
|
||||
stochastic action."""
|
||||
|
||||
def __init__(
|
||||
self, p_values, low_action, high_action, stochastic, eps,
|
||||
theta=0.15, sigma=0.2):
|
||||
def __init__(self,
|
||||
p_values,
|
||||
low_action,
|
||||
high_action,
|
||||
stochastic,
|
||||
eps,
|
||||
theta=0.15,
|
||||
sigma=0.2):
|
||||
|
||||
# shape is [None, dim_action]
|
||||
deterministic_actions = (
|
||||
@@ -65,15 +69,16 @@ class ActionNetwork(object):
|
||||
stochastic_actions = deterministic_actions + eps * (
|
||||
high_action - low_action) * exploration_value
|
||||
|
||||
self.actions = tf.cond(
|
||||
stochastic, lambda: stochastic_actions,
|
||||
lambda: deterministic_actions)
|
||||
self.actions = tf.cond(stochastic, lambda: stochastic_actions,
|
||||
lambda: deterministic_actions)
|
||||
|
||||
|
||||
class QNetwork(object):
|
||||
def __init__(
|
||||
self, model, action_inputs,
|
||||
hiddens=[64, 64], activation="relu"):
|
||||
def __init__(self,
|
||||
model,
|
||||
action_inputs,
|
||||
hiddens=[64, 64],
|
||||
activation="relu"):
|
||||
q_out = tf.concat([model.last_layer, action_inputs], axis=1)
|
||||
activation = tf.nn.__dict__[activation]
|
||||
for hidden in hiddens:
|
||||
@@ -84,14 +89,21 @@ class QNetwork(object):
|
||||
|
||||
|
||||
class ActorCriticLoss(object):
|
||||
def __init__(
|
||||
self, q_t, q_tp1, q_tp0, importance_weights, rewards, done_mask,
|
||||
gamma=0.99, n_step=1, use_huber=False, huber_threshold=1.0):
|
||||
def __init__(self,
|
||||
q_t,
|
||||
q_tp1,
|
||||
q_tp0,
|
||||
importance_weights,
|
||||
rewards,
|
||||
done_mask,
|
||||
gamma=0.99,
|
||||
n_step=1,
|
||||
use_huber=False,
|
||||
huber_threshold=1.0):
|
||||
|
||||
q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1)
|
||||
|
||||
q_tp1_best = tf.squeeze(
|
||||
input=q_tp1, axis=len(q_tp1.shape) - 1)
|
||||
q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1)
|
||||
q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best
|
||||
|
||||
# compute RHS of bellman equation
|
||||
@@ -131,27 +143,20 @@ class DDPGPolicyGraph(TFPolicyGraph):
|
||||
|
||||
def _build_q_network(obs, actions):
|
||||
return QNetwork(
|
||||
ModelCatalog.get_model(obs, 1, config["model"]),
|
||||
actions,
|
||||
ModelCatalog.get_model(obs, 1, config["model"]), actions,
|
||||
config["critic_hiddens"],
|
||||
config["critic_hidden_activation"]).value
|
||||
|
||||
def _build_p_network(obs):
|
||||
return PNetwork(
|
||||
ModelCatalog.get_model(obs, 1, config["model"]),
|
||||
dim_actions,
|
||||
ModelCatalog.get_model(obs, 1, config["model"]), dim_actions,
|
||||
config["actor_hiddens"],
|
||||
config["actor_hidden_activation"]).action_scores
|
||||
|
||||
def _build_action_network(p_values, stochastic, eps):
|
||||
return ActionNetwork(
|
||||
p_values,
|
||||
low_action,
|
||||
high_action,
|
||||
stochastic,
|
||||
eps,
|
||||
config["exploration_theta"],
|
||||
config["exploration_sigma"]).actions
|
||||
return ActionNetwork(p_values, low_action, high_action, stochastic,
|
||||
eps, config["exploration_theta"],
|
||||
config["exploration_sigma"]).actions
|
||||
|
||||
# Action inputs
|
||||
self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
|
||||
@@ -263,9 +268,13 @@ class DDPGPolicyGraph(TFPolicyGraph):
|
||||
("weights", self.importance_weights),
|
||||
]
|
||||
TFPolicyGraph.__init__(
|
||||
self, observation_space, action_space, self.sess,
|
||||
self,
|
||||
observation_space,
|
||||
action_space,
|
||||
self.sess,
|
||||
obs_input=self.cur_observations,
|
||||
action_sampler=self.output_actions, loss=self.loss.total_loss,
|
||||
action_sampler=self.output_actions,
|
||||
loss=self.loss.total_loss,
|
||||
loss_inputs=self.loss_inputs)
|
||||
self.sess.run(tf.global_variables_initializer())
|
||||
|
||||
@@ -294,10 +303,10 @@ class DDPGPolicyGraph(TFPolicyGraph):
|
||||
self.loss.actor_loss, var_list=self.p_func_vars)
|
||||
critic_grads_and_vars = self.critic_optimizer.compute_gradients(
|
||||
self.loss.critic_loss, var_list=self.q_func_vars)
|
||||
actor_grads_and_vars = [
|
||||
(g, v) for (g, v) in actor_grads_and_vars if g is not None]
|
||||
critic_grads_and_vars = [
|
||||
(g, v) for (g, v) in critic_grads_and_vars if g is not None]
|
||||
actor_grads_and_vars = [(g, v) for (g, v) in actor_grads_and_vars
|
||||
if g is not None]
|
||||
critic_grads_and_vars = [(g, v) for (g, v) in critic_grads_and_vars
|
||||
if g is not None]
|
||||
grads_and_vars = actor_grads_and_vars + critic_grads_and_vars
|
||||
return grads_and_vars
|
||||
|
||||
|
||||
@@ -10,13 +10,12 @@ APEX_DEFAULT_CONFIG = merge_dicts(
|
||||
DQN_CONFIG,
|
||||
{
|
||||
"optimizer_class": "AsyncSamplesOptimizer",
|
||||
"optimizer":
|
||||
merge_dicts(
|
||||
DQN_CONFIG["optimizer"], {
|
||||
"max_weight_sync_delay": 400,
|
||||
"num_replay_buffer_shards": 4,
|
||||
"debug": False
|
||||
}),
|
||||
"optimizer": merge_dicts(
|
||||
DQN_CONFIG["optimizer"], {
|
||||
"max_weight_sync_delay": 400,
|
||||
"num_replay_buffer_shards": 4,
|
||||
"debug": False
|
||||
}),
|
||||
"n_step": 3,
|
||||
"gpu": True,
|
||||
"num_workers": 32,
|
||||
|
||||
@@ -13,11 +13,11 @@ from ray.rllib.evaluation.metrics import collect_metrics
|
||||
from ray.rllib.utils.schedules import ConstantSchedule, LinearSchedule
|
||||
from ray.tune.trial import Resources
|
||||
|
||||
|
||||
OPTIMIZER_SHARED_CONFIGS = [
|
||||
"buffer_size", "prioritized_replay", "prioritized_replay_alpha",
|
||||
"prioritized_replay_beta", "prioritized_replay_eps", "sample_batch_size",
|
||||
"train_batch_size", "learning_starts", "clip_rewards"]
|
||||
"train_batch_size", "learning_starts", "clip_rewards"
|
||||
]
|
||||
|
||||
DEFAULT_CONFIG = with_common_config({
|
||||
# === Model ===
|
||||
@@ -110,7 +110,8 @@ class DQNAgent(Agent):
|
||||
def default_resource_request(cls, config):
|
||||
cf = dict(cls._default_config, **config)
|
||||
return Resources(
|
||||
cpu=1, gpu=cf["gpu"] and 1 or 0,
|
||||
cpu=1,
|
||||
gpu=cf["gpu"] and 1 or 0,
|
||||
extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"],
|
||||
extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])
|
||||
|
||||
@@ -123,7 +124,8 @@ class DQNAgent(Agent):
|
||||
self.exploration0 = self._make_exploration_schedule(0)
|
||||
self.explorations = [
|
||||
self._make_exploration_schedule(i)
|
||||
for i in range(self.config["num_workers"])]
|
||||
for i in range(self.config["num_workers"])
|
||||
]
|
||||
|
||||
for k in OPTIMIZER_SHARED_CONFIGS:
|
||||
if k not in self.config["optimizer"]:
|
||||
@@ -132,9 +134,10 @@ class DQNAgent(Agent):
|
||||
self.local_evaluator = self.make_local_evaluator(
|
||||
self.env_creator, self._policy_graph)
|
||||
self.remote_evaluators = self.make_remote_evaluators(
|
||||
self.env_creator, self._policy_graph, self.config["num_workers"],
|
||||
{"num_cpus": self.config["num_cpus_per_worker"],
|
||||
"num_gpus": self.config["num_gpus_per_worker"]})
|
||||
self.env_creator, self._policy_graph, self.config["num_workers"], {
|
||||
"num_cpus": self.config["num_cpus_per_worker"],
|
||||
"num_gpus": self.config["num_gpus_per_worker"]
|
||||
})
|
||||
self.optimizer = getattr(optimizers, self.config["optimizer_class"])(
|
||||
self.local_evaluator, self.remote_evaluators,
|
||||
self.config["optimizer"])
|
||||
@@ -147,14 +150,12 @@ class DQNAgent(Agent):
|
||||
if self.config["per_worker_exploration"]:
|
||||
assert self.config["num_workers"] > 1, \
|
||||
"This requires multiple workers"
|
||||
return ConstantSchedule(
|
||||
0.4 ** (
|
||||
1 + worker_index / float(
|
||||
self.config["num_workers"] - 1) * 7))
|
||||
exponent = (
|
||||
1 + worker_index / float(self.config["num_workers"] - 1) * 7)
|
||||
return ConstantSchedule(0.4**exponent)
|
||||
return LinearSchedule(
|
||||
schedule_timesteps=int(
|
||||
self.config["exploration_fraction"] *
|
||||
self.config["schedule_max_timesteps"]),
|
||||
schedule_timesteps=int(self.config["exploration_fraction"] *
|
||||
self.config["schedule_max_timesteps"]),
|
||||
initial_p=1.0,
|
||||
final_p=self.config["exploration_final_eps"])
|
||||
|
||||
@@ -191,8 +192,8 @@ class DQNAgent(Agent):
|
||||
self.local_evaluator,
|
||||
self.remote_evaluators[-len(self.remote_evaluators) // 3:])
|
||||
else:
|
||||
result = collect_metrics(
|
||||
self.local_evaluator, self.remote_evaluators)
|
||||
result = collect_metrics(self.local_evaluator,
|
||||
self.remote_evaluators)
|
||||
|
||||
return result._replace(
|
||||
timesteps_this_iter=self.global_timestep - start_timestep,
|
||||
@@ -208,14 +209,14 @@ class DQNAgent(Agent):
|
||||
ev.__ray_terminate__.remote()
|
||||
|
||||
def _save(self, checkpoint_dir):
|
||||
checkpoint_path = os.path.join(
|
||||
checkpoint_dir, "checkpoint-{}".format(self.iteration))
|
||||
checkpoint_path = os.path.join(checkpoint_dir,
|
||||
"checkpoint-{}".format(self.iteration))
|
||||
extra_data = [
|
||||
self.local_evaluator.save(),
|
||||
ray.get([e.save.remote() for e in self.remote_evaluators]),
|
||||
self.optimizer.save(),
|
||||
self.num_target_updates,
|
||||
self.last_target_update_ts]
|
||||
self.optimizer.save(), self.num_target_updates,
|
||||
self.last_target_update_ts
|
||||
]
|
||||
pickle.dump(extra_data, open(checkpoint_path + ".extra_data", "wb"))
|
||||
return checkpoint_path
|
||||
|
||||
@@ -223,8 +224,9 @@ class DQNAgent(Agent):
|
||||
extra_data = pickle.load(open(checkpoint_path + ".extra_data", "rb"))
|
||||
self.local_evaluator.restore(extra_data[0])
|
||||
ray.get([
|
||||
e.restore.remote(d) for (d, e)
|
||||
in zip(extra_data[1], self.remote_evaluators)])
|
||||
e.restore.remote(d)
|
||||
for (d, e) in zip(extra_data[1], self.remote_evaluators)
|
||||
])
|
||||
self.optimizer.restore(extra_data[2])
|
||||
self.num_target_updates = extra_data[3]
|
||||
self.last_target_update_ts = extra_data[4]
|
||||
|
||||
@@ -13,7 +13,6 @@ from ray.rllib.evaluation.sample_batch import SampleBatch
|
||||
from ray.rllib.utils.error import UnsupportedSpaceException
|
||||
from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph
|
||||
|
||||
|
||||
Q_SCOPE = "q_func"
|
||||
Q_TARGET_SCOPE = "target_q_func"
|
||||
|
||||
@@ -33,7 +32,8 @@ class QNetwork(object):
|
||||
state_out = model.last_layer
|
||||
for hidden in hiddens:
|
||||
state_out = layers.fully_connected(
|
||||
state_out, num_outputs=hidden,
|
||||
state_out,
|
||||
num_outputs=hidden,
|
||||
activation_fn=tf.nn.relu)
|
||||
state_score = layers.fully_connected(
|
||||
state_out, num_outputs=1, activation_fn=None)
|
||||
@@ -50,26 +50,32 @@ class QValuePolicy(object):
|
||||
deterministic_actions = tf.argmax(q_values, axis=1)
|
||||
batch_size = tf.shape(observations)[0]
|
||||
random_actions = tf.random_uniform(
|
||||
tf.stack([batch_size]), minval=0, maxval=num_actions,
|
||||
tf.stack([batch_size]),
|
||||
minval=0,
|
||||
maxval=num_actions,
|
||||
dtype=tf.int64)
|
||||
chose_random = tf.random_uniform(
|
||||
tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
|
||||
stochastic_actions = tf.where(
|
||||
chose_random, random_actions, deterministic_actions)
|
||||
self.action = tf.cond(
|
||||
stochastic, lambda: stochastic_actions,
|
||||
lambda: deterministic_actions)
|
||||
stochastic_actions = tf.where(chose_random, random_actions,
|
||||
deterministic_actions)
|
||||
self.action = tf.cond(stochastic, lambda: stochastic_actions,
|
||||
lambda: deterministic_actions)
|
||||
|
||||
|
||||
class QLoss(object):
|
||||
def __init__(
|
||||
self, q_t_selected, q_tp1_best, importance_weights, rewards,
|
||||
done_mask, gamma=0.99, n_step=1):
|
||||
def __init__(self,
|
||||
q_t_selected,
|
||||
q_tp1_best,
|
||||
importance_weights,
|
||||
rewards,
|
||||
done_mask,
|
||||
gamma=0.99,
|
||||
n_step=1):
|
||||
|
||||
q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best
|
||||
|
||||
# compute RHS of bellman equation
|
||||
q_t_selected_target = rewards + gamma ** n_step * q_tp1_best_masked
|
||||
q_t_selected_target = rewards + gamma**n_step * q_tp1_best_masked
|
||||
|
||||
# compute the error (potentially clipped)
|
||||
self.td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
|
||||
@@ -91,14 +97,14 @@ class DQNPolicyGraph(TFPolicyGraph):
|
||||
|
||||
def _build_q_network(obs):
|
||||
return QNetwork(
|
||||
ModelCatalog.get_model(obs, 1, config["model"]),
|
||||
num_actions, config["dueling"], config["hiddens"]).value
|
||||
ModelCatalog.get_model(obs, 1, config["model"]), num_actions,
|
||||
config["dueling"], config["hiddens"]).value
|
||||
|
||||
# Action inputs
|
||||
self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
|
||||
self.eps = tf.placeholder(tf.float32, (), name="eps")
|
||||
self.cur_observations = tf.placeholder(
|
||||
tf.float32, shape=(None,) + observation_space.shape)
|
||||
tf.float32, shape=(None, ) + observation_space.shape)
|
||||
|
||||
# Action Q network
|
||||
with tf.variable_scope(Q_SCOPE) as scope:
|
||||
@@ -106,20 +112,17 @@ class DQNPolicyGraph(TFPolicyGraph):
|
||||
self.q_func_vars = _scope_vars(scope.name)
|
||||
|
||||
# Action outputs
|
||||
self.output_actions = QValuePolicy(
|
||||
q_values,
|
||||
self.cur_observations,
|
||||
num_actions,
|
||||
self.stochastic,
|
||||
self.eps).action
|
||||
self.output_actions = QValuePolicy(q_values, self.cur_observations,
|
||||
num_actions, self.stochastic,
|
||||
self.eps).action
|
||||
|
||||
# Replay inputs
|
||||
self.obs_t = tf.placeholder(
|
||||
tf.float32, shape=(None,) + observation_space.shape)
|
||||
tf.float32, shape=(None, ) + observation_space.shape)
|
||||
self.act_t = tf.placeholder(tf.int32, [None], name="action")
|
||||
self.rew_t = tf.placeholder(tf.float32, [None], name="reward")
|
||||
self.obs_tp1 = tf.placeholder(
|
||||
tf.float32, shape=(None,) + observation_space.shape)
|
||||
tf.float32, shape=(None, ) + observation_space.shape)
|
||||
self.done_mask = tf.placeholder(tf.float32, [None], name="done")
|
||||
self.importance_weights = tf.placeholder(
|
||||
tf.float32, [None], name="weight")
|
||||
@@ -134,8 +137,8 @@ class DQNPolicyGraph(TFPolicyGraph):
|
||||
self.target_q_func_vars = _scope_vars(scope.name)
|
||||
|
||||
# q scores for actions which we know were selected in the given state.
|
||||
q_t_selected = tf.reduce_sum(
|
||||
q_t * tf.one_hot(self.act_t, num_actions), 1)
|
||||
q_t_selected = tf.reduce_sum(q_t * tf.one_hot(self.act_t, num_actions),
|
||||
1)
|
||||
|
||||
# compute estimate of best possible value starting from state at t + 1
|
||||
if config["double_q"]:
|
||||
@@ -143,20 +146,20 @@ class DQNPolicyGraph(TFPolicyGraph):
|
||||
q_tp1_using_online_net = _build_q_network(self.obs_tp1)
|
||||
q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
|
||||
q_tp1_best = tf.reduce_sum(
|
||||
q_tp1 * tf.one_hot(
|
||||
q_tp1_best_using_online_net, num_actions), 1)
|
||||
q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions),
|
||||
1)
|
||||
else:
|
||||
q_tp1_best = tf.reduce_max(q_tp1, 1)
|
||||
|
||||
self.loss = QLoss(
|
||||
q_t_selected, q_tp1_best, self.importance_weights,
|
||||
self.rew_t, self.done_mask, config["gamma"], config["n_step"])
|
||||
self.loss = QLoss(q_t_selected, q_tp1_best, self.importance_weights,
|
||||
self.rew_t, self.done_mask, config["gamma"],
|
||||
config["n_step"])
|
||||
|
||||
# update_target_fn will be called periodically to copy Q network to
|
||||
# target Q network
|
||||
update_target_expr = []
|
||||
for var, var_target in zip(
|
||||
sorted(self.q_func_vars, key=lambda v: v.name),
|
||||
sorted(self.q_func_vars, key=lambda v: v.name),
|
||||
sorted(self.target_q_func_vars, key=lambda v: v.name)):
|
||||
update_target_expr.append(var_target.assign(var))
|
||||
self.update_target_expr = tf.group(*update_target_expr)
|
||||
@@ -172,9 +175,13 @@ class DQNPolicyGraph(TFPolicyGraph):
|
||||
("weights", self.importance_weights),
|
||||
]
|
||||
TFPolicyGraph.__init__(
|
||||
self, observation_space, action_space, self.sess,
|
||||
self,
|
||||
observation_space,
|
||||
action_space,
|
||||
self.sess,
|
||||
obs_input=self.cur_observations,
|
||||
action_sampler=self.output_actions, loss=self.loss.loss,
|
||||
action_sampler=self.output_actions,
|
||||
loss=self.loss.loss,
|
||||
loss_inputs=self.loss_inputs)
|
||||
self.sess.run(tf.global_variables_initializer())
|
||||
|
||||
@@ -184,13 +191,14 @@ class DQNPolicyGraph(TFPolicyGraph):
|
||||
def gradients(self, optimizer):
|
||||
if self.config["grad_norm_clipping"] is not None:
|
||||
grads_and_vars = _minimize_and_clip(
|
||||
optimizer, self.loss.loss, var_list=self.q_func_vars,
|
||||
optimizer,
|
||||
self.loss.loss,
|
||||
var_list=self.q_func_vars,
|
||||
clip_val=self.config["grad_norm_clipping"])
|
||||
else:
|
||||
grads_and_vars = optimizer.compute_gradients(
|
||||
self.loss.loss, var_list=self.q_func_vars)
|
||||
grads_and_vars = [
|
||||
(g, v) for (g, v) in grads_and_vars if g is not None]
|
||||
grads_and_vars = [(g, v) for (g, v) in grads_and_vars if g is not None]
|
||||
return grads_and_vars
|
||||
|
||||
def extra_compute_action_feed_dict(self):
|
||||
@@ -207,8 +215,8 @@ class DQNPolicyGraph(TFPolicyGraph):
|
||||
def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
|
||||
return _postprocess_dqn(self, sample_batch)
|
||||
|
||||
def compute_td_error(
|
||||
self, obs_t, act_t, rew_t, obs_tp1, done_mask, importance_weights):
|
||||
def compute_td_error(self, obs_t, act_t, rew_t, obs_tp1, done_mask,
|
||||
importance_weights):
|
||||
td_err = self.sess.run(
|
||||
self.loss.td_error,
|
||||
feed_dict={
|
||||
@@ -254,7 +262,7 @@ def adjust_nstep(n_step, gamma, obs, actions, rewards, new_obs, dones):
|
||||
continue # episode end
|
||||
for j in range(1, n_step):
|
||||
new_obs[i] = new_obs[i + j]
|
||||
rewards[i] += gamma ** j * rewards[i + j]
|
||||
rewards[i] += gamma**j * rewards[i + j]
|
||||
if dones[i + j]:
|
||||
break # episode end
|
||||
# truncate ends of the trajectory
|
||||
@@ -266,24 +274,29 @@ def adjust_nstep(n_step, gamma, obs, actions, rewards, new_obs, dones):
|
||||
def _postprocess_dqn(policy_graph, sample_batch):
|
||||
obs, actions, rewards, new_obs, dones = [
|
||||
list(x) for x in sample_batch.columns(
|
||||
["obs", "actions", "rewards", "new_obs", "dones"])]
|
||||
["obs", "actions", "rewards", "new_obs", "dones"])
|
||||
]
|
||||
|
||||
# N-step Q adjustments
|
||||
if policy_graph.config["n_step"] > 1:
|
||||
adjust_nstep(
|
||||
policy_graph.config["n_step"], policy_graph.config["gamma"],
|
||||
obs, actions, rewards, new_obs, dones)
|
||||
adjust_nstep(policy_graph.config["n_step"],
|
||||
policy_graph.config["gamma"], obs, actions, rewards,
|
||||
new_obs, dones)
|
||||
|
||||
batch = SampleBatch({
|
||||
"obs": obs, "actions": actions, "rewards": rewards,
|
||||
"new_obs": new_obs, "dones": dones,
|
||||
"weights": np.ones_like(rewards)})
|
||||
"obs": obs,
|
||||
"actions": actions,
|
||||
"rewards": rewards,
|
||||
"new_obs": new_obs,
|
||||
"dones": dones,
|
||||
"weights": np.ones_like(rewards)
|
||||
})
|
||||
|
||||
# Prioritize on the worker side
|
||||
if batch.count > 0 and policy_graph.config["worker_side_prioritization"]:
|
||||
td_errors = policy_graph.compute_td_error(
|
||||
batch["obs"], batch["actions"], batch["rewards"],
|
||||
batch["new_obs"], batch["dones"], batch["weights"])
|
||||
batch["obs"], batch["actions"], batch["rewards"], batch["new_obs"],
|
||||
batch["dones"], batch["weights"])
|
||||
new_priorities = (
|
||||
np.abs(td_errors) + policy_graph.config["prioritized_replay_eps"])
|
||||
batch.data["weights"] = new_priorities
|
||||
@@ -295,8 +308,7 @@ def _huber_loss(x, delta=1.0):
|
||||
"""Reference: https://en.wikipedia.org/wiki/Huber_loss"""
|
||||
return tf.where(
|
||||
tf.abs(x) < delta,
|
||||
tf.square(x) * 0.5,
|
||||
delta * (tf.abs(x) - 0.5 * delta))
|
||||
tf.square(x) * 0.5, delta * (tf.abs(x) - 0.5 * delta))
|
||||
|
||||
|
||||
def _minimize_and_clip(optimizer, objective, var_list, clip_val=10):
|
||||
|
||||
@@ -20,13 +20,11 @@ from ray.rllib.agents.es import policies
|
||||
from ray.rllib.agents.es import tabular_logger as tlogger
|
||||
from ray.rllib.agents.es import utils
|
||||
|
||||
|
||||
Result = namedtuple("Result", [
|
||||
"noise_indices", "noisy_returns", "sign_noisy_returns", "noisy_lengths",
|
||||
"eval_returns", "eval_lengths"
|
||||
])
|
||||
|
||||
|
||||
DEFAULT_CONFIG = {
|
||||
'l2_coeff': 0.005,
|
||||
'noise_stdev': 0.02,
|
||||
@@ -64,7 +62,11 @@ class SharedNoiseTable(object):
|
||||
|
||||
@ray.remote
|
||||
class Worker(object):
|
||||
def __init__(self, config, policy_params, env_creator, noise,
|
||||
def __init__(self,
|
||||
config,
|
||||
policy_params,
|
||||
env_creator,
|
||||
noise,
|
||||
min_task_runtime=0.2):
|
||||
self.min_task_runtime = min_task_runtime
|
||||
self.config = config
|
||||
@@ -82,7 +84,9 @@ class Worker(object):
|
||||
|
||||
def rollout(self, timestep_limit, add_noise=True):
|
||||
rollout_rewards, rollout_length = policies.rollout(
|
||||
self.policy, self.env, timestep_limit=timestep_limit,
|
||||
self.policy,
|
||||
self.env,
|
||||
timestep_limit=timestep_limit,
|
||||
add_noise=add_noise)
|
||||
return rollout_rewards, rollout_length
|
||||
|
||||
@@ -95,8 +99,8 @@ class Worker(object):
|
||||
|
||||
# Perform some rollouts with noise.
|
||||
task_tstart = time.time()
|
||||
while (len(noise_indices) == 0 or
|
||||
time.time() - task_tstart < self.min_task_runtime):
|
||||
while (len(noise_indices) == 0
|
||||
or time.time() - task_tstart < self.min_task_runtime):
|
||||
|
||||
if np.random.uniform() < self.config["eval_prob"]:
|
||||
# Do an evaluation run with no perturbation.
|
||||
@@ -122,7 +126,8 @@ class Worker(object):
|
||||
noise_indices.append(noise_index)
|
||||
returns.append([rewards_pos.sum(), rewards_neg.sum()])
|
||||
sign_returns.append(
|
||||
[np.sign(rewards_pos).sum(), np.sign(rewards_neg).sum()])
|
||||
[np.sign(rewards_pos).sum(),
|
||||
np.sign(rewards_neg).sum()])
|
||||
lengths.append([lengths_pos, lengths_neg])
|
||||
|
||||
return Result(
|
||||
@@ -146,9 +151,7 @@ class ESAgent(Agent):
|
||||
return Resources(cpu=1, gpu=0, extra_cpu=cf["num_workers"])
|
||||
|
||||
def _init(self):
|
||||
policy_params = {
|
||||
"action_noise_std": 0.01
|
||||
}
|
||||
policy_params = {"action_noise_std": 0.01}
|
||||
|
||||
env = self.env_creator(self.config["env_config"])
|
||||
from ray.rllib import models
|
||||
@@ -168,9 +171,9 @@ class ESAgent(Agent):
|
||||
# Create the actors.
|
||||
print("Creating actors.")
|
||||
self.workers = [
|
||||
Worker.remote(
|
||||
self.config, policy_params, self.env_creator, noise_id)
|
||||
for _ in range(self.config["num_workers"])]
|
||||
Worker.remote(self.config, policy_params, self.env_creator,
|
||||
noise_id) for _ in range(self.config["num_workers"])
|
||||
]
|
||||
|
||||
self.episodes_so_far = 0
|
||||
self.timesteps_so_far = 0
|
||||
@@ -180,21 +183,20 @@ class ESAgent(Agent):
|
||||
num_episodes, num_timesteps = 0, 0
|
||||
results = []
|
||||
while num_episodes < min_episodes or num_timesteps < min_timesteps:
|
||||
print(
|
||||
"Collected {} episodes {} timesteps so far this iter".format(
|
||||
num_episodes, num_timesteps))
|
||||
rollout_ids = [worker.do_rollouts.remote(theta_id)
|
||||
for worker in self.workers]
|
||||
print("Collected {} episodes {} timesteps so far this iter".format(
|
||||
num_episodes, num_timesteps))
|
||||
rollout_ids = [
|
||||
worker.do_rollouts.remote(theta_id) for worker in self.workers
|
||||
]
|
||||
# Get the results of the rollouts.
|
||||
for result in ray.get(rollout_ids):
|
||||
results.append(result)
|
||||
# Update the number of episodes and the number of timesteps
|
||||
# keeping in mind that result.noisy_lengths is a list of lists,
|
||||
# where the inner lists have length 2.
|
||||
num_episodes += sum(len(pair) for pair
|
||||
in result.noisy_lengths)
|
||||
num_timesteps += sum(sum(pair) for pair
|
||||
in result.noisy_lengths)
|
||||
num_episodes += sum(len(pair) for pair in result.noisy_lengths)
|
||||
num_timesteps += sum(
|
||||
sum(pair) for pair in result.noisy_lengths)
|
||||
return results, num_episodes, num_timesteps
|
||||
|
||||
def _train(self):
|
||||
@@ -209,8 +211,7 @@ class ESAgent(Agent):
|
||||
# Use the actors to do rollouts, note that we pass in the ID of the
|
||||
# policy weights.
|
||||
results, num_episodes, num_timesteps = self._collect_results(
|
||||
theta_id,
|
||||
config["episodes_per_batch"],
|
||||
theta_id, config["episodes_per_batch"],
|
||||
config["timesteps_per_batch"])
|
||||
|
||||
all_noise_indices = []
|
||||
@@ -255,13 +256,11 @@ class ESAgent(Agent):
|
||||
for index in noise_indices),
|
||||
batch_size=500)
|
||||
g /= noisy_returns.size
|
||||
assert (
|
||||
g.shape == (self.policy.num_params,) and
|
||||
g.dtype == np.float32 and
|
||||
count == len(noise_indices))
|
||||
assert (g.shape == (self.policy.num_params, ) and g.dtype == np.float32
|
||||
and count == len(noise_indices))
|
||||
# Compute the new weights theta.
|
||||
theta, update_ratio = self.optimizer.update(
|
||||
-g + config["l2_coeff"] * theta)
|
||||
theta, update_ratio = self.optimizer.update(-g +
|
||||
config["l2_coeff"] * theta)
|
||||
# Set the new weights in the local copy of the policy.
|
||||
self.policy.set_weights(theta)
|
||||
|
||||
@@ -313,13 +312,10 @@ class ESAgent(Agent):
|
||||
w.__ray_terminate__.remote()
|
||||
|
||||
def _save(self, checkpoint_dir):
|
||||
checkpoint_path = os.path.join(
|
||||
checkpoint_dir, "checkpoint-{}".format(self.iteration))
|
||||
checkpoint_path = os.path.join(checkpoint_dir,
|
||||
"checkpoint-{}".format(self.iteration))
|
||||
weights = self.policy.get_weights()
|
||||
objects = [
|
||||
weights,
|
||||
self.episodes_so_far,
|
||||
self.timesteps_so_far]
|
||||
objects = [weights, self.episodes_so_far, self.timesteps_so_far]
|
||||
pickle.dump(objects, open(checkpoint_path, "wb"))
|
||||
return checkpoint_path
|
||||
|
||||
|
||||
@@ -48,8 +48,8 @@ class Adam(Optimizer):
|
||||
self.v = np.zeros(self.dim, dtype=np.float32)
|
||||
|
||||
def _compute_step(self, globalg):
|
||||
a = self.stepsize * (np.sqrt(1 - self.beta2 ** self.t) /
|
||||
(1 - self.beta1 ** self.t))
|
||||
a = self.stepsize * (np.sqrt(1 - self.beta2**self.t) /
|
||||
(1 - self.beta1**self.t))
|
||||
self.m = self.beta1 * self.m + (1 - self.beta1) * globalg
|
||||
self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg)
|
||||
step = -a * self.m / (np.sqrt(self.v) + self.epsilon)
|
||||
|
||||
@@ -21,8 +21,8 @@ def rollout(policy, env, timestep_limit=None, add_noise=False):
|
||||
noise drawn from that stream. Otherwise, no action noise will be added.
|
||||
"""
|
||||
env_timestep_limit = env.spec.max_episode_steps
|
||||
timestep_limit = (env_timestep_limit if timestep_limit is None
|
||||
else min(timestep_limit, env_timestep_limit))
|
||||
timestep_limit = (env_timestep_limit if timestep_limit is None else min(
|
||||
timestep_limit, env_timestep_limit))
|
||||
rews = []
|
||||
t = 0
|
||||
observation = env.reset()
|
||||
@@ -38,16 +38,16 @@ def rollout(policy, env, timestep_limit=None, add_noise=False):
|
||||
|
||||
|
||||
class GenericPolicy(object):
|
||||
def __init__(self, sess, action_space, preprocessor,
|
||||
observation_filter, action_noise_std):
|
||||
def __init__(self, sess, action_space, preprocessor, observation_filter,
|
||||
action_noise_std):
|
||||
self.sess = sess
|
||||
self.action_space = action_space
|
||||
self.action_noise_std = action_noise_std
|
||||
self.preprocessor = preprocessor
|
||||
self.observation_filter = get_filter(
|
||||
observation_filter, self.preprocessor.shape)
|
||||
self.inputs = tf.placeholder(
|
||||
tf.float32, [None] + list(self.preprocessor.shape))
|
||||
self.observation_filter = get_filter(observation_filter,
|
||||
self.preprocessor.shape)
|
||||
self.inputs = tf.placeholder(tf.float32,
|
||||
[None] + list(self.preprocessor.shape))
|
||||
|
||||
# Policy network.
|
||||
dist_class, dist_dim = ModelCatalog.get_action_dist(
|
||||
@@ -59,16 +59,16 @@ class GenericPolicy(object):
|
||||
self.variables = ray.experimental.TensorFlowVariables(
|
||||
model.outputs, self.sess)
|
||||
|
||||
self.num_params = sum(np.prod(variable.shape.as_list())
|
||||
for _, variable
|
||||
in self.variables.variables.items())
|
||||
self.num_params = sum(
|
||||
np.prod(variable.shape.as_list())
|
||||
for _, variable in self.variables.variables.items())
|
||||
self.sess.run(tf.global_variables_initializer())
|
||||
|
||||
def compute(self, observation, add_noise=False, update=True):
|
||||
observation = self.preprocessor.transform(observation)
|
||||
observation = self.observation_filter(observation[None], update=update)
|
||||
action = self.sess.run(self.sampler,
|
||||
feed_dict={self.inputs: observation})
|
||||
action = self.sess.run(
|
||||
self.sampler, feed_dict={self.inputs: observation})
|
||||
if add_noise and isinstance(self.action_space, gym.spaces.Box):
|
||||
action += np.random.randn(*action.shape) * self.action_noise_std
|
||||
return action
|
||||
|
||||
@@ -25,6 +25,7 @@ DISABLED = 50
|
||||
|
||||
class TbWriter(object):
|
||||
"""Based on SummaryWriter, but changed to allow for a different prefix."""
|
||||
|
||||
def __init__(self, dir, prefix):
|
||||
self.dir = dir
|
||||
# Start at 1, because EvWriter automatically generates an object with
|
||||
@@ -34,9 +35,10 @@ class TbWriter(object):
|
||||
compat.as_bytes(os.path.join(dir, prefix)))
|
||||
|
||||
def write_values(self, key2val):
|
||||
summary = tf.Summary(value=[tf.Summary.Value(tag=k,
|
||||
simple_value=float(v))
|
||||
for (k, v) in key2val.items()])
|
||||
summary = tf.Summary(value=[
|
||||
tf.Summary.Value(tag=k, simple_value=float(v))
|
||||
for (k, v) in key2val.items()
|
||||
])
|
||||
event = event_pb2.Event(wall_time=time.time(), summary=summary)
|
||||
event.step = self.step
|
||||
self.evwriter.WriteEvent(event)
|
||||
@@ -46,6 +48,7 @@ class TbWriter(object):
|
||||
def close(self):
|
||||
self.evwriter.Close()
|
||||
|
||||
|
||||
# API
|
||||
|
||||
|
||||
@@ -126,6 +129,7 @@ def get_expt_dir():
|
||||
sys.stderr.write("get_expt_dir() is Deprecated. Switch to get_dir()\n")
|
||||
return get_dir()
|
||||
|
||||
|
||||
# Backend
|
||||
|
||||
|
||||
@@ -167,8 +171,8 @@ class _Logger(object):
|
||||
# Write to all text outputs
|
||||
self._write_text("-" * (keywidth + valwidth + 7), "\n")
|
||||
for (key, val) in key2str.items():
|
||||
self._write_text("| ", key, " " * (keywidth - len(key)),
|
||||
" | ", val, " " * (valwidth - len(val)), " |\n")
|
||||
self._write_text("| ", key, " " * (keywidth - len(key)), " | ",
|
||||
val, " " * (valwidth - len(val)), " |\n")
|
||||
self._write_text("-" * (keywidth + valwidth + 7), "\n")
|
||||
for f in self.text_outputs:
|
||||
try:
|
||||
@@ -202,7 +206,7 @@ class _Logger(object):
|
||||
# Misc
|
||||
|
||||
def _do_log(self, *args):
|
||||
self._write_text(*args + ('\n',))
|
||||
self._write_text(*args + ('\n', ))
|
||||
for f in self.text_outputs:
|
||||
try:
|
||||
f.flush()
|
||||
|
||||
@@ -31,8 +31,9 @@ def compute_centered_ranks(x):
|
||||
def make_session(single_threaded):
|
||||
if not single_threaded:
|
||||
return tf.Session()
|
||||
return tf.Session(config=tf.ConfigProto(inter_op_parallelism_threads=1,
|
||||
intra_op_parallelism_threads=1))
|
||||
return tf.Session(
|
||||
config=tf.ConfigProto(
|
||||
inter_op_parallelism_threads=1, intra_op_parallelism_threads=1))
|
||||
|
||||
|
||||
def itergroups(items, group_size):
|
||||
@@ -50,10 +51,11 @@ def itergroups(items, group_size):
|
||||
def batched_weighted_sum(weights, vecs, batch_size):
|
||||
total = 0
|
||||
num_items_summed = 0
|
||||
for batch_weights, batch_vecs in zip(itergroups(weights, batch_size),
|
||||
itergroups(vecs, batch_size)):
|
||||
for batch_weights, batch_vecs in zip(
|
||||
itergroups(weights, batch_size), itergroups(vecs, batch_size)):
|
||||
assert len(batch_weights) == len(batch_vecs) <= batch_size
|
||||
total += np.dot(np.asarray(batch_weights, dtype=np.float32),
|
||||
np.asarray(batch_vecs, dtype=np.float32))
|
||||
total += np.dot(
|
||||
np.asarray(batch_weights, dtype=np.float32),
|
||||
np.asarray(batch_vecs, dtype=np.float32))
|
||||
num_items_summed += len(batch_weights)
|
||||
return total, num_items_summed
|
||||
|
||||
@@ -7,7 +7,6 @@ from ray.rllib.agents.pg.pg_policy_graph import PGPolicyGraph
|
||||
from ray.rllib.optimizers import SyncSamplesOptimizer
|
||||
from ray.tune.trial import Resources
|
||||
|
||||
|
||||
DEFAULT_CONFIG = with_common_config({
|
||||
# No remote workers by default
|
||||
"num_workers": 0,
|
||||
@@ -43,9 +42,9 @@ class PGAgent(Agent):
|
||||
self.env_creator, PGPolicyGraph)
|
||||
self.remote_evaluators = self.make_remote_evaluators(
|
||||
self.env_creator, PGPolicyGraph, self.config["num_workers"], {})
|
||||
self.optimizer = SyncSamplesOptimizer(
|
||||
self.local_evaluator, self.remote_evaluators,
|
||||
self.config["optimizer"])
|
||||
self.optimizer = SyncSamplesOptimizer(self.local_evaluator,
|
||||
self.remote_evaluators,
|
||||
self.config["optimizer"])
|
||||
|
||||
def _train(self):
|
||||
prev_steps = self.optimizer.num_steps_sampled
|
||||
|
||||
@@ -42,9 +42,15 @@ class PGPolicyGraph(TFPolicyGraph):
|
||||
]
|
||||
|
||||
TFPolicyGraph.__init__(
|
||||
self, obs_space, action_space, sess, obs_input=obs,
|
||||
action_sampler=action_dist.sample(), loss=loss,
|
||||
loss_inputs=loss_in, state_inputs=self.model.state_in,
|
||||
self,
|
||||
obs_space,
|
||||
action_space,
|
||||
sess,
|
||||
obs_input=obs,
|
||||
action_sampler=action_dist.sample(),
|
||||
loss=loss,
|
||||
loss_inputs=loss_in,
|
||||
state_inputs=self.model.state_in,
|
||||
state_outputs=self.model.state_out,
|
||||
seq_lens=self.model.seq_lens,
|
||||
max_seq_len=config["model"]["max_seq_len"])
|
||||
|
||||
@@ -77,28 +77,30 @@ class PPOAgent(Agent):
|
||||
self.local_evaluator = self.make_local_evaluator(
|
||||
self.env_creator, PPOPolicyGraph)
|
||||
self.remote_evaluators = self.make_remote_evaluators(
|
||||
self.env_creator, PPOPolicyGraph, self.config["num_workers"],
|
||||
{"num_cpus": self.config["num_cpus_per_worker"],
|
||||
"num_gpus": self.config["num_gpus_per_worker"]})
|
||||
self.env_creator, PPOPolicyGraph, self.config["num_workers"], {
|
||||
"num_cpus": self.config["num_cpus_per_worker"],
|
||||
"num_gpus": self.config["num_gpus_per_worker"]
|
||||
})
|
||||
if self.config["simple_optimizer"]:
|
||||
self.optimizer = SyncSamplesOptimizer(
|
||||
self.local_evaluator, self.remote_evaluators,
|
||||
{"num_sgd_iter": self.config["num_sgd_iter"]})
|
||||
else:
|
||||
self.optimizer = LocalMultiGPUOptimizer(
|
||||
self.local_evaluator, self.remote_evaluators,
|
||||
{"sgd_batch_size": self.config["sgd_batchsize"],
|
||||
"sgd_stepsize": self.config["sgd_stepsize"],
|
||||
"num_sgd_iter": self.config["num_sgd_iter"],
|
||||
"timesteps_per_batch": self.config["timesteps_per_batch"],
|
||||
"standardize_fields": ["advantages"]})
|
||||
self.local_evaluator, self.remote_evaluators, {
|
||||
"sgd_batch_size": self.config["sgd_batchsize"],
|
||||
"sgd_stepsize": self.config["sgd_stepsize"],
|
||||
"num_sgd_iter": self.config["num_sgd_iter"],
|
||||
"timesteps_per_batch": self.config["timesteps_per_batch"],
|
||||
"standardize_fields": ["advantages"]
|
||||
})
|
||||
|
||||
def _train(self):
|
||||
prev_steps = self.optimizer.num_steps_sampled
|
||||
fetches = self.optimizer.step()
|
||||
self.local_evaluator.for_policy(lambda pi: pi.update_kl(fetches["kl"]))
|
||||
FilterManager.synchronize(
|
||||
self.local_evaluator.filters, self.remote_evaluators)
|
||||
FilterManager.synchronize(self.local_evaluator.filters,
|
||||
self.remote_evaluators)
|
||||
res = self.optimizer.collect_metrics()
|
||||
res = res._replace(
|
||||
timesteps_this_iter=self.optimizer.num_steps_sampled - prev_steps,
|
||||
@@ -115,9 +117,7 @@ class PPOAgent(Agent):
|
||||
"checkpoint-{}".format(self.iteration))
|
||||
agent_state = ray.get(
|
||||
[a.save.remote() for a in self.remote_evaluators])
|
||||
extra_data = [
|
||||
self.local_evaluator.save(),
|
||||
agent_state]
|
||||
extra_data = [self.local_evaluator.save(), agent_state]
|
||||
pickle.dump(extra_data, open(checkpoint_path + ".extra_data", "wb"))
|
||||
return checkpoint_path
|
||||
|
||||
@@ -126,4 +126,5 @@ class PPOAgent(Agent):
|
||||
self.local_evaluator.restore(extra_data[0])
|
||||
ray.get([
|
||||
a.restore.remote(o)
|
||||
for (a, o) in zip(self.remote_evaluators, extra_data[1])])
|
||||
for (a, o) in zip(self.remote_evaluators, extra_data[1])
|
||||
])
|
||||
|
||||
@@ -10,10 +10,20 @@ from ray.rllib.models.catalog import ModelCatalog
|
||||
|
||||
|
||||
class PPOLoss(object):
|
||||
def __init__(
|
||||
self, action_space, value_targets, advantages, actions, logits,
|
||||
vf_preds, curr_action_dist, value_fn, cur_kl_coeff,
|
||||
entropy_coeff=0, clip_param=0.1, vf_loss_coeff=1.0, use_gae=True):
|
||||
def __init__(self,
|
||||
action_space,
|
||||
value_targets,
|
||||
advantages,
|
||||
actions,
|
||||
logits,
|
||||
vf_preds,
|
||||
curr_action_dist,
|
||||
value_fn,
|
||||
cur_kl_coeff,
|
||||
entropy_coeff=0,
|
||||
clip_param=0.1,
|
||||
vf_loss_coeff=1.0,
|
||||
use_gae=True):
|
||||
"""Constructs the loss for Proximal Policy Objective.
|
||||
|
||||
Arguments:
|
||||
@@ -51,31 +61,33 @@ class PPOLoss(object):
|
||||
|
||||
surrogate_loss = tf.minimum(
|
||||
advantages * logp_ratio,
|
||||
advantages * tf.clip_by_value(
|
||||
logp_ratio, 1 - clip_param, 1 + clip_param))
|
||||
advantages * tf.clip_by_value(logp_ratio, 1 - clip_param,
|
||||
1 + clip_param))
|
||||
self.mean_policy_loss = tf.reduce_mean(-surrogate_loss)
|
||||
|
||||
if use_gae:
|
||||
vf_loss1 = tf.square(value_fn - value_targets)
|
||||
vf_clipped = vf_preds + tf.clip_by_value(
|
||||
value_fn - vf_preds, -clip_param, clip_param)
|
||||
vf_clipped = vf_preds + tf.clip_by_value(value_fn - vf_preds,
|
||||
-clip_param, clip_param)
|
||||
vf_loss2 = tf.square(vf_clipped - value_targets)
|
||||
vf_loss = tf.maximum(vf_loss1, vf_loss2)
|
||||
self.mean_vf_loss = tf.reduce_mean(vf_loss)
|
||||
loss = tf.reduce_mean(
|
||||
-surrogate_loss + cur_kl_coeff*action_kl +
|
||||
vf_loss_coeff*vf_loss - entropy_coeff*curr_entropy)
|
||||
loss = tf.reduce_mean(-surrogate_loss + cur_kl_coeff * action_kl +
|
||||
vf_loss_coeff * vf_loss -
|
||||
entropy_coeff * curr_entropy)
|
||||
else:
|
||||
self.mean_vf_loss = tf.constant(0.0)
|
||||
loss = tf.reduce_mean(
|
||||
-surrogate_loss + cur_kl_coeff*action_kl -
|
||||
entropy_coeff*curr_entropy)
|
||||
loss = tf.reduce_mean(-surrogate_loss + cur_kl_coeff * action_kl -
|
||||
entropy_coeff * curr_entropy)
|
||||
self.loss = loss
|
||||
|
||||
|
||||
class PPOPolicyGraph(TFPolicyGraph):
|
||||
def __init__(self, observation_space, action_space,
|
||||
config, existing_inputs=None):
|
||||
def __init__(self,
|
||||
observation_space,
|
||||
action_space,
|
||||
config,
|
||||
existing_inputs=None):
|
||||
"""
|
||||
Arguments:
|
||||
observation_space: Environment observation space specification.
|
||||
@@ -98,16 +110,18 @@ class PPOPolicyGraph(TFPolicyGraph):
|
||||
existing_seq_lens = existing_inputs[-1]
|
||||
else:
|
||||
obs_ph = tf.placeholder(
|
||||
tf.float32, name="obs", shape=(None,)+observation_space.shape)
|
||||
tf.float32,
|
||||
name="obs",
|
||||
shape=(None, ) + observation_space.shape)
|
||||
adv_ph = tf.placeholder(
|
||||
tf.float32, name="advantages", shape=(None,))
|
||||
tf.float32, name="advantages", shape=(None, ))
|
||||
act_ph = ModelCatalog.get_action_placeholder(action_space)
|
||||
logits_ph = tf.placeholder(
|
||||
tf.float32, name="logits", shape=(None, logit_dim))
|
||||
vf_preds_ph = tf.placeholder(
|
||||
tf.float32, name="vf_preds", shape=(None,))
|
||||
tf.float32, name="vf_preds", shape=(None, ))
|
||||
value_targets_ph = tf.placeholder(
|
||||
tf.float32, name="value_targets", shape=(None,))
|
||||
tf.float32, name="value_targets", shape=(None, ))
|
||||
existing_state_in = None
|
||||
existing_seq_lens = None
|
||||
|
||||
@@ -120,13 +134,19 @@ class PPOPolicyGraph(TFPolicyGraph):
|
||||
("vf_preds", vf_preds_ph),
|
||||
]
|
||||
self.model = ModelCatalog.get_model(
|
||||
obs_ph, logit_dim, self.config["model"],
|
||||
state_in=existing_state_in, seq_lens=existing_seq_lens)
|
||||
obs_ph,
|
||||
logit_dim,
|
||||
self.config["model"],
|
||||
state_in=existing_state_in,
|
||||
seq_lens=existing_seq_lens)
|
||||
|
||||
# KL Coefficient
|
||||
self.kl_coeff = tf.get_variable(
|
||||
initializer=tf.constant_initializer(self.kl_coeff_val),
|
||||
name="kl_coeff", shape=(), trainable=False, dtype=tf.float32)
|
||||
name="kl_coeff",
|
||||
shape=(),
|
||||
trainable=False,
|
||||
dtype=tf.float32)
|
||||
|
||||
self.logits = self.model.outputs
|
||||
curr_action_dist = dist_cls(self.logits)
|
||||
@@ -146,20 +166,32 @@ class PPOPolicyGraph(TFPolicyGraph):
|
||||
self.value_function = tf.zeros(shape=tf.shape(obs_ph)[:1])
|
||||
|
||||
self.loss_obj = PPOLoss(
|
||||
action_space, value_targets_ph, adv_ph, act_ph,
|
||||
logits_ph, vf_preds_ph,
|
||||
curr_action_dist, self.value_function, self.kl_coeff,
|
||||
action_space,
|
||||
value_targets_ph,
|
||||
adv_ph,
|
||||
act_ph,
|
||||
logits_ph,
|
||||
vf_preds_ph,
|
||||
curr_action_dist,
|
||||
self.value_function,
|
||||
self.kl_coeff,
|
||||
entropy_coeff=self.config["entropy_coeff"],
|
||||
clip_param=self.config["clip_param"],
|
||||
vf_loss_coeff=self.config["kl_target"],
|
||||
use_gae=self.config["use_gae"])
|
||||
|
||||
TFPolicyGraph.__init__(
|
||||
self, observation_space, action_space,
|
||||
self.sess, obs_input=obs_ph,
|
||||
action_sampler=self.sampler, loss=self.loss_obj.loss,
|
||||
loss_inputs=self.loss_in, state_inputs=self.model.state_in,
|
||||
state_outputs=self.model.state_out, seq_lens=self.model.seq_lens,
|
||||
self,
|
||||
observation_space,
|
||||
action_space,
|
||||
self.sess,
|
||||
obs_input=obs_ph,
|
||||
action_sampler=self.sampler,
|
||||
loss=self.loss_obj.loss,
|
||||
loss_inputs=self.loss_in,
|
||||
state_inputs=self.model.state_in,
|
||||
state_outputs=self.model.state_out,
|
||||
seq_lens=self.model.seq_lens,
|
||||
max_seq_len=config["model"]["max_seq_len"])
|
||||
|
||||
self.sess.run(tf.global_variables_initializer())
|
||||
@@ -167,7 +199,9 @@ class PPOPolicyGraph(TFPolicyGraph):
|
||||
def copy(self, existing_inputs):
|
||||
"""Creates a copy of self using existing input placeholders."""
|
||||
return PPOPolicyGraph(
|
||||
None, self.action_space, self.config,
|
||||
None,
|
||||
self.action_space,
|
||||
self.config,
|
||||
existing_inputs=existing_inputs)
|
||||
|
||||
def extra_compute_action_fetches(self):
|
||||
@@ -193,8 +227,11 @@ class PPOPolicyGraph(TFPolicyGraph):
|
||||
def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
|
||||
last_r = 0.0
|
||||
batch = compute_advantages(
|
||||
sample_batch, last_r, self.config["gamma"],
|
||||
self.config["lambda"], use_gae=self.config["use_gae"])
|
||||
sample_batch,
|
||||
last_r,
|
||||
self.config["gamma"],
|
||||
self.config["lambda"],
|
||||
use_gae=self.config["use_gae"])
|
||||
return batch
|
||||
|
||||
def optimizer(self):
|
||||
|
||||
@@ -13,7 +13,6 @@ from ray.rllib.agents.ppo.utils import flatten, concatenate
|
||||
|
||||
# TODO(ekl): move to rllib/models dir
|
||||
class DistributionsTest(unittest.TestCase):
|
||||
|
||||
def testCategorical(self):
|
||||
num_samples = 100000
|
||||
logits = tf.placeholder(tf.float32, shape=(None, 10))
|
||||
@@ -32,10 +31,11 @@ class DistributionsTest(unittest.TestCase):
|
||||
|
||||
|
||||
class UtilsTest(unittest.TestCase):
|
||||
|
||||
def testFlatten(self):
|
||||
d = {"s": np.array([[[1, -1], [2, -2]], [[3, -3], [4, -4]]]),
|
||||
"a": np.array([[[5], [-5]], [[6], [-6]]])}
|
||||
d = {
|
||||
"s": np.array([[[1, -1], [2, -2]], [[3, -3], [4, -4]]]),
|
||||
"a": np.array([[[5], [-5]], [[6], [-6]]])
|
||||
}
|
||||
flat = flatten(d.copy(), start=0, stop=2)
|
||||
assert_allclose(d["s"][0][0][:], flat["s"][0][:])
|
||||
assert_allclose(d["s"][0][1][:], flat["s"][1][:])
|
||||
|
||||
@@ -16,7 +16,7 @@ def flatten(weights, start=0, stop=2):
|
||||
stop: The ending index.
|
||||
"""
|
||||
for key, val in weights.items():
|
||||
new_shape = val.shape[0:start] + (-1,) + val.shape[stop:]
|
||||
new_shape = val.shape[0:start] + (-1, ) + val.shape[stop:]
|
||||
weights[key] = val.reshape(new_shape)
|
||||
return weights
|
||||
|
||||
|
||||
+9
-6
@@ -286,8 +286,8 @@ class _MultiAgentEnvState(object):
|
||||
self.reset()
|
||||
|
||||
def poll(self):
|
||||
obs, rew, dones, info = (
|
||||
self.last_obs, self.last_rewards, self.last_dones, self.last_infos)
|
||||
obs, rew, dones, info = (self.last_obs, self.last_rewards,
|
||||
self.last_dones, self.last_infos)
|
||||
self.last_obs = {}
|
||||
self.last_rewards = {}
|
||||
self.last_dones = {"__all__": False}
|
||||
@@ -303,10 +303,13 @@ class _MultiAgentEnvState(object):
|
||||
def reset(self):
|
||||
self.last_obs = self.env.reset()
|
||||
self.last_rewards = {
|
||||
agent_id: None for agent_id in self.last_obs.keys()}
|
||||
agent_id: None
|
||||
for agent_id in self.last_obs.keys()
|
||||
}
|
||||
self.last_dones = {
|
||||
agent_id: False for agent_id in self.last_obs.keys()}
|
||||
self.last_infos = {
|
||||
agent_id: {} for agent_id in self.last_obs.keys()}
|
||||
agent_id: False
|
||||
for agent_id in self.last_obs.keys()
|
||||
}
|
||||
self.last_infos = {agent_id: {} for agent_id in self.last_obs.keys()}
|
||||
self.last_dones["__all__"] = False
|
||||
return self.last_obs
|
||||
|
||||
+2
-3
@@ -28,8 +28,7 @@ class NoopResetEnv(gym.Wrapper):
|
||||
if self.override_num_noops is not None:
|
||||
noops = self.override_num_noops
|
||||
else:
|
||||
noops = self.unwrapped.np_random.randint(
|
||||
1, self.noop_max + 1)
|
||||
noops = self.unwrapped.np_random.randint(1, self.noop_max + 1)
|
||||
assert noops > 0
|
||||
obs = None
|
||||
for _ in range(noops):
|
||||
@@ -121,7 +120,7 @@ class MaxAndSkipEnv(gym.Wrapper):
|
||||
gym.Wrapper.__init__(self, env)
|
||||
# most recent raw observations (for max pooling across time steps)
|
||||
self._obs_buffer = np.zeros(
|
||||
(2,)+env.observation_space.shape, dtype=np.uint8)
|
||||
(2, ) + env.observation_space.shape, dtype=np.uint8)
|
||||
self._skip = skip
|
||||
|
||||
def step(self, action):
|
||||
|
||||
Vendored
+1
-2
@@ -71,8 +71,7 @@ class _VectorizedGymEnv(VectorEnv):
|
||||
self.envs = existing_envs
|
||||
self.num_envs = num_envs
|
||||
if make_env and num_envs > 1:
|
||||
self.resetter = _AsyncResetter(
|
||||
make_env, int(self.num_envs ** 0.5))
|
||||
self.resetter = _AsyncResetter(make_env, int(self.num_envs**0.5))
|
||||
else:
|
||||
self.resetter = _SimpleResetter(make_env)
|
||||
while len(self.envs) < self.num_envs:
|
||||
|
||||
@@ -15,9 +15,10 @@ def collect_metrics(local_evaluator, remote_evaluators=[]):
|
||||
episode_rewards = []
|
||||
episode_lengths = []
|
||||
policy_rewards = collections.defaultdict(list)
|
||||
metric_lists = ray.get(
|
||||
[a.apply.remote(lambda ev: ev.sampler.get_metrics())
|
||||
for a in remote_evaluators])
|
||||
metric_lists = ray.get([
|
||||
a.apply.remote(lambda ev: ev.sampler.get_metrics())
|
||||
for a in remote_evaluators
|
||||
])
|
||||
metric_lists.append(local_evaluator.sampler.get_metrics())
|
||||
for metrics in metric_lists:
|
||||
for episode in metrics:
|
||||
|
||||
@@ -82,24 +82,23 @@ class PolicyEvaluator(EvaluatorInterface):
|
||||
def as_remote(cls, num_cpus=None, num_gpus=None):
|
||||
return ray.remote(num_cpus=num_cpus, num_gpus=num_gpus)(cls)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
env_creator,
|
||||
policy_graph,
|
||||
policy_mapping_fn=None,
|
||||
tf_session_creator=None,
|
||||
batch_steps=100,
|
||||
batch_mode="truncate_episodes",
|
||||
episode_horizon=None,
|
||||
preprocessor_pref="rllib",
|
||||
sample_async=False,
|
||||
compress_observations=False,
|
||||
num_envs=1,
|
||||
observation_filter="NoFilter",
|
||||
env_config=None,
|
||||
model_config=None,
|
||||
policy_config=None,
|
||||
worker_index=0):
|
||||
def __init__(self,
|
||||
env_creator,
|
||||
policy_graph,
|
||||
policy_mapping_fn=None,
|
||||
tf_session_creator=None,
|
||||
batch_steps=100,
|
||||
batch_mode="truncate_episodes",
|
||||
episode_horizon=None,
|
||||
preprocessor_pref="rllib",
|
||||
sample_async=False,
|
||||
compress_observations=False,
|
||||
num_envs=1,
|
||||
observation_filter="NoFilter",
|
||||
env_config=None,
|
||||
model_config=None,
|
||||
policy_config=None,
|
||||
worker_index=0):
|
||||
"""Initialize a policy evaluator.
|
||||
|
||||
Arguments:
|
||||
@@ -157,8 +156,8 @@ class PolicyEvaluator(EvaluatorInterface):
|
||||
policy_config = policy_config or {}
|
||||
self.policy_config = policy_config
|
||||
model_config = model_config or {}
|
||||
policy_mapping_fn = (
|
||||
policy_mapping_fn or (lambda agent_id: DEFAULT_POLICY_ID))
|
||||
policy_mapping_fn = (policy_mapping_fn
|
||||
or (lambda agent_id: DEFAULT_POLICY_ID))
|
||||
self.env_creator = env_creator
|
||||
self.policy_graph = policy_graph
|
||||
self.batch_steps = batch_steps
|
||||
@@ -170,17 +169,21 @@ class PolicyEvaluator(EvaluatorInterface):
|
||||
isinstance(self.env, ServingEnv) or \
|
||||
isinstance(self.env, MultiAgentEnv) or \
|
||||
isinstance(self.env, AsyncVectorEnv):
|
||||
|
||||
def wrap(env):
|
||||
return env # we can't auto-wrap these env types
|
||||
elif is_atari(self.env) and \
|
||||
"custom_preprocessor" not in model_config and \
|
||||
preprocessor_pref == "deepmind":
|
||||
|
||||
def wrap(env):
|
||||
return wrap_deepmind(env, dim=model_config.get("dim", 80))
|
||||
else:
|
||||
|
||||
def wrap(env):
|
||||
return ModelCatalog.get_preprocessor_as_wrapper(
|
||||
env, model_config)
|
||||
|
||||
self.env = wrap(self.env)
|
||||
|
||||
def make_env():
|
||||
@@ -193,20 +196,21 @@ class PolicyEvaluator(EvaluatorInterface):
|
||||
if tf_session_creator:
|
||||
self.tf_sess = tf_session_creator()
|
||||
else:
|
||||
self.tf_sess = tf.Session(config=tf.ConfigProto(
|
||||
gpu_options=tf.GPUOptions(allow_growth=True)))
|
||||
self.tf_sess = tf.Session(
|
||||
config=tf.ConfigProto(
|
||||
gpu_options=tf.GPUOptions(allow_growth=True)))
|
||||
with self.tf_sess.as_default():
|
||||
self.policy_map = self._build_policy_map(
|
||||
policy_dict, policy_config)
|
||||
else:
|
||||
self.policy_map = self._build_policy_map(
|
||||
policy_dict, policy_config)
|
||||
self.policy_map = self._build_policy_map(policy_dict,
|
||||
policy_config)
|
||||
|
||||
self.multiagent = self.policy_map.keys() != set(DEFAULT_POLICY_ID)
|
||||
|
||||
self.filters = {
|
||||
policy_id: get_filter(
|
||||
observation_filter, policy.observation_space.shape)
|
||||
policy_id: get_filter(observation_filter,
|
||||
policy.observation_space.shape)
|
||||
for (policy_id, policy) in self.policy_map.items()
|
||||
}
|
||||
|
||||
@@ -226,24 +230,34 @@ class PolicyEvaluator(EvaluatorInterface):
|
||||
batch_steps = float("inf") # never cut episodes
|
||||
pack_episodes = False # sampler will return 1 episode per poll
|
||||
else:
|
||||
raise ValueError(
|
||||
"Unsupported batch mode: {}".format(self.batch_mode))
|
||||
raise ValueError("Unsupported batch mode: {}".format(
|
||||
self.batch_mode))
|
||||
if sample_async:
|
||||
self.sampler = AsyncSampler(
|
||||
self.async_env, self.policy_map, policy_mapping_fn,
|
||||
self.filters, batch_steps, horizon=episode_horizon,
|
||||
pack=pack_episodes, tf_sess=self.tf_sess)
|
||||
self.async_env,
|
||||
self.policy_map,
|
||||
policy_mapping_fn,
|
||||
self.filters,
|
||||
batch_steps,
|
||||
horizon=episode_horizon,
|
||||
pack=pack_episodes,
|
||||
tf_sess=self.tf_sess)
|
||||
self.sampler.start()
|
||||
else:
|
||||
self.sampler = SyncSampler(
|
||||
self.async_env, self.policy_map, policy_mapping_fn,
|
||||
self.filters, batch_steps, horizon=episode_horizon,
|
||||
pack=pack_episodes, tf_sess=self.tf_sess)
|
||||
self.async_env,
|
||||
self.policy_map,
|
||||
policy_mapping_fn,
|
||||
self.filters,
|
||||
batch_steps,
|
||||
horizon=episode_horizon,
|
||||
pack=pack_episodes,
|
||||
tf_sess=self.tf_sess)
|
||||
|
||||
def _build_policy_map(self, policy_dict, policy_config):
|
||||
policy_map = {}
|
||||
for name, (cls, obs_space, act_space, conf) in sorted(
|
||||
policy_dict.items()):
|
||||
for name, (cls, obs_space, act_space,
|
||||
conf) in sorted(policy_dict.items()):
|
||||
merged_conf = policy_config.copy()
|
||||
merged_conf.update(conf)
|
||||
with tf.variable_scope(name):
|
||||
@@ -315,7 +329,8 @@ class PolicyEvaluator(EvaluatorInterface):
|
||||
def get_weights(self):
|
||||
return {
|
||||
pid: policy.get_weights()
|
||||
for pid, policy in self.policy_map.items()}
|
||||
for pid, policy in self.policy_map.items()
|
||||
}
|
||||
|
||||
def set_weights(self, weights):
|
||||
for pid, w in weights.items():
|
||||
@@ -351,9 +366,7 @@ class PolicyEvaluator(EvaluatorInterface):
|
||||
builder, grad)
|
||||
for pid, grad in grads.items()
|
||||
}
|
||||
return {
|
||||
k: builder.get(v) for k, v in outputs.items()
|
||||
}
|
||||
return {k: builder.get(v) for k, v in outputs.items()}
|
||||
else:
|
||||
return {
|
||||
pid: self.policy_map[pid].apply_gradients(g)
|
||||
@@ -428,8 +441,9 @@ def _validate_and_canonicalize(policy_graph, env):
|
||||
raise ValueError("policy_graph must be a rllib.PolicyGraph class")
|
||||
else:
|
||||
return {
|
||||
DEFAULT_POLICY_ID: (
|
||||
policy_graph, env.observation_space, env.action_space, {})}
|
||||
DEFAULT_POLICY_ID: (policy_graph, env.observation_space,
|
||||
env.action_space, {})
|
||||
}
|
||||
|
||||
|
||||
def _has_tensorflow_graph(policy_dict):
|
||||
|
||||
@@ -45,7 +45,8 @@ class SampleBatchBuilder(object):
|
||||
"""Returns a sample batch including all previously added values."""
|
||||
|
||||
batch = SampleBatch(
|
||||
{k: to_float_array(v) for k, v in self.buffers.items()})
|
||||
{k: to_float_array(v)
|
||||
for k, v in self.buffers.items()})
|
||||
self.buffers.clear()
|
||||
self.count = 0
|
||||
return batch
|
||||
@@ -69,7 +70,9 @@ class MultiAgentSampleBatchBuilder(object):
|
||||
|
||||
self.policy_map = policy_map
|
||||
self.policy_builders = {
|
||||
k: SampleBatchBuilder() for k in policy_map.keys()}
|
||||
k: SampleBatchBuilder()
|
||||
for k in policy_map.keys()
|
||||
}
|
||||
self.agent_builders = {}
|
||||
self.agent_to_policy = {}
|
||||
self.count = 0 # increment this manually
|
||||
|
||||
@@ -12,12 +12,11 @@ from ray.rllib.evaluation.sample_batch import MultiAgentSampleBatchBuilder, \
|
||||
from ray.rllib.env.async_vector_env import AsyncVectorEnv
|
||||
from ray.rllib.utils.tf_run_builder import TFRunBuilder
|
||||
|
||||
|
||||
RolloutMetrics = namedtuple(
|
||||
"RolloutMetrics", ["episode_length", "episode_reward", "agent_rewards"])
|
||||
|
||||
PolicyEvalData = namedtuple(
|
||||
"PolicyEvalData", ["env_id", "agent_id", "obs", "rnn_state"])
|
||||
PolicyEvalData = namedtuple("PolicyEvalData",
|
||||
["env_id", "agent_id", "obs", "rnn_state"])
|
||||
|
||||
|
||||
class SyncSampler(object):
|
||||
@@ -29,9 +28,15 @@ class SyncSampler(object):
|
||||
This class provides data on invocation, rather than on a separate
|
||||
thread."""
|
||||
|
||||
def __init__(
|
||||
self, env, policies, policy_mapping_fn, obs_filters,
|
||||
num_local_steps, horizon=None, pack=False, tf_sess=None):
|
||||
def __init__(self,
|
||||
env,
|
||||
policies,
|
||||
policy_mapping_fn,
|
||||
obs_filters,
|
||||
num_local_steps,
|
||||
horizon=None,
|
||||
pack=False,
|
||||
tf_sess=None):
|
||||
self.async_vector_env = AsyncVectorEnv.wrap_async(env)
|
||||
self.num_local_steps = num_local_steps
|
||||
self.horizon = horizon
|
||||
@@ -68,9 +73,15 @@ class AsyncSampler(threading.Thread):
|
||||
Note that batch_size is only a unit of measure here. Batches can
|
||||
accumulate and the gradient can be calculated on up to 5 batches."""
|
||||
|
||||
def __init__(
|
||||
self, env, policies, policy_mapping_fn, obs_filters,
|
||||
num_local_steps, horizon=None, pack=False, tf_sess=None):
|
||||
def __init__(self,
|
||||
env,
|
||||
policies,
|
||||
policy_mapping_fn,
|
||||
obs_filters,
|
||||
num_local_steps,
|
||||
horizon=None,
|
||||
pack=False,
|
||||
tf_sess=None):
|
||||
for _, f in obs_filters.items():
|
||||
assert getattr(f, "is_concurrent", False), \
|
||||
"Observation Filter must support concurrent updates."
|
||||
@@ -142,9 +153,14 @@ class AsyncSampler(threading.Thread):
|
||||
return completed
|
||||
|
||||
|
||||
def _env_runner(
|
||||
async_vector_env, policies, policy_mapping_fn, num_local_steps,
|
||||
horizon, obs_filters, pack, tf_sess=None):
|
||||
def _env_runner(async_vector_env,
|
||||
policies,
|
||||
policy_mapping_fn,
|
||||
num_local_steps,
|
||||
horizon,
|
||||
obs_filters,
|
||||
pack,
|
||||
tf_sess=None):
|
||||
"""This implements the common experience collection logic.
|
||||
|
||||
Args:
|
||||
@@ -186,9 +202,11 @@ def _env_runner(
|
||||
else:
|
||||
return MultiAgentSampleBatchBuilder(policies)
|
||||
|
||||
active_episodes = defaultdict(
|
||||
lambda: _MultiAgentEpisode(
|
||||
policies, policy_mapping_fn, get_batch_builder))
|
||||
def new_episode():
|
||||
return _MultiAgentEpisode(policies, policy_mapping_fn,
|
||||
get_batch_builder)
|
||||
|
||||
active_episodes = defaultdict(new_episode)
|
||||
|
||||
while True:
|
||||
# Get observations from all ready agents
|
||||
@@ -213,9 +231,8 @@ def _env_runner(
|
||||
# Check episode termination conditions
|
||||
if dones[env_id]["__all__"] or episode.length >= horizon:
|
||||
all_done = True
|
||||
yield RolloutMetrics(
|
||||
episode.length, episode.total_reward,
|
||||
dict(episode.agent_rewards))
|
||||
yield RolloutMetrics(episode.length, episode.total_reward,
|
||||
dict(episode.agent_rewards))
|
||||
else:
|
||||
all_done = False
|
||||
# At least send an empty dict if not done
|
||||
@@ -228,9 +245,8 @@ def _env_runner(
|
||||
agent_done = bool(all_done or dones[env_id].get(agent_id))
|
||||
if not agent_done:
|
||||
to_eval[policy_id].append(
|
||||
PolicyEvalData(
|
||||
env_id, agent_id, filtered_obs,
|
||||
episode.rnn_state_for(agent_id)))
|
||||
PolicyEvalData(env_id, agent_id, filtered_obs,
|
||||
episode.rnn_state_for(agent_id)))
|
||||
|
||||
last_observation = episode.last_observation_for(agent_id)
|
||||
episode.set_last_observation(agent_id, filtered_obs)
|
||||
@@ -274,13 +290,12 @@ def _env_runner(
|
||||
episode = active_episodes[env_id]
|
||||
for agent_id, raw_obs in resetted_obs.items():
|
||||
policy_id = episode.policy_for(agent_id)
|
||||
filtered_obs = _get_or_raise(
|
||||
obs_filters, policy_id)(raw_obs)
|
||||
filtered_obs = _get_or_raise(obs_filters,
|
||||
policy_id)(raw_obs)
|
||||
episode.set_last_observation(agent_id, filtered_obs)
|
||||
to_eval[policy_id].append(
|
||||
PolicyEvalData(
|
||||
env_id, agent_id, filtered_obs,
|
||||
episode.rnn_state_for(agent_id)))
|
||||
PolicyEvalData(env_id, agent_id, filtered_obs,
|
||||
episode.rnn_state_for(agent_id)))
|
||||
|
||||
# Batch eval policy actions if possible
|
||||
if tf_sess:
|
||||
@@ -295,7 +310,8 @@ def _env_runner(
|
||||
policy = _get_or_raise(policies, policy_id)
|
||||
if builder:
|
||||
eval_results[policy_id] = policy.build_compute_actions(
|
||||
builder, [t.obs for t in eval_data], rnn_in,
|
||||
builder, [t.obs for t in eval_data],
|
||||
rnn_in,
|
||||
is_training=True)
|
||||
else:
|
||||
eval_results[policy_id] = policy.compute_actions(
|
||||
@@ -319,7 +335,8 @@ def _env_runner(
|
||||
episode = active_episodes[env_id]
|
||||
episode.set_rnn_state(agent_id, [c[i] for c in rnn_out_cols])
|
||||
episode.set_last_pi_info(
|
||||
agent_id, {k: v[i] for k, v in pi_info_cols.items()})
|
||||
agent_id, {k: v[i]
|
||||
for k, v in pi_info_cols.items()})
|
||||
if env_id in off_policy_actions and \
|
||||
agent_id in off_policy_actions[env_id]:
|
||||
episode.set_last_action(
|
||||
@@ -334,8 +351,7 @@ def _env_runner(
|
||||
|
||||
def _to_column_format(rnn_state_rows):
|
||||
num_cols = len(rnn_state_rows[0])
|
||||
return [
|
||||
[row[i] for row in rnn_state_rows] for i in range(num_cols)]
|
||||
return [[row[i] for row in rnn_state_rows] for i in range(num_cols)]
|
||||
|
||||
|
||||
def _get_or_raise(mapping, policy_id):
|
||||
@@ -363,8 +379,8 @@ class _MultiAgentEpisode(object):
|
||||
def add_agent_rewards(self, reward_dict):
|
||||
for agent_id, reward in reward_dict.items():
|
||||
if reward is not None:
|
||||
self.agent_rewards[
|
||||
agent_id, self.policy_for(agent_id)] += reward
|
||||
self.agent_rewards[agent_id,
|
||||
self.policy_for(agent_id)] += reward
|
||||
self.total_reward += reward
|
||||
|
||||
def policy_for(self, agent_id):
|
||||
|
||||
@@ -35,10 +35,18 @@ class TFPolicyGraph(PolicyGraph):
|
||||
SampleBatch({"action": ..., "advantages": ..., ...})
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, observation_space, action_space, sess, obs_input,
|
||||
action_sampler, loss, loss_inputs, state_inputs=None,
|
||||
state_outputs=None, seq_lens=None, max_seq_len=20):
|
||||
def __init__(self,
|
||||
observation_space,
|
||||
action_space,
|
||||
sess,
|
||||
obs_input,
|
||||
action_sampler,
|
||||
loss,
|
||||
loss_inputs,
|
||||
state_inputs=None,
|
||||
state_outputs=None,
|
||||
seq_lens=None,
|
||||
max_seq_len=20):
|
||||
"""Initialize the policy graph.
|
||||
|
||||
Arguments:
|
||||
@@ -78,9 +86,9 @@ class TFPolicyGraph(PolicyGraph):
|
||||
self._seq_lens = seq_lens
|
||||
self._max_seq_len = max_seq_len
|
||||
self._optimizer = self.optimizer()
|
||||
self._grads_and_vars = [
|
||||
(g, v) for (g, v) in self.gradients(self._optimizer)
|
||||
if g is not None]
|
||||
self._grads_and_vars = [(g, v)
|
||||
for (g, v) in self.gradients(self._optimizer)
|
||||
if g is not None]
|
||||
self._grads = [g for (g, v) in self._grads_and_vars]
|
||||
self._apply_op = self._optimizer.apply_gradients(self._grads_and_vars)
|
||||
self._variables = ray.experimental.TensorFlowVariables(
|
||||
@@ -92,8 +100,11 @@ class TFPolicyGraph(PolicyGraph):
|
||||
if self._state_inputs:
|
||||
assert self._seq_lens is not None
|
||||
|
||||
def build_compute_actions(
|
||||
self, builder, obs_batch, state_batches=None, is_training=False):
|
||||
def build_compute_actions(self,
|
||||
builder,
|
||||
obs_batch,
|
||||
state_batches=None,
|
||||
is_training=False):
|
||||
state_batches = state_batches or []
|
||||
assert len(self._state_inputs) == len(state_batches), \
|
||||
(self._state_inputs, state_batches)
|
||||
@@ -103,16 +114,15 @@ class TFPolicyGraph(PolicyGraph):
|
||||
builder.add_feed_dict({self._seq_lens: np.ones(len(obs_batch))})
|
||||
builder.add_feed_dict({self._is_training: is_training})
|
||||
builder.add_feed_dict(dict(zip(self._state_inputs, state_batches)))
|
||||
fetches = builder.add_fetches(
|
||||
[self._sampler] + self._state_outputs +
|
||||
[self.extra_compute_action_fetches()])
|
||||
fetches = builder.add_fetches([self._sampler] + self._state_outputs +
|
||||
[self.extra_compute_action_fetches()])
|
||||
return fetches[0], fetches[1:-1], fetches[-1]
|
||||
|
||||
def compute_actions(
|
||||
self, obs_batch, state_batches=None, is_training=False):
|
||||
def compute_actions(self, obs_batch, state_batches=None,
|
||||
is_training=False):
|
||||
builder = TFRunBuilder(self._sess, "compute_actions")
|
||||
fetches = self.build_compute_actions(
|
||||
builder, obs_batch, state_batches, is_training)
|
||||
fetches = self.build_compute_actions(builder, obs_batch, state_batches,
|
||||
is_training)
|
||||
return builder.get(fetches)
|
||||
|
||||
def _get_loss_inputs_dict(self, batch):
|
||||
@@ -127,12 +137,11 @@ class TFPolicyGraph(PolicyGraph):
|
||||
# RNN case
|
||||
feature_keys = [k for k, v in self._loss_inputs]
|
||||
state_keys = [
|
||||
"state_in_{}".format(i) for i in range(len(self._state_inputs))]
|
||||
"state_in_{}".format(i) for i in range(len(self._state_inputs))
|
||||
]
|
||||
feature_sequences, initial_states, seq_lens = chop_into_sequences(
|
||||
batch["t"],
|
||||
[batch[k] for k in feature_keys],
|
||||
[batch[k] for k in state_keys],
|
||||
self._max_seq_len)
|
||||
batch["t"], [batch[k] for k in feature_keys],
|
||||
[batch[k] for k in state_keys], self._max_seq_len)
|
||||
for k, v in zip(feature_keys, feature_sequences):
|
||||
feed_dict[self._loss_input_dict[k]] = v
|
||||
for k, v in zip(state_keys, initial_states):
|
||||
@@ -172,9 +181,11 @@ class TFPolicyGraph(PolicyGraph):
|
||||
builder.add_feed_dict(self.extra_apply_grad_feed_dict())
|
||||
builder.add_feed_dict(self._get_loss_inputs_dict(postprocessed_batch))
|
||||
builder.add_feed_dict({self._is_training: True})
|
||||
fetches = builder.add_fetches(
|
||||
[self._apply_op, self.extra_compute_grad_fetches(),
|
||||
self.extra_apply_grad_fetches()])
|
||||
fetches = builder.add_fetches([
|
||||
self._apply_op,
|
||||
self.extra_compute_grad_fetches(),
|
||||
self.extra_apply_grad_fetches()
|
||||
])
|
||||
return fetches[1], fetches[2]
|
||||
|
||||
def compute_apply(self, postprocessed_batch):
|
||||
|
||||
@@ -27,8 +27,8 @@ class TorchPolicyGraph(PolicyGraph):
|
||||
This is necessary when using the async sampler.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, observation_space, action_space, model, loss, loss_inputs):
|
||||
def __init__(self, observation_space, action_space, model, loss,
|
||||
loss_inputs):
|
||||
"""Build a policy graph from policy and loss torch modules.
|
||||
|
||||
Note that module inputs will be CPU tensors. The model and loss modules
|
||||
@@ -67,8 +67,8 @@ class TorchPolicyGraph(PolicyGraph):
|
||||
"""Custom PyTorch optimizer to use."""
|
||||
return torch.optim.Adam(self._model.parameters())
|
||||
|
||||
def compute_actions(
|
||||
self, obs_batch, state_batches=None, is_training=False):
|
||||
def compute_actions(self, obs_batch, state_batches=None,
|
||||
is_training=False):
|
||||
if state_batches:
|
||||
raise NotImplementedError("Torch RNN support")
|
||||
with self.lock:
|
||||
|
||||
@@ -20,13 +20,12 @@ def pass_params_to_gym(env_name):
|
||||
global env_version_num
|
||||
|
||||
register(
|
||||
id=env_name,
|
||||
entry_point=(
|
||||
"ray.rllib.examples.legacy_multiagent.multiagent_mountaincar_env:"
|
||||
"MultiAgentMountainCarEnv"),
|
||||
max_episode_steps=200,
|
||||
kwargs={}
|
||||
)
|
||||
id=env_name,
|
||||
entry_point=(
|
||||
"ray.rllib.examples.legacy_multiagent.multiagent_mountaincar_env:"
|
||||
"MultiAgentMountainCarEnv"),
|
||||
max_episode_steps=200,
|
||||
kwargs={})
|
||||
|
||||
|
||||
def create_env(env_config):
|
||||
@@ -48,10 +47,12 @@ if __name__ == '__main__':
|
||||
config["horizon"] = horizon
|
||||
config["use_gae"] = False
|
||||
config["model"].update({"fcnet_hiddens": [256, 256]})
|
||||
options = {"multiagent_obs_shapes": [2, 2],
|
||||
"multiagent_act_shapes": [1, 1],
|
||||
"multiagent_shared_model": False,
|
||||
"multiagent_fcnet_hiddens": [[32, 32]] * 2}
|
||||
options = {
|
||||
"multiagent_obs_shapes": [2, 2],
|
||||
"multiagent_act_shapes": [1, 1],
|
||||
"multiagent_shared_model": False,
|
||||
"multiagent_fcnet_hiddens": [[32, 32]] * 2
|
||||
}
|
||||
config["model"].update({"custom_options": options})
|
||||
alg = ppo.PPOAgent(env=env_name, config=config)
|
||||
for i in range(1):
|
||||
|
||||
@@ -2,7 +2,6 @@ from math import cos
|
||||
from gym.spaces import Box, Tuple, Discrete
|
||||
import numpy as np
|
||||
from gym.envs.classic_control.mountain_car import MountainCarEnv
|
||||
|
||||
"""
|
||||
Multiagent mountain car that sums and then
|
||||
averages its actions to produce the velocity
|
||||
@@ -22,8 +21,8 @@ class MultiAgentMountainCarEnv(MountainCarEnv):
|
||||
self.viewer = None
|
||||
|
||||
self.action_space = [Discrete(3) for _ in range(2)]
|
||||
self.observation_space = Tuple([
|
||||
Box(self.low, self.high, dtype=np.float32) for _ in range(2)])
|
||||
self.observation_space = Tuple(
|
||||
[Box(self.low, self.high, dtype=np.float32) for _ in range(2)])
|
||||
|
||||
self.seed()
|
||||
self.reset()
|
||||
|
||||
@@ -20,13 +20,12 @@ def pass_params_to_gym(env_name):
|
||||
global env_version_num
|
||||
|
||||
register(
|
||||
id=env_name,
|
||||
entry_point=(
|
||||
"ray.rllib.examples.legacy_multiagent.multiagent_pendulum_env:"
|
||||
"MultiAgentPendulumEnv"),
|
||||
max_episode_steps=100,
|
||||
kwargs={}
|
||||
)
|
||||
id=env_name,
|
||||
entry_point=(
|
||||
"ray.rllib.examples.legacy_multiagent.multiagent_pendulum_env:"
|
||||
"MultiAgentPendulumEnv"),
|
||||
max_episode_steps=100,
|
||||
kwargs={})
|
||||
|
||||
|
||||
def create_env(env_config):
|
||||
@@ -49,10 +48,12 @@ if __name__ == '__main__':
|
||||
config["horizon"] = horizon
|
||||
config["use_gae"] = True
|
||||
config["model"].update({"fcnet_hiddens": [256, 256]})
|
||||
options = {"multiagent_obs_shapes": [3, 3],
|
||||
"multiagent_act_shapes": [1, 1],
|
||||
"multiagent_shared_model": True,
|
||||
"multiagent_fcnet_hiddens": [[32, 32]] * 2}
|
||||
options = {
|
||||
"multiagent_obs_shapes": [3, 3],
|
||||
"multiagent_act_shapes": [1, 1],
|
||||
"multiagent_shared_model": True,
|
||||
"multiagent_fcnet_hiddens": [[32, 32]] * 2
|
||||
}
|
||||
config["model"].update({"custom_options": options})
|
||||
alg = ppo.PPOAgent(env=env_name, config=config)
|
||||
for i in range(1):
|
||||
|
||||
@@ -2,7 +2,6 @@ from gym.spaces import Box, Tuple
|
||||
from gym.utils import seeding
|
||||
from gym.envs.classic_control.pendulum import PendulumEnv
|
||||
import numpy as np
|
||||
|
||||
"""
|
||||
Multiagent pendulum that sums its torques to generate an action
|
||||
"""
|
||||
@@ -10,8 +9,8 @@ import numpy as np
|
||||
|
||||
class MultiAgentPendulumEnv(PendulumEnv):
|
||||
metadata = {
|
||||
'render.modes': ['human', 'rgb_array'],
|
||||
'video.frames_per_second': 30
|
||||
'render.modes': ['human', 'rgb_array'],
|
||||
'video.frames_per_second': 30
|
||||
}
|
||||
|
||||
def __init__(self):
|
||||
@@ -21,13 +20,14 @@ class MultiAgentPendulumEnv(PendulumEnv):
|
||||
self.viewer = None
|
||||
|
||||
high = np.array([1., 1., self.max_speed])
|
||||
self.action_space = [Box(low=-self.max_torque / 2,
|
||||
high=self.max_torque / 2,
|
||||
shape=(1,),
|
||||
dtype=np.float32)
|
||||
for _ in range(2)]
|
||||
self.observation_space = Tuple([
|
||||
Box(low=-high, high=high, dtype=np.float32) for _ in range(2)])
|
||||
self.action_space = [
|
||||
Box(low=-self.max_torque / 2,
|
||||
high=self.max_torque / 2,
|
||||
shape=(1, ),
|
||||
dtype=np.float32) for _ in range(2)
|
||||
]
|
||||
self.observation_space = Tuple(
|
||||
[Box(low=-high, high=high, dtype=np.float32) for _ in range(2)])
|
||||
|
||||
self.seed()
|
||||
|
||||
@@ -49,8 +49,8 @@ class MultiAgentPendulumEnv(PendulumEnv):
|
||||
costs = self.angle_normalize(th) ** 2 + .1 * thdot ** 2 + \
|
||||
.001 * (summed_u ** 2)
|
||||
|
||||
newthdot = thdot + (-3 * g / (2 * length) * np.sin(th + np.pi) +
|
||||
3. / (m * length ** 2) * summed_u) * dt
|
||||
newthdot = thdot + (-3 * g / (2 * length) * np.sin(th + np.pi) + 3. /
|
||||
(m * length**2) * summed_u) * dt
|
||||
newth = th + newthdot * dt
|
||||
newthdot = np.clip(newthdot, -self.max_speed, self.max_speed)
|
||||
|
||||
@@ -65,8 +65,10 @@ class MultiAgentPendulumEnv(PendulumEnv):
|
||||
|
||||
def _get_obs(self):
|
||||
theta, thetadot = self.state
|
||||
return [np.array([np.cos(theta), np.sin(theta), thetadot])
|
||||
for _ in range(2)]
|
||||
return [
|
||||
np.array([np.cos(theta), np.sin(theta), thetadot])
|
||||
for _ in range(2)
|
||||
]
|
||||
|
||||
def angle_normalize(self, x):
|
||||
return (((x + np.pi) % (2 * np.pi)) - np.pi)
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
"""Simple example of setting up a multi-agent policy mapping.
|
||||
|
||||
Control the number of agents and policies via --num-agents and --num-policies.
|
||||
@@ -24,14 +23,12 @@ from ray.rllib.test.test_multi_agent_env import MultiCartpole
|
||||
from ray.tune.logger import pretty_print
|
||||
from ray.tune.registry import register_env
|
||||
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument("--num-agents", type=int, default=4)
|
||||
parser.add_argument("--num-policies", type=int, default=2)
|
||||
parser.add_argument("--num-iters", type=int, default=20)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
ray.init()
|
||||
@@ -51,7 +48,8 @@ if __name__ == "__main__":
|
||||
|
||||
# Setup PG with an ensemble of `num_policies` different policy graphs
|
||||
policy_graphs = {
|
||||
"policy_{}".format(i): gen_policy() for i in range(args.num_policies)
|
||||
"policy_{}".format(i): gen_policy()
|
||||
for i in range(args.num_policies)
|
||||
}
|
||||
policy_ids = list(policy_graphs.keys())
|
||||
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
"""Example of querying a policy server. Copy this file for your use case.
|
||||
|
||||
To try this out, in two separate shells run:
|
||||
@@ -14,18 +13,19 @@ import gym
|
||||
|
||||
from ray.rllib.utils.policy_client import PolicyClient
|
||||
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--no-train", action="store_true", help="Whether to disable training.")
|
||||
parser.add_argument(
|
||||
"--off-policy", action="store_true",
|
||||
"--off-policy",
|
||||
action="store_true",
|
||||
help="Whether to take random instead of on-policy actions.")
|
||||
parser.add_argument(
|
||||
"--stop-at-reward", type=int, default=9999,
|
||||
"--stop-at-reward",
|
||||
type=int,
|
||||
default=9999,
|
||||
help="Stop once the specified reward is reached.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
env = gym.make("CartPole-v0")
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
"""Example of running a policy server. Copy this file for your use case.
|
||||
|
||||
To try this out, in two separate shells run:
|
||||
@@ -26,12 +25,12 @@ CHECKPOINT_FILE = "last_checkpoint.out"
|
||||
|
||||
class CartpoleServing(ServingEnv):
|
||||
def __init__(self):
|
||||
ServingEnv.__init__(
|
||||
self, spaces.Discrete(2), spaces.Box(low=-10, high=10, shape=(4,)))
|
||||
ServingEnv.__init__(self, spaces.Discrete(2),
|
||||
spaces.Box(low=-10, high=10, shape=(4, )))
|
||||
|
||||
def run(self):
|
||||
print("Starting policy server at {}:{}".format(
|
||||
SERVER_ADDRESS, SERVER_PORT))
|
||||
print("Starting policy server at {}:{}".format(SERVER_ADDRESS,
|
||||
SERVER_PORT))
|
||||
server = PolicyServer(self, SERVER_ADDRESS, SERVER_PORT)
|
||||
server.serve_forever()
|
||||
|
||||
@@ -42,14 +41,16 @@ if __name__ == "__main__":
|
||||
|
||||
# We use DQN since it supports off-policy actions, but you can choose and
|
||||
# configure any agent.
|
||||
dqn = DQNAgent(env="srv", config={
|
||||
# Use a single process to avoid needing to set up a load balancer
|
||||
"num_workers": 0,
|
||||
# Configure the agent to run short iterations for debugging
|
||||
"exploration_fraction": 0.01,
|
||||
"learning_starts": 100,
|
||||
"timesteps_per_iteration": 200,
|
||||
})
|
||||
dqn = DQNAgent(
|
||||
env="srv",
|
||||
config={
|
||||
# Use a single process to avoid needing to set up a load balancer
|
||||
"num_workers": 0,
|
||||
# Configure the agent to run short iterations for debugging
|
||||
"exploration_fraction": 0.01,
|
||||
"learning_starts": 100,
|
||||
"timesteps_per_iteration": 200,
|
||||
})
|
||||
|
||||
# Attempt to restore from checkpoint if possible.
|
||||
if os.path.exists(CHECKPOINT_FILE):
|
||||
|
||||
@@ -6,7 +6,7 @@ from ray.rllib.models.preprocessors import Preprocessor
|
||||
from ray.rllib.models.fcnet import FullyConnectedNetwork
|
||||
from ray.rllib.models.lstm import LSTM
|
||||
|
||||
|
||||
__all__ = ["ActionDistribution", "Categorical",
|
||||
"DiagGaussian", "Deterministic", "ModelCatalog", "Model",
|
||||
"Preprocessor", "FullyConnectedNetwork", "LSTM"]
|
||||
__all__ = [
|
||||
"ActionDistribution", "Categorical", "DiagGaussian", "Deterministic",
|
||||
"ModelCatalog", "Model", "Preprocessor", "FullyConnectedNetwork", "LSTM"
|
||||
]
|
||||
|
||||
@@ -42,25 +42,25 @@ class Categorical(ActionDistribution):
|
||||
logits=self.inputs, labels=x)
|
||||
|
||||
def entropy(self):
|
||||
a0 = self.inputs - tf.reduce_max(self.inputs, reduction_indices=[1],
|
||||
keepdims=True)
|
||||
a0 = self.inputs - tf.reduce_max(
|
||||
self.inputs, reduction_indices=[1], keepdims=True)
|
||||
ea0 = tf.exp(a0)
|
||||
z0 = tf.reduce_sum(ea0, reduction_indices=[1], keepdims=True)
|
||||
p0 = ea0 / z0
|
||||
return tf.reduce_sum(p0 * (tf.log(z0) - a0), reduction_indices=[1])
|
||||
|
||||
def kl(self, other):
|
||||
a0 = self.inputs - tf.reduce_max(self.inputs, reduction_indices=[1],
|
||||
keepdims=True)
|
||||
a1 = other.inputs - tf.reduce_max(other.inputs, reduction_indices=[1],
|
||||
keepdims=True)
|
||||
a0 = self.inputs - tf.reduce_max(
|
||||
self.inputs, reduction_indices=[1], keepdims=True)
|
||||
a1 = other.inputs - tf.reduce_max(
|
||||
other.inputs, reduction_indices=[1], keepdims=True)
|
||||
ea0 = tf.exp(a0)
|
||||
ea1 = tf.exp(a1)
|
||||
z0 = tf.reduce_sum(ea0, reduction_indices=[1], keepdims=True)
|
||||
z1 = tf.reduce_sum(ea1, reduction_indices=[1], keepdims=True)
|
||||
p0 = ea0 / z0
|
||||
return tf.reduce_sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)),
|
||||
reduction_indices=[1])
|
||||
return tf.reduce_sum(
|
||||
p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), reduction_indices=[1])
|
||||
|
||||
def sample(self):
|
||||
return tf.squeeze(tf.multinomial(self.inputs, 1), axis=1)
|
||||
@@ -90,22 +90,23 @@ class DiagGaussian(ActionDistribution):
|
||||
self.std = tf.exp(log_std)
|
||||
|
||||
def logp(self, x):
|
||||
return (-0.5 * tf.reduce_sum(tf.square((x - self.mean) / self.std),
|
||||
reduction_indices=[1]) -
|
||||
return (-0.5 * tf.reduce_sum(
|
||||
tf.square((x - self.mean) / self.std), reduction_indices=[1]) -
|
||||
0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[1]) -
|
||||
tf.reduce_sum(self.log_std, reduction_indices=[1]))
|
||||
|
||||
def kl(self, other):
|
||||
assert isinstance(other, DiagGaussian)
|
||||
return tf.reduce_sum(other.log_std - self.log_std +
|
||||
(tf.square(self.std) +
|
||||
tf.square(self.mean - other.mean)) /
|
||||
(2.0 * tf.square(other.std)) - 0.5,
|
||||
reduction_indices=[1])
|
||||
return tf.reduce_sum(
|
||||
other.log_std - self.log_std +
|
||||
(tf.square(self.std) + tf.square(self.mean - other.mean)) /
|
||||
(2.0 * tf.square(other.std)) - 0.5,
|
||||
reduction_indices=[1])
|
||||
|
||||
def entropy(self):
|
||||
return tf.reduce_sum(self.log_std + .5 * np.log(2.0 * np.pi * np.e),
|
||||
reduction_indices=[1])
|
||||
return tf.reduce_sum(
|
||||
self.log_std + .5 * np.log(2.0 * np.pi * np.e),
|
||||
reduction_indices=[1])
|
||||
|
||||
def sample(self):
|
||||
out = self.mean + self.std * tf.random_normal(tf.shape(self.mean))
|
||||
@@ -158,6 +159,7 @@ class MultiActionDistribution(ActionDistribution):
|
||||
Args:
|
||||
inputs (Tensor list): A list of tensors from which to compute samples.
|
||||
"""
|
||||
|
||||
def __init__(self, inputs, action_space, child_distributions):
|
||||
# you actually have to instantiate the child distributions
|
||||
self.reshaper = Reshaper(action_space.spaces)
|
||||
@@ -174,23 +176,25 @@ class MultiActionDistribution(ActionDistribution):
|
||||
# Remove extra categorical dimension
|
||||
if isinstance(distribution, Categorical):
|
||||
split_list[i] = tf.squeeze(split_list[i], axis=-1)
|
||||
log_list = np.asarray([distribution.logp(split_x) for
|
||||
distribution, split_x in
|
||||
zip(self.child_distributions, split_list)])
|
||||
log_list = np.asarray([
|
||||
distribution.logp(split_x) for distribution, split_x in zip(
|
||||
self.child_distributions, split_list)
|
||||
])
|
||||
return np.sum(log_list)
|
||||
|
||||
def kl(self, other):
|
||||
"""The KL-divergence between two action distributions."""
|
||||
kl_list = np.asarray([distribution.kl(other_distribution) for
|
||||
distribution, other_distribution in
|
||||
zip(self.child_distributions,
|
||||
other.child_distributions)])
|
||||
kl_list = np.asarray([
|
||||
distribution.kl(other_distribution)
|
||||
for distribution, other_distribution in zip(
|
||||
self.child_distributions, other.child_distributions)
|
||||
])
|
||||
return np.sum(kl_list)
|
||||
|
||||
def entropy(self):
|
||||
"""The entropy of the action distribution."""
|
||||
entropy_list = np.array([s.entropy() for s in
|
||||
self.child_distributions])
|
||||
entropy_list = np.array(
|
||||
[s.entropy() for s in self.child_distributions])
|
||||
return np.sum(entropy_list)
|
||||
|
||||
def sample(self):
|
||||
|
||||
@@ -19,7 +19,6 @@ from ray.rllib.models.visionnet import VisionNetwork
|
||||
from ray.rllib.models.lstm import LSTM
|
||||
from ray.rllib.models.multiagentfcnet import MultiAgentFullyConnectedNetwork
|
||||
|
||||
|
||||
MODEL_CONFIGS = [
|
||||
# === Built-in options ===
|
||||
"conv_filters", # Filter configuration
|
||||
@@ -30,11 +29,9 @@ MODEL_CONFIGS = [
|
||||
"grayscale", # Converts ATARI frame to 1 Channel Grayscale image
|
||||
"zero_mean", # Changes frame to range from [-1, 1] if true
|
||||
"extra_frameskip", # (int) for number of frames to skip
|
||||
|
||||
"free_log_std", # Documented in ray.rllib.models.Model
|
||||
"channel_major", # Pytorch conv requires images to be channel-major
|
||||
"squash_to_range", # Whether to squash the action output to space range
|
||||
|
||||
"use_lstm", # Whether to wrap the model with a LSTM
|
||||
"max_seq_len", # Max seq len for training the LSTM, defaults to 20
|
||||
"lstm_cell_size", # Size of the LSTM cell
|
||||
@@ -81,8 +78,8 @@ class ModelCatalog(object):
|
||||
if dist_type is None:
|
||||
dist = DiagGaussian
|
||||
if config.get("squash_to_range"):
|
||||
dist = squash_to_range(
|
||||
dist, action_space.low, action_space.high)
|
||||
dist = squash_to_range(dist, action_space.low,
|
||||
action_space.high)
|
||||
return dist, action_space.shape[0] * 2
|
||||
elif dist_type == 'deterministic':
|
||||
return Deterministic, action_space.shape[0]
|
||||
@@ -95,12 +92,13 @@ class ModelCatalog(object):
|
||||
dist, action_size = ModelCatalog.get_action_dist(action)
|
||||
child_dist.append(dist)
|
||||
size += action_size
|
||||
return partial(MultiActionDistribution,
|
||||
child_distributions=child_dist,
|
||||
action_space=action_space), size
|
||||
return partial(
|
||||
MultiActionDistribution,
|
||||
child_distributions=child_dist,
|
||||
action_space=action_space), size
|
||||
|
||||
raise NotImplementedError(
|
||||
"Unsupported args: {} {}".format(action_space, dist_type))
|
||||
raise NotImplementedError("Unsupported args: {} {}".format(
|
||||
action_space, dist_type))
|
||||
|
||||
@staticmethod
|
||||
def get_action_placeholder(action_space):
|
||||
@@ -120,7 +118,7 @@ class ModelCatalog(object):
|
||||
return tf.placeholder(
|
||||
tf.float32, shape=(None, action_space.shape[0]), name="action")
|
||||
elif isinstance(action_space, gym.spaces.Discrete):
|
||||
return tf.placeholder(tf.int64, shape=(None,), name="action")
|
||||
return tf.placeholder(tf.int64, shape=(None, ), name="action")
|
||||
elif isinstance(action_space, gym.spaces.Tuple):
|
||||
size = 0
|
||||
all_discrete = True
|
||||
@@ -131,15 +129,19 @@ class ModelCatalog(object):
|
||||
all_discrete = False
|
||||
size += np.product(action_space.spaces[i].shape)
|
||||
return tf.placeholder(
|
||||
tf.int64 if all_discrete else tf.float32, shape=(None, size),
|
||||
tf.int64 if all_discrete else tf.float32,
|
||||
shape=(None, size),
|
||||
name="action")
|
||||
else:
|
||||
raise NotImplementedError("action space {}"
|
||||
" not supported".format(action_space))
|
||||
|
||||
@staticmethod
|
||||
def get_model(
|
||||
inputs, num_outputs, options=None, state_in=None, seq_lens=None):
|
||||
def get_model(inputs,
|
||||
num_outputs,
|
||||
options=None,
|
||||
state_in=None,
|
||||
seq_lens=None):
|
||||
"""Returns a suitable model conforming to given input and output specs.
|
||||
|
||||
Args:
|
||||
@@ -154,12 +156,12 @@ class ModelCatalog(object):
|
||||
"""
|
||||
|
||||
options = options or {}
|
||||
model = ModelCatalog._get_model(
|
||||
inputs, num_outputs, options, state_in, seq_lens)
|
||||
model = ModelCatalog._get_model(inputs, num_outputs, options, state_in,
|
||||
seq_lens)
|
||||
|
||||
if options.get("use_lstm"):
|
||||
model = LSTM(
|
||||
model.last_layer, num_outputs, options, state_in, seq_lens)
|
||||
model = LSTM(model.last_layer, num_outputs, options, state_in,
|
||||
seq_lens)
|
||||
|
||||
return model
|
||||
|
||||
@@ -169,16 +171,20 @@ class ModelCatalog(object):
|
||||
model = options["custom_model"]
|
||||
print("Using custom model {}".format(model))
|
||||
return _global_registry.get(RLLIB_MODEL, model)(
|
||||
inputs, num_outputs, options,
|
||||
state_in=state_in, seq_lens=seq_lens)
|
||||
inputs,
|
||||
num_outputs,
|
||||
options,
|
||||
state_in=state_in,
|
||||
seq_lens=seq_lens)
|
||||
|
||||
obs_rank = len(inputs.shape) - 1
|
||||
|
||||
# num_outputs > 1 used to avoid hitting this with the value function
|
||||
if isinstance(options.get("custom_options", {}).get(
|
||||
"multiagent_fcnet_hiddens", 1), list) and num_outputs > 1:
|
||||
return MultiAgentFullyConnectedNetwork(
|
||||
inputs, num_outputs, options)
|
||||
if isinstance(
|
||||
options.get("custom_options", {}).get(
|
||||
"multiagent_fcnet_hiddens", 1), list) and num_outputs > 1:
|
||||
return MultiAgentFullyConnectedNetwork(inputs, num_outputs,
|
||||
options)
|
||||
|
||||
if obs_rank > 1:
|
||||
return VisionNetwork(inputs, num_outputs, options)
|
||||
@@ -198,10 +204,10 @@ class ModelCatalog(object):
|
||||
Returns:
|
||||
model (Model): Neural network model.
|
||||
"""
|
||||
from ray.rllib.models.pytorch.fcnet import (
|
||||
FullyConnectedNetwork as PyTorchFCNet)
|
||||
from ray.rllib.models.pytorch.visionnet import (
|
||||
VisionNetwork as PyTorchVisionNet)
|
||||
from ray.rllib.models.pytorch.fcnet import (FullyConnectedNetwork as
|
||||
PyTorchFCNet)
|
||||
from ray.rllib.models.pytorch.visionnet import (VisionNetwork as
|
||||
PyTorchVisionNet)
|
||||
|
||||
if "custom_model" in options:
|
||||
model = options["custom_model"]
|
||||
@@ -232,9 +238,8 @@ class ModelCatalog(object):
|
||||
"""
|
||||
for k in options.keys():
|
||||
if k not in MODEL_CONFIGS:
|
||||
raise Exception(
|
||||
"Unknown config key `{}`, all keys: {}".format(
|
||||
k, MODEL_CONFIGS))
|
||||
raise Exception("Unknown config key `{}`, all keys: {}".format(
|
||||
k, MODEL_CONFIGS))
|
||||
|
||||
if "custom_preprocessor" in options:
|
||||
preprocessor = options["custom_preprocessor"]
|
||||
@@ -271,8 +276,8 @@ class ModelCatalog(object):
|
||||
preprocessor_name (str): Name to register the preprocessor under.
|
||||
preprocessor_class (type): Python class of the preprocessor.
|
||||
"""
|
||||
_global_registry.register(
|
||||
RLLIB_PREPROCESSOR, preprocessor_name, preprocessor_class)
|
||||
_global_registry.register(RLLIB_PREPROCESSOR, preprocessor_name,
|
||||
preprocessor_class)
|
||||
|
||||
@staticmethod
|
||||
def register_custom_model(model_name, model_class):
|
||||
|
||||
@@ -22,14 +22,17 @@ class FullyConnectedNetwork(Model):
|
||||
for size in hiddens:
|
||||
label = "fc{}".format(i)
|
||||
last_layer = slim.fully_connected(
|
||||
last_layer, size,
|
||||
last_layer,
|
||||
size,
|
||||
weights_initializer=normc_initializer(1.0),
|
||||
activation_fn=activation,
|
||||
scope=label)
|
||||
i += 1
|
||||
label = "fc_out"
|
||||
output = slim.fully_connected(
|
||||
last_layer, num_outputs,
|
||||
last_layer,
|
||||
num_outputs,
|
||||
weights_initializer=normc_initializer(0.01),
|
||||
activation_fn=None, scope=label)
|
||||
activation_fn=None,
|
||||
scope=label)
|
||||
return output, last_layer
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
"""LSTM support for RLlib.
|
||||
|
||||
The main trick here is that we add the time dimension at the last moment.
|
||||
@@ -14,7 +13,6 @@ See the add_time_dimension() and chop_into_sequences() functions below for
|
||||
more info.
|
||||
"""
|
||||
|
||||
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
import tensorflow.contrib.rnn as rnn
|
||||
@@ -46,14 +44,13 @@ def add_time_dimension(padded_inputs, seq_lens):
|
||||
|
||||
# Dynamically reshape the padded batch to introduce a time dimension.
|
||||
new_batch_size = padded_batch_size // max_seq_len
|
||||
new_shape = (
|
||||
[new_batch_size, max_seq_len] +
|
||||
padded_inputs.get_shape().as_list()[1:])
|
||||
new_shape = ([new_batch_size, max_seq_len] +
|
||||
padded_inputs.get_shape().as_list()[1:])
|
||||
return tf.reshape(padded_inputs, new_shape)
|
||||
|
||||
|
||||
def chop_into_sequences(
|
||||
time_column, feature_columns, state_columns, max_seq_len):
|
||||
def chop_into_sequences(time_column, feature_columns, state_columns,
|
||||
max_seq_len):
|
||||
"""Truncate and pad experiences into fixed-length sequences.
|
||||
|
||||
Arguments:
|
||||
@@ -106,7 +103,7 @@ def chop_into_sequences(
|
||||
feature_sequences = []
|
||||
for f in feature_columns:
|
||||
f = np.array(f)
|
||||
f_pad = np.zeros((len(seq_lens) * max_seq_len,) + np.shape(f)[1:])
|
||||
f_pad = np.zeros((len(seq_lens) * max_seq_len, ) + np.shape(f)[1:])
|
||||
seq_base = 0
|
||||
i = 0
|
||||
for l in seq_lens:
|
||||
@@ -152,7 +149,8 @@ class LSTM(Model):
|
||||
lstm = rnn.rnn_cell.BasicLSTMCell(cell_size, state_is_tuple=True)
|
||||
self.state_init = [
|
||||
np.zeros(lstm.state_size.c, np.float32),
|
||||
np.zeros(lstm.state_size.h, np.float32)]
|
||||
np.zeros(lstm.state_size.h, np.float32)
|
||||
]
|
||||
|
||||
# Setup LSTM inputs
|
||||
if self.state_in:
|
||||
@@ -170,12 +168,15 @@ class LSTM(Model):
|
||||
else:
|
||||
state_in = rnn.rnn_cell.LSTMStateTuple(c_in, h_in)
|
||||
lstm_out, lstm_state = tf.nn.dynamic_rnn(
|
||||
lstm, last_layer, initial_state=state_in,
|
||||
sequence_length=self.seq_lens, time_major=False)
|
||||
lstm,
|
||||
last_layer,
|
||||
initial_state=state_in,
|
||||
sequence_length=self.seq_lens,
|
||||
time_major=False)
|
||||
self.state_out = list(lstm_state)
|
||||
|
||||
# Compute outputs
|
||||
last_layer = tf.reshape(lstm_out, [-1, cell_size])
|
||||
logits = linear(
|
||||
last_layer, num_outputs, "action", normc_initializer(0.01))
|
||||
logits = linear(last_layer, num_outputs, "action",
|
||||
normc_initializer(0.01))
|
||||
return logits, last_layer
|
||||
|
||||
@@ -11,6 +11,7 @@ def normc_initializer(std=1.0):
|
||||
out = np.random.randn(*shape).astype(np.float32)
|
||||
out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True))
|
||||
return tf.constant(out)
|
||||
|
||||
return _initializer
|
||||
|
||||
|
||||
@@ -18,12 +19,20 @@ def get_activation_fn(name):
|
||||
return getattr(tf.nn, name)
|
||||
|
||||
|
||||
def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME",
|
||||
dtype=tf.float32, collections=None):
|
||||
def conv2d(x,
|
||||
num_filters,
|
||||
name,
|
||||
filter_size=(3, 3),
|
||||
stride=(1, 1),
|
||||
pad="SAME",
|
||||
dtype=tf.float32,
|
||||
collections=None):
|
||||
with tf.variable_scope(name):
|
||||
stride_shape = [1, stride[0], stride[1], 1]
|
||||
filter_shape = [filter_size[0], filter_size[1], int(x.get_shape()[3]),
|
||||
num_filters]
|
||||
filter_shape = [
|
||||
filter_size[0], filter_size[1],
|
||||
int(x.get_shape()[3]), num_filters
|
||||
]
|
||||
|
||||
# There are "num input feature maps * filter height * filter width"
|
||||
# inputs to each hidden unit.
|
||||
@@ -34,20 +43,24 @@ def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME",
|
||||
# Initialize weights with random weights.
|
||||
w_bound = np.sqrt(6 / (fan_in + fan_out))
|
||||
|
||||
w = tf.get_variable("W", filter_shape, dtype,
|
||||
tf.random_uniform_initializer(-w_bound, w_bound),
|
||||
collections=collections)
|
||||
b = tf.get_variable("b", [1, 1, 1, num_filters],
|
||||
initializer=tf.constant_initializer(0.0),
|
||||
collections=collections)
|
||||
w = tf.get_variable(
|
||||
"W",
|
||||
filter_shape,
|
||||
dtype,
|
||||
tf.random_uniform_initializer(-w_bound, w_bound),
|
||||
collections=collections)
|
||||
b = tf.get_variable(
|
||||
"b", [1, 1, 1, num_filters],
|
||||
initializer=tf.constant_initializer(0.0),
|
||||
collections=collections)
|
||||
return tf.nn.conv2d(x, w, stride_shape, pad) + b
|
||||
|
||||
|
||||
def linear(x, size, name, initializer=None, bias_init=0):
|
||||
w = tf.get_variable(name + "/w", [x.get_shape()[1], size],
|
||||
initializer=initializer)
|
||||
b = tf.get_variable(name + "/b", [size],
|
||||
initializer=tf.constant_initializer(bias_init))
|
||||
w = tf.get_variable(
|
||||
name + "/w", [x.get_shape()[1], size], initializer=initializer)
|
||||
b = tf.get_variable(
|
||||
name + "/b", [size], initializer=tf.constant_initializer(bias_init))
|
||||
return tf.matmul(x, w) + b
|
||||
|
||||
|
||||
|
||||
@@ -37,8 +37,12 @@ class Model(object):
|
||||
a scale parameter (like a standard deviation).
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, inputs, num_outputs, options, state_in=None, seq_lens=None):
|
||||
def __init__(self,
|
||||
inputs,
|
||||
num_outputs,
|
||||
options,
|
||||
state_in=None,
|
||||
seq_lens=None):
|
||||
self.inputs = inputs
|
||||
|
||||
# Default attribute values for the non-RNN case
|
||||
@@ -57,8 +61,10 @@ class Model(object):
|
||||
self.outputs, self.last_layer = self._build_layers(
|
||||
inputs, num_outputs, options)
|
||||
if options.get("free_log_std", False):
|
||||
log_std = tf.get_variable(name="log_std", shape=[num_outputs],
|
||||
initializer=tf.zeros_initializer)
|
||||
log_std = tf.get_variable(
|
||||
name="log_std",
|
||||
shape=[num_outputs],
|
||||
initializer=tf.zeros_initializer)
|
||||
self.outputs = tf.concat(
|
||||
[self.outputs, 0.0 * self.outputs + log_std], 1)
|
||||
|
||||
|
||||
@@ -23,7 +23,7 @@ class MultiAgentFullyConnectedNetwork(Model):
|
||||
|
||||
custom_options = options["custom_options"]
|
||||
hiddens = custom_options.get("multiagent_fcnet_hiddens",
|
||||
[[256, 256]]*1)
|
||||
[[256, 256]] * 1)
|
||||
|
||||
# check for a shared model
|
||||
shared_model = custom_options.get("multiagent_shared_model", 0)
|
||||
@@ -35,8 +35,8 @@ class MultiAgentFullyConnectedNetwork(Model):
|
||||
sub_options = options.copy()
|
||||
sub_options.update({"fcnet_hiddens": hiddens[i]})
|
||||
# TODO(ev) make this support arbitrary networks
|
||||
fcnet = FullyConnectedNetwork(
|
||||
split_inputs[i], int(num_actions[i]), sub_options)
|
||||
fcnet = FullyConnectedNetwork(split_inputs[i],
|
||||
int(num_actions[i]), sub_options)
|
||||
output = fcnet.outputs
|
||||
outputs.append(output)
|
||||
overall_output = tf.concat(outputs, axis=1)
|
||||
|
||||
@@ -6,7 +6,7 @@ import numpy as np
|
||||
import gym
|
||||
|
||||
ATARI_OBS_SHAPE = (210, 160, 3)
|
||||
ATARI_RAM_OBS_SHAPE = (128,)
|
||||
ATARI_RAM_OBS_SHAPE = (128, )
|
||||
|
||||
|
||||
class Preprocessor(object):
|
||||
@@ -70,7 +70,7 @@ class AtariPixelPreprocessor(Preprocessor):
|
||||
|
||||
class AtariRamPreprocessor(Preprocessor):
|
||||
def _init(self):
|
||||
self.shape = (128,)
|
||||
self.shape = (128, )
|
||||
|
||||
def transform(self, observation):
|
||||
return (observation - 128) / 128
|
||||
@@ -78,7 +78,7 @@ class AtariRamPreprocessor(Preprocessor):
|
||||
|
||||
class OneHotPreprocessor(Preprocessor):
|
||||
def _init(self):
|
||||
self.shape = (self._obs_space.n,)
|
||||
self.shape = (self._obs_space.n, )
|
||||
|
||||
def transform(self, observation):
|
||||
arr = np.zeros(self._obs_space.n)
|
||||
@@ -111,13 +111,14 @@ class TupleFlatteningPreprocessor(Preprocessor):
|
||||
preprocessor = get_preprocessor(space)(space, self._options)
|
||||
self.preprocessors.append(preprocessor)
|
||||
size += np.product(preprocessor.shape)
|
||||
self.shape = (size,)
|
||||
self.shape = (size, )
|
||||
|
||||
def transform(self, observation):
|
||||
assert len(observation) == len(self.preprocessors), observation
|
||||
return np.concatenate([
|
||||
np.reshape(p.transform(o), [np.product(p.shape)])
|
||||
for (o, p) in zip(observation, self.preprocessors)])
|
||||
for (o, p) in zip(observation, self.preprocessors)
|
||||
])
|
||||
|
||||
|
||||
def get_preprocessor(space):
|
||||
|
||||
@@ -22,14 +22,27 @@ class VisionNetwork(Model):
|
||||
with tf.name_scope("vision_net"):
|
||||
for i, (out_size, kernel, stride) in enumerate(filters[:-1], 1):
|
||||
inputs = slim.conv2d(
|
||||
inputs, out_size, kernel, stride,
|
||||
activation_fn=activation, scope="conv{}".format(i))
|
||||
inputs,
|
||||
out_size,
|
||||
kernel,
|
||||
stride,
|
||||
activation_fn=activation,
|
||||
scope="conv{}".format(i))
|
||||
out_size, kernel, stride = filters[-1]
|
||||
fc1 = slim.conv2d(
|
||||
inputs, out_size, kernel, stride,
|
||||
activation_fn=activation, padding="VALID", scope="fc1")
|
||||
fc2 = slim.conv2d(fc1, num_outputs, [1, 1], activation_fn=None,
|
||||
normalizer_fn=None, scope="fc2")
|
||||
inputs,
|
||||
out_size,
|
||||
kernel,
|
||||
stride,
|
||||
activation_fn=activation,
|
||||
padding="VALID",
|
||||
scope="fc1")
|
||||
fc2 = slim.conv2d(
|
||||
fc1,
|
||||
num_outputs, [1, 1],
|
||||
activation_fn=None,
|
||||
normalizer_fn=None,
|
||||
scope="fc2")
|
||||
return flatten(fc2), flatten(fc1)
|
||||
|
||||
|
||||
|
||||
@@ -6,7 +6,6 @@ from ray.rllib.optimizers.sync_samples_optimizer import SyncSamplesOptimizer
|
||||
from ray.rllib.optimizers.sync_replay_optimizer import SyncReplayOptimizer
|
||||
from ray.rllib.optimizers.multi_gpu_optimizer import LocalMultiGPUOptimizer
|
||||
|
||||
|
||||
__all__ = [
|
||||
"PolicyOptimizer", "AsyncSamplesOptimizer", "AsyncGradientsOptimizer",
|
||||
"SyncSamplesOptimizer", "SyncReplayOptimizer", "LocalMultiGPUOptimizer"
|
||||
|
||||
@@ -14,6 +14,7 @@ class AsyncGradientsOptimizer(PolicyOptimizer):
|
||||
evaluators, sending updated weights back as needed. This pipelines the
|
||||
gradient computations on the remote workers.
|
||||
"""
|
||||
|
||||
def _init(self, grads_per_step=100):
|
||||
self.apply_timer = TimerStat()
|
||||
self.wait_timer = TimerStat()
|
||||
@@ -55,8 +56,9 @@ class AsyncGradientsOptimizer(PolicyOptimizer):
|
||||
num_gradients += 1
|
||||
|
||||
def stats(self):
|
||||
return dict(PolicyOptimizer.stats(self), **{
|
||||
"wait_time_ms": round(1000 * self.wait_timer.mean, 3),
|
||||
"apply_time_ms": round(1000 * self.apply_timer.mean, 3),
|
||||
"dispatch_time_ms": round(1000 * self.dispatch_timer.mean, 3),
|
||||
})
|
||||
return dict(
|
||||
PolicyOptimizer.stats(self), **{
|
||||
"wait_time_ms": round(1000 * self.wait_timer.mean, 3),
|
||||
"apply_time_ms": round(1000 * self.apply_timer.mean, 3),
|
||||
"dispatch_time_ms": round(1000 * self.dispatch_timer.mean, 3),
|
||||
})
|
||||
|
||||
@@ -22,7 +22,6 @@ from ray.rllib.utils.actors import TaskPool, create_colocated
|
||||
from ray.rllib.utils.timer import TimerStat
|
||||
from ray.rllib.utils.window_stat import WindowStat
|
||||
|
||||
|
||||
SAMPLE_QUEUE_DEPTH = 2
|
||||
REPLAY_QUEUE_DEPTH = 4
|
||||
LEARNER_QUEUE_MAX_SIZE = 16
|
||||
@@ -35,10 +34,10 @@ class ReplayActor(object):
|
||||
Ray actors are single-threaded, so for scalability multiple replay actors
|
||||
may be created to increase parallelism."""
|
||||
|
||||
def __init__(
|
||||
self, num_shards, learning_starts, buffer_size, train_batch_size,
|
||||
prioritized_replay_alpha, prioritized_replay_beta,
|
||||
prioritized_replay_eps, clip_rewards):
|
||||
def __init__(self, num_shards, learning_starts, buffer_size,
|
||||
train_batch_size, prioritized_replay_alpha,
|
||||
prioritized_replay_beta, prioritized_replay_eps,
|
||||
clip_rewards):
|
||||
self.replay_starts = learning_starts // num_shards
|
||||
self.buffer_size = buffer_size // num_shards
|
||||
self.train_batch_size = train_batch_size
|
||||
@@ -46,7 +45,8 @@ class ReplayActor(object):
|
||||
self.prioritized_replay_eps = prioritized_replay_eps
|
||||
|
||||
self.replay_buffer = PrioritizedReplayBuffer(
|
||||
self.buffer_size, alpha=prioritized_replay_alpha,
|
||||
self.buffer_size,
|
||||
alpha=prioritized_replay_alpha,
|
||||
clip_rewards=clip_rewards)
|
||||
|
||||
# Metrics
|
||||
@@ -60,38 +60,39 @@ class ReplayActor(object):
|
||||
def add_batch(self, batch):
|
||||
with self.add_batch_timer:
|
||||
for row in batch.rows():
|
||||
self.replay_buffer.add(
|
||||
row["obs"], row["actions"], row["rewards"], row["new_obs"],
|
||||
row["dones"], row["weights"])
|
||||
self.replay_buffer.add(row["obs"], row["actions"],
|
||||
row["rewards"], row["new_obs"],
|
||||
row["dones"], row["weights"])
|
||||
|
||||
def replay(self):
|
||||
with self.replay_timer:
|
||||
if len(self.replay_buffer) < self.replay_starts:
|
||||
return None
|
||||
|
||||
(obses_t, actions, rewards, obses_tp1,
|
||||
dones, weights, batch_indexes) = self.replay_buffer.sample(
|
||||
self.train_batch_size,
|
||||
beta=self.prioritized_replay_beta)
|
||||
(obses_t, actions, rewards, obses_tp1, dones, weights,
|
||||
batch_indexes) = self.replay_buffer.sample(
|
||||
self.train_batch_size, beta=self.prioritized_replay_beta)
|
||||
|
||||
batch = SampleBatch({
|
||||
"obs": obses_t, "actions": actions, "rewards": rewards,
|
||||
"new_obs": obses_tp1, "dones": dones, "weights": weights,
|
||||
"batch_indexes": batch_indexes})
|
||||
"obs": obses_t,
|
||||
"actions": actions,
|
||||
"rewards": rewards,
|
||||
"new_obs": obses_tp1,
|
||||
"dones": dones,
|
||||
"weights": weights,
|
||||
"batch_indexes": batch_indexes
|
||||
})
|
||||
return batch
|
||||
|
||||
def update_priorities(self, batch_indexes, td_errors):
|
||||
with self.update_priorities_timer:
|
||||
new_priorities = (
|
||||
np.abs(td_errors) + self.prioritized_replay_eps)
|
||||
new_priorities = (np.abs(td_errors) + self.prioritized_replay_eps)
|
||||
self.replay_buffer.update_priorities(batch_indexes, new_priorities)
|
||||
|
||||
def stats(self):
|
||||
stat = {
|
||||
"add_batch_time_ms": round(
|
||||
1000 * self.add_batch_timer.mean, 3),
|
||||
"replay_time_ms": round(
|
||||
1000 * self.replay_timer.mean, 3),
|
||||
"add_batch_time_ms": round(1000 * self.add_batch_timer.mean, 3),
|
||||
"replay_time_ms": round(1000 * self.replay_timer.mean, 3),
|
||||
"update_priorities_time_ms": round(
|
||||
1000 * self.update_priorities_timer.mean, 3),
|
||||
}
|
||||
@@ -145,13 +146,19 @@ class AsyncSamplesOptimizer(PolicyOptimizer):
|
||||
"td_error" array in the info return of compute_gradients(). This error
|
||||
term will be used for sample prioritization."""
|
||||
|
||||
def _init(
|
||||
self, learning_starts=1000, buffer_size=10000,
|
||||
prioritized_replay=True, prioritized_replay_alpha=0.6,
|
||||
prioritized_replay_beta=0.4, prioritized_replay_eps=1e-6,
|
||||
train_batch_size=512, sample_batch_size=50,
|
||||
num_replay_buffer_shards=1, max_weight_sync_delay=400,
|
||||
clip_rewards=True, debug=False):
|
||||
def _init(self,
|
||||
learning_starts=1000,
|
||||
buffer_size=10000,
|
||||
prioritized_replay=True,
|
||||
prioritized_replay_alpha=0.6,
|
||||
prioritized_replay_beta=0.4,
|
||||
prioritized_replay_eps=1e-6,
|
||||
train_batch_size=512,
|
||||
sample_batch_size=50,
|
||||
num_replay_buffer_shards=1,
|
||||
max_weight_sync_delay=400,
|
||||
clip_rewards=True,
|
||||
debug=False):
|
||||
|
||||
self.debug = debug
|
||||
self.replay_starts = learning_starts
|
||||
@@ -164,18 +171,21 @@ class AsyncSamplesOptimizer(PolicyOptimizer):
|
||||
self.learner = LearnerThread(self.local_evaluator)
|
||||
self.learner.start()
|
||||
|
||||
self.replay_actors = create_colocated(
|
||||
ReplayActor,
|
||||
[num_replay_buffer_shards, learning_starts, buffer_size,
|
||||
train_batch_size, prioritized_replay_alpha,
|
||||
prioritized_replay_beta, prioritized_replay_eps, clip_rewards],
|
||||
num_replay_buffer_shards)
|
||||
self.replay_actors = create_colocated(ReplayActor, [
|
||||
num_replay_buffer_shards, learning_starts, buffer_size,
|
||||
train_batch_size, prioritized_replay_alpha,
|
||||
prioritized_replay_beta, prioritized_replay_eps, clip_rewards
|
||||
], num_replay_buffer_shards)
|
||||
assert len(self.remote_evaluators) > 0
|
||||
|
||||
# Stats
|
||||
self.timers = {k: TimerStat() for k in [
|
||||
"put_weights", "get_samples", "enqueue", "sample_processing",
|
||||
"replay_processing", "update_priorities", "train", "sample"]}
|
||||
self.timers = {
|
||||
k: TimerStat()
|
||||
for k in [
|
||||
"put_weights", "get_samples", "enqueue", "sample_processing",
|
||||
"replay_processing", "update_priorities", "train", "sample"
|
||||
]
|
||||
}
|
||||
self.num_weight_syncs = 0
|
||||
self.learning_started = False
|
||||
|
||||
@@ -221,8 +231,8 @@ class AsyncSamplesOptimizer(PolicyOptimizer):
|
||||
sample_timesteps += self.sample_batch_size
|
||||
|
||||
# Send the data to the replay buffer
|
||||
random.choice(self.replay_actors).add_batch.remote(
|
||||
sample_batch)
|
||||
random.choice(
|
||||
self.replay_actors).add_batch.remote(sample_batch)
|
||||
|
||||
# Update weights if needed
|
||||
self.steps_since_update[ev] += self.sample_batch_size
|
||||
@@ -268,8 +278,8 @@ class AsyncSamplesOptimizer(PolicyOptimizer):
|
||||
timing["learner_dequeue_time_ms"] = round(
|
||||
1000 * self.learner.queue_timer.mean, 3)
|
||||
stats = {
|
||||
"sample_throughput": round(
|
||||
self.timers["sample"].mean_throughput, 3),
|
||||
"sample_throughput": round(self.timers["sample"].mean_throughput,
|
||||
3),
|
||||
"train_throughput": round(self.timers["train"].mean_throughput, 3),
|
||||
"num_weight_syncs": self.num_weight_syncs,
|
||||
}
|
||||
|
||||
@@ -6,7 +6,6 @@ from collections import namedtuple
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
|
||||
# Variable scope in which created variables will be placed under
|
||||
TOWER_SCOPE_NAME = "tower"
|
||||
|
||||
@@ -47,8 +46,14 @@ class LocalSyncParallelOptimizer(object):
|
||||
grad_norm_clipping: None or int stdev to clip grad norms by
|
||||
"""
|
||||
|
||||
def __init__(self, optimizer, devices, input_placeholders, rnn_inputs,
|
||||
per_device_batch_size, build_graph, logdir,
|
||||
def __init__(self,
|
||||
optimizer,
|
||||
devices,
|
||||
input_placeholders,
|
||||
rnn_inputs,
|
||||
per_device_batch_size,
|
||||
build_graph,
|
||||
logdir,
|
||||
grad_norm_clipping=None):
|
||||
# TODO(rliaw): remove logdir
|
||||
self.optimizer = optimizer
|
||||
@@ -78,8 +83,8 @@ class LocalSyncParallelOptimizer(object):
|
||||
self._towers = []
|
||||
for device, device_placeholders in zip(self.devices, data_splits):
|
||||
self._towers.append(
|
||||
self._setup_device(
|
||||
device, device_placeholders, len(input_placeholders)))
|
||||
self._setup_device(device, device_placeholders,
|
||||
len(input_placeholders)))
|
||||
|
||||
avg = average_gradients([t.grads for t in self._towers])
|
||||
if grad_norm_clipping:
|
||||
@@ -119,14 +124,10 @@ class LocalSyncParallelOptimizer(object):
|
||||
assert len(state_inputs[0]) * seq_len == len(inputs[0])
|
||||
# Make sure the shorter state inputs arrays are evenly divisible
|
||||
state_inputs = [
|
||||
make_divisible_by(arr, self.batch_size)
|
||||
for arr in state_inputs
|
||||
make_divisible_by(arr, self.batch_size) for arr in state_inputs
|
||||
]
|
||||
# Then truncate the data inputs to match
|
||||
inputs = [
|
||||
arr[:len(state_inputs[0]) * seq_len]
|
||||
for arr in inputs
|
||||
]
|
||||
inputs = [arr[:len(state_inputs[0]) * seq_len] for arr in inputs]
|
||||
assert len(state_inputs[0]) * seq_len == len(inputs[0])
|
||||
assert len(state_inputs[0]) % self.batch_size == 0
|
||||
for ph, arr in zip(self.loss_inputs, inputs + state_inputs):
|
||||
@@ -138,8 +139,7 @@ class LocalSyncParallelOptimizer(object):
|
||||
feed_dict[ph] = truncated_arr
|
||||
truncated_len = len(truncated_arr)
|
||||
|
||||
sess.run(
|
||||
[t.init_op for t in self._towers], feed_dict=feed_dict)
|
||||
sess.run([t.init_op for t in self._towers], feed_dict=feed_dict)
|
||||
|
||||
tuples_per_device = truncated_len / len(self.devices)
|
||||
assert tuples_per_device > 0, \
|
||||
@@ -198,7 +198,9 @@ class LocalSyncParallelOptimizer(object):
|
||||
device_input_slices = []
|
||||
for i, ph in enumerate(device_input_placeholders):
|
||||
current_batch = tf.Variable(
|
||||
ph, trainable=False, validate_shape=False,
|
||||
ph,
|
||||
trainable=False,
|
||||
validate_shape=False,
|
||||
collections=[])
|
||||
device_input_batches.append(current_batch)
|
||||
if i < num_data_in:
|
||||
@@ -210,18 +212,17 @@ class LocalSyncParallelOptimizer(object):
|
||||
current_slice = tf.slice(
|
||||
current_batch,
|
||||
([self._batch_index // scale * granularity] +
|
||||
[0] * len(ph.shape[1:])),
|
||||
[0] * len(ph.shape[1:])),
|
||||
([self.per_device_batch_size // scale * granularity] +
|
||||
[-1] * len(ph.shape[1:])))
|
||||
[-1] * len(ph.shape[1:])))
|
||||
current_slice.set_shape(ph.shape)
|
||||
device_input_slices.append(current_slice)
|
||||
graph_obj = self.build_graph(device_input_slices)
|
||||
device_grads = graph_obj.gradients(self.optimizer)
|
||||
return Tower(
|
||||
tf.group(*[batch.initializer
|
||||
for batch in device_input_batches]),
|
||||
device_grads,
|
||||
graph_obj)
|
||||
tf.group(
|
||||
*[batch.initializer for batch in device_input_batches]),
|
||||
device_grads, graph_obj)
|
||||
|
||||
|
||||
# Each tower is a copy of the loss graph pinned to a specific device.
|
||||
|
||||
@@ -30,8 +30,12 @@ class LocalMultiGPUOptimizer(PolicyOptimizer):
|
||||
may result in unexpected behavior.
|
||||
"""
|
||||
|
||||
def _init(self, sgd_batch_size=128, sgd_stepsize=5e-5, num_sgd_iter=10,
|
||||
timesteps_per_batch=1024, standardize_fields=[]):
|
||||
def _init(self,
|
||||
sgd_batch_size=128,
|
||||
sgd_stepsize=5e-5,
|
||||
num_sgd_iter=10,
|
||||
timesteps_per_batch=1024,
|
||||
standardize_fields=[]):
|
||||
self.batch_size = sgd_batch_size
|
||||
self.sgd_stepsize = sgd_stepsize
|
||||
self.num_sgd_iter = num_sgd_iter
|
||||
@@ -41,8 +45,8 @@ class LocalMultiGPUOptimizer(PolicyOptimizer):
|
||||
self.devices = ["/cpu:0"]
|
||||
else:
|
||||
self.devices = ["/gpu:{}".format(i) for i in range(len(gpu_ids))]
|
||||
self.batch_size = int(
|
||||
sgd_batch_size / len(self.devices)) * len(self.devices)
|
||||
self.batch_size = int(sgd_batch_size / len(self.devices)) * len(
|
||||
self.devices)
|
||||
assert self.batch_size % len(self.devices) == 0
|
||||
assert self.batch_size >= len(self.devices), "batch size too small"
|
||||
self.per_device_batch_size = int(self.batch_size / len(self.devices))
|
||||
@@ -70,16 +74,15 @@ class LocalMultiGPUOptimizer(PolicyOptimizer):
|
||||
with tf.variable_scope("default", reuse=tf.AUTO_REUSE):
|
||||
if self.policy._state_inputs:
|
||||
rnn_inputs = self.policy._state_inputs + [
|
||||
self.policy._seq_lens]
|
||||
self.policy._seq_lens
|
||||
]
|
||||
else:
|
||||
rnn_inputs = []
|
||||
self.par_opt = LocalSyncParallelOptimizer(
|
||||
tf.train.AdamOptimizer(self.sgd_stepsize),
|
||||
self.devices,
|
||||
[v for _, v in self.policy.loss_inputs()],
|
||||
rnn_inputs,
|
||||
self.per_device_batch_size,
|
||||
self.policy.copy,
|
||||
tf.train.AdamOptimizer(
|
||||
self.sgd_stepsize), self.devices,
|
||||
[v for _, v in self.policy.loss_inputs()], rnn_inputs,
|
||||
self.per_device_batch_size, self.policy.copy,
|
||||
os.getcwd())
|
||||
|
||||
self.sess = self.local_evaluator.tf_sess
|
||||
@@ -117,8 +120,7 @@ class LocalMultiGPUOptimizer(PolicyOptimizer):
|
||||
else:
|
||||
state_keys = []
|
||||
tuples_per_device = self.par_opt.load_data(
|
||||
self.sess,
|
||||
[tuples[k] for k in data_keys],
|
||||
self.sess, [tuples[k] for k in data_keys],
|
||||
[tuples[k] for k in state_keys])
|
||||
|
||||
with self.grad_timer:
|
||||
@@ -141,12 +143,14 @@ class LocalMultiGPUOptimizer(PolicyOptimizer):
|
||||
return _averaged(iter_extra_fetches)
|
||||
|
||||
def stats(self):
|
||||
return dict(PolicyOptimizer.stats(self), **{
|
||||
"sample_time_ms": round(1000 * self.sample_timer.mean, 3),
|
||||
"load_time_ms": round(1000 * self.load_timer.mean, 3),
|
||||
"grad_time_ms": round(1000 * self.grad_timer.mean, 3),
|
||||
"update_time_ms": round(1000 * self.update_weights_timer.mean, 3),
|
||||
})
|
||||
return dict(
|
||||
PolicyOptimizer.stats(self), **{
|
||||
"sample_time_ms": round(1000 * self.sample_timer.mean, 3),
|
||||
"load_time_ms": round(1000 * self.load_timer.mean, 3),
|
||||
"grad_time_ms": round(1000 * self.grad_timer.mean, 3),
|
||||
"update_time_ms": round(1000 * self.update_weights_timer.mean,
|
||||
3),
|
||||
})
|
||||
|
||||
|
||||
def _averaged(kv):
|
||||
|
||||
@@ -103,9 +103,10 @@ class PolicyOptimizer(object):
|
||||
"""
|
||||
|
||||
local_result = [func(self.local_evaluator, 0)]
|
||||
remote_results = ray.get(
|
||||
[ev.apply.remote(func, i + 1)
|
||||
for i, ev in enumerate(self.remote_evaluators)])
|
||||
remote_results = ray.get([
|
||||
ev.apply.remote(func, i + 1)
|
||||
for i, ev in enumerate(self.remote_evaluators)
|
||||
])
|
||||
return local_result + remote_results
|
||||
|
||||
def collect_metrics(self):
|
||||
|
||||
@@ -90,8 +90,10 @@ class ReplayBuffer(object):
|
||||
done_mask[i] = 1 if executing act_batch[i] resulted in
|
||||
the end of an episode and 0 otherwise.
|
||||
"""
|
||||
idxes = [random.randint(0, len(self._storage) - 1)
|
||||
for _ in range(batch_size)]
|
||||
idxes = [
|
||||
random.randint(0,
|
||||
len(self._storage) - 1) for _ in range(batch_size)
|
||||
]
|
||||
self._num_sampled += batch_size
|
||||
return self._encode_sample(idxes)
|
||||
|
||||
@@ -142,12 +144,12 @@ class PrioritizedReplayBuffer(ReplayBuffer):
|
||||
reward = np.sign(reward)
|
||||
|
||||
idx = self._next_idx
|
||||
super(PrioritizedReplayBuffer, self).add(
|
||||
obs_t, action, reward, obs_tp1, done, weight)
|
||||
super(PrioritizedReplayBuffer, self).add(obs_t, action, reward,
|
||||
obs_tp1, done, weight)
|
||||
if weight is None:
|
||||
weight = self._max_priority
|
||||
self._it_sum[idx] = weight ** self._alpha
|
||||
self._it_min[idx] = weight ** self._alpha
|
||||
self._it_sum[idx] = weight**self._alpha
|
||||
self._it_min[idx] = weight**self._alpha
|
||||
|
||||
def _sample_proportional(self, batch_size):
|
||||
res = []
|
||||
@@ -202,11 +204,11 @@ class PrioritizedReplayBuffer(ReplayBuffer):
|
||||
|
||||
weights = []
|
||||
p_min = self._it_min.min() / self._it_sum.sum()
|
||||
max_weight = (p_min * len(self._storage)) ** (-beta)
|
||||
max_weight = (p_min * len(self._storage))**(-beta)
|
||||
|
||||
for idx in idxes:
|
||||
p_sample = self._it_sum[idx] / self._it_sum.sum()
|
||||
weight = (p_sample * len(self._storage)) ** (-beta)
|
||||
weight = (p_sample * len(self._storage))**(-beta)
|
||||
weights.append(weight / max_weight)
|
||||
weights = np.array(weights)
|
||||
encoded_sample = self._encode_sample(idxes)
|
||||
@@ -231,10 +233,10 @@ class PrioritizedReplayBuffer(ReplayBuffer):
|
||||
for idx, priority in zip(idxes, priorities):
|
||||
assert priority > 0
|
||||
assert 0 <= idx < len(self._storage)
|
||||
delta = priority ** self._alpha - self._it_sum[idx]
|
||||
delta = priority**self._alpha - self._it_sum[idx]
|
||||
self._prio_change_stats.push(delta)
|
||||
self._it_sum[idx] = priority ** self._alpha
|
||||
self._it_min[idx] = priority ** self._alpha
|
||||
self._it_sum[idx] = priority**self._alpha
|
||||
self._it_min[idx] = priority**self._alpha
|
||||
|
||||
self._max_priority = max(self._max_priority, priority)
|
||||
|
||||
|
||||
@@ -54,8 +54,7 @@ class SegmentTree(object):
|
||||
return self._operation(
|
||||
self._reduce_helper(start, mid, 2 * node, node_start, mid),
|
||||
self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1,
|
||||
node_end)
|
||||
)
|
||||
node_end))
|
||||
|
||||
def reduce(self, start=0, end=None):
|
||||
"""Returns result of applying `self.operation`
|
||||
@@ -89,9 +88,8 @@ class SegmentTree(object):
|
||||
self._value[idx] = val
|
||||
idx //= 2
|
||||
while idx >= 1:
|
||||
self._value[idx] = self._operation(
|
||||
self._value[2 * idx],
|
||||
self._value[2 * idx + 1])
|
||||
self._value[idx] = self._operation(self._value[2 * idx],
|
||||
self._value[2 * idx + 1])
|
||||
idx //= 2
|
||||
|
||||
def __getitem__(self, idx):
|
||||
@@ -102,9 +100,7 @@ class SegmentTree(object):
|
||||
class SumSegmentTree(SegmentTree):
|
||||
def __init__(self, capacity):
|
||||
super(SumSegmentTree, self).__init__(
|
||||
capacity=capacity,
|
||||
operation=operator.add,
|
||||
neutral_element=0.0)
|
||||
capacity=capacity, operation=operator.add, neutral_element=0.0)
|
||||
|
||||
def sum(self, start=0, end=None):
|
||||
"""Returns arr[start] + ... + arr[end]"""
|
||||
@@ -142,9 +138,7 @@ class SumSegmentTree(SegmentTree):
|
||||
class MinSegmentTree(SegmentTree):
|
||||
def __init__(self, capacity):
|
||||
super(MinSegmentTree, self).__init__(
|
||||
capacity=capacity,
|
||||
operation=min,
|
||||
neutral_element=float('inf'))
|
||||
capacity=capacity, operation=min, neutral_element=float('inf'))
|
||||
|
||||
def min(self, start=0, end=None):
|
||||
"""Returns min(arr[start], ..., arr[end])"""
|
||||
|
||||
@@ -23,11 +23,16 @@ class SyncReplayOptimizer(PolicyOptimizer):
|
||||
"td_error" array in the info return of compute_gradients(). This error
|
||||
term will be used for sample prioritization."""
|
||||
|
||||
def _init(
|
||||
self, learning_starts=1000, buffer_size=10000,
|
||||
prioritized_replay=True, prioritized_replay_alpha=0.6,
|
||||
prioritized_replay_beta=0.4, prioritized_replay_eps=1e-6,
|
||||
train_batch_size=32, sample_batch_size=4, clip_rewards=True):
|
||||
def _init(self,
|
||||
learning_starts=1000,
|
||||
buffer_size=10000,
|
||||
prioritized_replay=True,
|
||||
prioritized_replay_alpha=0.6,
|
||||
prioritized_replay_beta=0.4,
|
||||
prioritized_replay_eps=1e-6,
|
||||
train_batch_size=32,
|
||||
sample_batch_size=4,
|
||||
clip_rewards=True):
|
||||
|
||||
self.replay_starts = learning_starts
|
||||
self.prioritized_replay_beta = prioritized_replay_beta
|
||||
@@ -43,11 +48,14 @@ class SyncReplayOptimizer(PolicyOptimizer):
|
||||
|
||||
# Set up replay buffer
|
||||
if prioritized_replay:
|
||||
|
||||
def new_buffer():
|
||||
return PrioritizedReplayBuffer(
|
||||
buffer_size, alpha=prioritized_replay_alpha,
|
||||
buffer_size,
|
||||
alpha=prioritized_replay_alpha,
|
||||
clip_rewards=clip_rewards)
|
||||
else:
|
||||
|
||||
def new_buffer():
|
||||
return ReplayBuffer(buffer_size, clip_rewards)
|
||||
|
||||
@@ -72,17 +80,19 @@ class SyncReplayOptimizer(PolicyOptimizer):
|
||||
|
||||
# Handle everything as if multiagent
|
||||
if isinstance(batch, SampleBatch):
|
||||
batch = MultiAgentBatch(
|
||||
{DEFAULT_POLICY_ID: batch}, batch.count)
|
||||
batch = MultiAgentBatch({
|
||||
DEFAULT_POLICY_ID: batch
|
||||
}, batch.count)
|
||||
|
||||
for policy_id, s in batch.policy_batches.items():
|
||||
for row in s.rows():
|
||||
if "weights" not in row:
|
||||
row["weights"] = np.ones_like(row["rewards"])
|
||||
self.replay_buffers[policy_id].add(
|
||||
pack_if_needed(row["obs"]), row["actions"],
|
||||
row["rewards"], pack_if_needed(row["new_obs"]),
|
||||
row["dones"], row["weights"])
|
||||
pack_if_needed(row["obs"]),
|
||||
row["actions"], row["rewards"],
|
||||
pack_if_needed(row["new_obs"]), row["dones"],
|
||||
row["weights"])
|
||||
|
||||
if self.num_steps_sampled >= self.replay_starts:
|
||||
self._optimize()
|
||||
@@ -112,27 +122,35 @@ class SyncReplayOptimizer(PolicyOptimizer):
|
||||
with self.replay_timer:
|
||||
for policy_id, replay_buffer in self.replay_buffers.items():
|
||||
if isinstance(replay_buffer, PrioritizedReplayBuffer):
|
||||
(obses_t, actions, rewards, obses_tp1,
|
||||
dones, weights, batch_indexes) = replay_buffer.sample(
|
||||
self.train_batch_size,
|
||||
beta=self.prioritized_replay_beta)
|
||||
(obses_t, actions, rewards, obses_tp1, dones, weights,
|
||||
batch_indexes) = replay_buffer.sample(
|
||||
self.train_batch_size,
|
||||
beta=self.prioritized_replay_beta)
|
||||
else:
|
||||
(obses_t, actions, rewards, obses_tp1,
|
||||
dones) = replay_buffer.sample(self.train_batch_size)
|
||||
dones) = replay_buffer.sample(self.train_batch_size)
|
||||
weights = np.ones_like(rewards)
|
||||
batch_indexes = - np.ones_like(rewards)
|
||||
batch_indexes = -np.ones_like(rewards)
|
||||
samples[policy_id] = SampleBatch({
|
||||
"obs": obses_t, "actions": actions, "rewards": rewards,
|
||||
"new_obs": obses_tp1, "dones": dones, "weights": weights,
|
||||
"batch_indexes": batch_indexes})
|
||||
"obs": obses_t,
|
||||
"actions": actions,
|
||||
"rewards": rewards,
|
||||
"new_obs": obses_tp1,
|
||||
"dones": dones,
|
||||
"weights": weights,
|
||||
"batch_indexes": batch_indexes
|
||||
})
|
||||
return MultiAgentBatch(samples, self.train_batch_size)
|
||||
|
||||
def stats(self):
|
||||
return dict(PolicyOptimizer.stats(self), **{
|
||||
"sample_time_ms": round(1000 * self.sample_timer.mean, 3),
|
||||
"replay_time_ms": round(1000 * self.replay_timer.mean, 3),
|
||||
"grad_time_ms": round(1000 * self.grad_timer.mean, 3),
|
||||
"update_time_ms": round(1000 * self.update_weights_timer.mean, 3),
|
||||
"opt_peak_throughput": round(self.grad_timer.mean_throughput, 3),
|
||||
"opt_samples": round(self.grad_timer.mean_units_processed, 3),
|
||||
})
|
||||
return dict(
|
||||
PolicyOptimizer.stats(self), **{
|
||||
"sample_time_ms": round(1000 * self.sample_timer.mean, 3),
|
||||
"replay_time_ms": round(1000 * self.replay_timer.mean, 3),
|
||||
"grad_time_ms": round(1000 * self.grad_timer.mean, 3),
|
||||
"update_time_ms": round(1000 * self.update_weights_timer.mean,
|
||||
3),
|
||||
"opt_peak_throughput": round(self.grad_timer.mean_throughput,
|
||||
3),
|
||||
"opt_samples": round(self.grad_timer.mean_units_processed, 3),
|
||||
})
|
||||
|
||||
@@ -51,10 +51,13 @@ class SyncSamplesOptimizer(PolicyOptimizer):
|
||||
return fetches
|
||||
|
||||
def stats(self):
|
||||
return dict(PolicyOptimizer.stats(self), **{
|
||||
"sample_time_ms": round(1000 * self.sample_timer.mean, 3),
|
||||
"grad_time_ms": round(1000 * self.grad_timer.mean, 3),
|
||||
"update_time_ms": round(1000 * self.update_weights_timer.mean, 3),
|
||||
"opt_peak_throughput": round(self.grad_timer.mean_throughput, 3),
|
||||
"opt_samples": round(self.grad_timer.mean_units_processed, 3),
|
||||
})
|
||||
return dict(
|
||||
PolicyOptimizer.stats(self), **{
|
||||
"sample_time_ms": round(1000 * self.sample_timer.mean, 3),
|
||||
"grad_time_ms": round(1000 * self.grad_timer.mean, 3),
|
||||
"update_time_ms": round(1000 * self.update_weights_timer.mean,
|
||||
3),
|
||||
"opt_peak_throughput": round(self.grad_timer.mean_throughput,
|
||||
3),
|
||||
"opt_samples": round(self.grad_timer.mean_units_processed, 3),
|
||||
})
|
||||
|
||||
+17
-11
@@ -15,7 +15,6 @@ from ray.rllib.agents.agent import get_agent_class
|
||||
from ray.rllib.agents.dqn.common.wrappers import wrap_dqn
|
||||
from ray.rllib.models import ModelCatalog
|
||||
|
||||
|
||||
EXAMPLE_USAGE = """
|
||||
Example Usage via RLlib CLI:
|
||||
rllib rollout /tmp/ray/checkpoint_dir/checkpoint-0 --run DQN
|
||||
@@ -32,30 +31,37 @@ def create_parser(parser_creator=None):
|
||||
parser = parser_creator(
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
description="Roll out a reinforcement learning agent "
|
||||
"given a checkpoint.", epilog=EXAMPLE_USAGE)
|
||||
"given a checkpoint.",
|
||||
epilog=EXAMPLE_USAGE)
|
||||
|
||||
parser.add_argument(
|
||||
"checkpoint", type=str, help="Checkpoint from which to roll out.")
|
||||
required_named = parser.add_argument_group("required named arguments")
|
||||
required_named.add_argument(
|
||||
"--run", type=str, required=True,
|
||||
"--run",
|
||||
type=str,
|
||||
required=True,
|
||||
help="The algorithm or model to train. This may refer to the name "
|
||||
"of a built-on algorithm (e.g. RLLib's DQN or PPO), or a "
|
||||
"user-defined trainable function or class registered in the "
|
||||
"tune registry.")
|
||||
"of a built-on algorithm (e.g. RLLib's DQN or PPO), or a "
|
||||
"user-defined trainable function or class registered in the "
|
||||
"tune registry.")
|
||||
required_named.add_argument(
|
||||
"--env", type=str, help="The gym environment to use.")
|
||||
parser.add_argument(
|
||||
"--no-render", default=False, action="store_const", const=True,
|
||||
"--no-render",
|
||||
default=False,
|
||||
action="store_const",
|
||||
const=True,
|
||||
help="Surpress rendering of the environment.")
|
||||
parser.add_argument(
|
||||
"--steps", default=None, help="Number of steps to roll out.")
|
||||
parser.add_argument("--out", default=None, help="Output filename.")
|
||||
parser.add_argument(
|
||||
"--out", default=None, help="Output filename.")
|
||||
parser.add_argument(
|
||||
"--config", default="{}", type=json.loads,
|
||||
"--config",
|
||||
default="{}",
|
||||
type=json.loads,
|
||||
help="Algorithm-specific configuration (e.g. env, hyperparams). "
|
||||
"Surpresses loading of configuration from checkpoint.")
|
||||
"Surpresses loading of configuration from checkpoint.")
|
||||
return parser
|
||||
|
||||
|
||||
|
||||
@@ -9,7 +9,6 @@ import argparse
|
||||
from ray.rllib import train
|
||||
from ray.rllib import rollout
|
||||
|
||||
|
||||
EXAMPLE_USAGE = """
|
||||
Example usage for training:
|
||||
rllib train --run DQN --env CartPole-v0
|
||||
|
||||
@@ -15,16 +15,17 @@ class _MockEvaluator(object):
|
||||
self._sample_count = sample_count
|
||||
self.obs_filter = MeanStdFilter(())
|
||||
self.rew_filter = MeanStdFilter(())
|
||||
self.filters = {"obs_filter": self.obs_filter,
|
||||
"rew_filter": self.rew_filter}
|
||||
self.filters = {
|
||||
"obs_filter": self.obs_filter,
|
||||
"rew_filter": self.rew_filter
|
||||
}
|
||||
|
||||
def sample(self):
|
||||
samples_dict = {"observations": [], "rewards": []}
|
||||
for i in range(self._sample_count):
|
||||
samples_dict["observations"].append(
|
||||
self.obs_filter(np.random.randn()))
|
||||
samples_dict["rewards"].append(
|
||||
self.rew_filter(np.random.randn()))
|
||||
samples_dict["rewards"].append(self.rew_filter(np.random.randn()))
|
||||
return SampleBatch(samples_dict)
|
||||
|
||||
def compute_gradients(self, samples):
|
||||
|
||||
@@ -8,8 +8,8 @@ import ray
|
||||
|
||||
from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.models.model import Model
|
||||
from ray.rllib.models.preprocessors import (
|
||||
NoPreprocessor, OneHotPreprocessor, Preprocessor)
|
||||
from ray.rllib.models.preprocessors import (NoPreprocessor, OneHotPreprocessor,
|
||||
Preprocessor)
|
||||
from ray.rllib.models.fcnet import FullyConnectedNetwork
|
||||
from ray.rllib.models.visionnet import VisionNetwork
|
||||
|
||||
@@ -44,9 +44,11 @@ class ModelCatalogTest(unittest.TestCase):
|
||||
class TupleEnv(object):
|
||||
def __init__(self):
|
||||
self.observation_space = Tuple(
|
||||
[Discrete(5), Box(0, 1, shape=(3,), dtype=np.float32)])
|
||||
[Discrete(5),
|
||||
Box(0, 1, shape=(3, ), dtype=np.float32)])
|
||||
|
||||
p1 = ModelCatalog.get_preprocessor(TupleEnv())
|
||||
self.assertEqual(p1.shape, (8,))
|
||||
self.assertEqual(p1.shape, (8, ))
|
||||
self.assertEqual(
|
||||
list(p1.transform((0, [1, 2, 3]))),
|
||||
[float(x) for x in [1, 0, 0, 0, 0, 1, 2, 3]])
|
||||
|
||||
@@ -20,12 +20,24 @@ def get_mean_action(alg, obs):
|
||||
ray.init(num_cpus=10)
|
||||
|
||||
CONFIGS = {
|
||||
"ES": {"episodes_per_batch": 10, "timesteps_per_batch": 100,
|
||||
"num_workers": 2},
|
||||
"ES": {
|
||||
"episodes_per_batch": 10,
|
||||
"timesteps_per_batch": 100,
|
||||
"num_workers": 2
|
||||
},
|
||||
"DQN": {},
|
||||
"DDPG": {"noise_scale": 0.0, "timesteps_per_iteration": 100},
|
||||
"PPO": {"num_sgd_iter": 5, "timesteps_per_batch": 1000, "num_workers": 2},
|
||||
"A3C": {"num_workers": 1},
|
||||
"DDPG": {
|
||||
"noise_scale": 0.0,
|
||||
"timesteps_per_iteration": 100
|
||||
},
|
||||
"PPO": {
|
||||
"num_sgd_iter": 5,
|
||||
"timesteps_per_batch": 1000,
|
||||
"num_workers": 2
|
||||
},
|
||||
"A3C": {
|
||||
"num_workers": 1
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -13,7 +13,7 @@ from ray.rllib.test.mock_evaluator import _MockEvaluator
|
||||
|
||||
class RunningStatTest(unittest.TestCase):
|
||||
def testRunningStat(self):
|
||||
for shp in ((), (3,), (3, 4)):
|
||||
for shp in ((), (3, ), (3, 4)):
|
||||
li = []
|
||||
rs = RunningStat(shp)
|
||||
for _ in range(5):
|
||||
@@ -22,12 +22,12 @@ class RunningStatTest(unittest.TestCase):
|
||||
li.append(val)
|
||||
m = np.mean(li, axis=0)
|
||||
self.assertTrue(np.allclose(rs.mean, m))
|
||||
v = (np.square(m) if (len(li) == 1)
|
||||
else np.var(li, ddof=1, axis=0))
|
||||
v = (np.square(m)
|
||||
if (len(li) == 1) else np.var(li, ddof=1, axis=0))
|
||||
self.assertTrue(np.allclose(rs.var, v))
|
||||
|
||||
def testCombiningStat(self):
|
||||
for shape in [(), (3,), (3, 4)]:
|
||||
for shape in [(), (3, ), (3, 4)]:
|
||||
li = []
|
||||
rs1 = RunningStat(shape)
|
||||
rs2 = RunningStat(shape)
|
||||
@@ -48,7 +48,7 @@ class RunningStatTest(unittest.TestCase):
|
||||
|
||||
class MSFTest(unittest.TestCase):
|
||||
def testBasic(self):
|
||||
for shape in [(), (3,), (3, 4, 4)]:
|
||||
for shape in [(), (3, ), (3, 4, 4)]:
|
||||
filt = MeanStdFilter(shape)
|
||||
for i in range(5):
|
||||
filt(np.ones(shape))
|
||||
@@ -93,8 +93,10 @@ class FilterManagerTest(unittest.TestCase):
|
||||
remote_e = RemoteEvaluator.remote(sample_count=10)
|
||||
remote_e.sample.remote()
|
||||
|
||||
FilterManager.synchronize(
|
||||
{"obs_filter": filt1, "rew_filter": filt1.copy()}, [remote_e])
|
||||
FilterManager.synchronize({
|
||||
"obs_filter": filt1,
|
||||
"rew_filter": filt1.copy()
|
||||
}, [remote_e])
|
||||
|
||||
filters = ray.get(remote_e.get_filters.remote())
|
||||
obs_f = filters["obs_filter"]
|
||||
|
||||
@@ -10,22 +10,15 @@ from ray.rllib.models.lstm import chop_into_sequences
|
||||
class LSTMUtilsTest(unittest.TestCase):
|
||||
def testBasic(self):
|
||||
t = [1, 2, 3, 1, 2, 3, 4, 5]
|
||||
f = [
|
||||
[101, 102, 103, 201, 202, 203, 204, 205],
|
||||
[[101], [102], [103], [201], [202], [203], [204], [205]]
|
||||
]
|
||||
f = [[101, 102, 103, 201, 202, 203, 204, 205],
|
||||
[[101], [102], [103], [201], [202], [203], [204], [205]]]
|
||||
s = [[209, 208, 207, 109, 108, 107, 106, 105]]
|
||||
f_pad, s_init, seq_lens = chop_into_sequences(t, f, s, 4)
|
||||
self.assertEqual(
|
||||
[f.tolist() for f in f_pad],
|
||||
[
|
||||
[101, 102, 103, 0,
|
||||
201, 202, 203, 204,
|
||||
205, 0, 0, 0],
|
||||
[[101], [102], [103], [0],
|
||||
[201], [202], [203], [204],
|
||||
[205], [0], [0], [0]],
|
||||
])
|
||||
self.assertEqual([f.tolist() for f in f_pad], [
|
||||
[101, 102, 103, 0, 201, 202, 203, 204, 205, 0, 0, 0],
|
||||
[[101], [102], [103], [0], [201], [202], [203], [204], [205], [0],
|
||||
[0], [0]],
|
||||
])
|
||||
self.assertEqual([s.tolist() for s in s_init], [[209, 109, 105]])
|
||||
self.assertEqual(seq_lens.tolist(), [3, 4, 1])
|
||||
|
||||
|
||||
@@ -129,12 +129,21 @@ class TestMultiAgentEnv(unittest.TestCase):
|
||||
obs, rew, done, info = env.step({0: 0, 1: 0, 2: 0, 3: 0})
|
||||
self.assertEqual(obs, {0: 0, 1: 0, 2: 0, 3: 0})
|
||||
self.assertEqual(rew, {0: 1, 1: 1, 2: 1, 3: 1})
|
||||
self.assertEqual(
|
||||
done,
|
||||
{0: False, 1: False, 2: False, 3: False, "__all__": False})
|
||||
self.assertEqual(done, {
|
||||
0: False,
|
||||
1: False,
|
||||
2: False,
|
||||
3: False,
|
||||
"__all__": False
|
||||
})
|
||||
obs, rew, done, info = env.step({0: 0, 1: 0, 2: 0, 3: 0})
|
||||
self.assertEqual(
|
||||
done, {0: True, 1: True, 2: True, 3: True, "__all__": True})
|
||||
self.assertEqual(done, {
|
||||
0: True,
|
||||
1: True,
|
||||
2: True,
|
||||
3: True,
|
||||
"__all__": True
|
||||
})
|
||||
|
||||
def testRoundRobinMock(self):
|
||||
env = RoundRobinMultiAgent(2)
|
||||
@@ -156,24 +165,51 @@ class TestMultiAgentEnv(unittest.TestCase):
|
||||
self.assertEqual(obs, {0: {0: 0, 1: 0}, 1: {0: 0, 1: 0}})
|
||||
self.assertEqual(rew, {0: {0: None, 1: None}, 1: {0: None, 1: None}})
|
||||
self.assertEqual(
|
||||
dones,
|
||||
{0: {0: False, 1: False, "__all__": False},
|
||||
1: {0: False, 1: False, "__all__": False}})
|
||||
dones, {
|
||||
0: {
|
||||
0: False,
|
||||
1: False,
|
||||
"__all__": False
|
||||
},
|
||||
1: {
|
||||
0: False,
|
||||
1: False,
|
||||
"__all__": False
|
||||
}
|
||||
})
|
||||
for _ in range(24):
|
||||
env.send_actions({0: {0: 0, 1: 0}, 1: {0: 0, 1: 0}})
|
||||
obs, rew, dones, _, _ = env.poll()
|
||||
self.assertEqual(obs, {0: {0: 0, 1: 0}, 1: {0: 0, 1: 0}})
|
||||
self.assertEqual(rew, {0: {0: 1, 1: 1}, 1: {0: 1, 1: 1}})
|
||||
self.assertEqual(
|
||||
dones,
|
||||
{0: {0: False, 1: False, "__all__": False},
|
||||
1: {0: False, 1: False, "__all__": False}})
|
||||
dones, {
|
||||
0: {
|
||||
0: False,
|
||||
1: False,
|
||||
"__all__": False
|
||||
},
|
||||
1: {
|
||||
0: False,
|
||||
1: False,
|
||||
"__all__": False
|
||||
}
|
||||
})
|
||||
env.send_actions({0: {0: 0, 1: 0}, 1: {0: 0, 1: 0}})
|
||||
obs, rew, dones, _, _ = env.poll()
|
||||
self.assertEqual(
|
||||
dones,
|
||||
{0: {0: True, 1: True, "__all__": True},
|
||||
1: {0: True, 1: True, "__all__": True}})
|
||||
dones, {
|
||||
0: {
|
||||
0: True,
|
||||
1: True,
|
||||
"__all__": True
|
||||
},
|
||||
1: {
|
||||
0: True,
|
||||
1: True,
|
||||
"__all__": True
|
||||
}
|
||||
})
|
||||
|
||||
# Reset processing
|
||||
self.assertRaises(
|
||||
@@ -186,9 +222,18 @@ class TestMultiAgentEnv(unittest.TestCase):
|
||||
self.assertEqual(obs, {0: {0: 0, 1: 0}, 1: {0: 0, 1: 0}})
|
||||
self.assertEqual(rew, {0: {0: 1, 1: 1}, 1: {0: 1, 1: 1}})
|
||||
self.assertEqual(
|
||||
dones,
|
||||
{0: {0: False, 1: False, "__all__": False},
|
||||
1: {0: False, 1: False, "__all__": False}})
|
||||
dones, {
|
||||
0: {
|
||||
0: False,
|
||||
1: False,
|
||||
"__all__": False
|
||||
},
|
||||
1: {
|
||||
0: False,
|
||||
1: False,
|
||||
"__all__": False
|
||||
}
|
||||
})
|
||||
|
||||
def testVectorizeRoundRobin(self):
|
||||
env = _MultiAgentEnvToAsync(lambda: RoundRobinMultiAgent(2), [], 2)
|
||||
@@ -217,9 +262,8 @@ class TestMultiAgentEnv(unittest.TestCase):
|
||||
self.assertEqual(batch.count, 50)
|
||||
self.assertEqual(batch.policy_batches["p0"].count, 150)
|
||||
self.assertEqual(batch.policy_batches["p1"].count, 100)
|
||||
self.assertEqual(
|
||||
batch.policy_batches["p0"]["t"].tolist(),
|
||||
list(range(25)) * 6)
|
||||
self.assertEqual(batch.policy_batches["p0"]["t"].tolist(),
|
||||
list(range(25)) * 6)
|
||||
|
||||
def testMultiAgentSampleRoundRobin(self):
|
||||
act_space = gym.spaces.Discrete(2)
|
||||
@@ -236,21 +280,16 @@ class TestMultiAgentEnv(unittest.TestCase):
|
||||
# since we round robin introduce agents into the env, some of the env
|
||||
# steps don't count as proper transitions
|
||||
self.assertEqual(batch.policy_batches["p0"].count, 42)
|
||||
self.assertEqual(
|
||||
batch.policy_batches["p0"]["obs"].tolist()[:10],
|
||||
[0, 1, 2, 3, 4] * 2)
|
||||
self.assertEqual(
|
||||
batch.policy_batches["p0"]["new_obs"].tolist()[:10],
|
||||
[1, 2, 3, 4, 5] * 2)
|
||||
self.assertEqual(
|
||||
batch.policy_batches["p0"]["rewards"].tolist()[:10],
|
||||
[100, 100, 100, 100, 0] * 2)
|
||||
self.assertEqual(
|
||||
batch.policy_batches["p0"]["dones"].tolist()[:10],
|
||||
[False, False, False, False, True] * 2)
|
||||
self.assertEqual(
|
||||
batch.policy_batches["p0"]["t"].tolist()[:10],
|
||||
[4, 9, 14, 19, 24, 5, 10, 15, 20, 25])
|
||||
self.assertEqual(batch.policy_batches["p0"]["obs"].tolist()[:10],
|
||||
[0, 1, 2, 3, 4] * 2)
|
||||
self.assertEqual(batch.policy_batches["p0"]["new_obs"].tolist()[:10],
|
||||
[1, 2, 3, 4, 5] * 2)
|
||||
self.assertEqual(batch.policy_batches["p0"]["rewards"].tolist()[:10],
|
||||
[100, 100, 100, 100, 0] * 2)
|
||||
self.assertEqual(batch.policy_batches["p0"]["dones"].tolist()[:10],
|
||||
[False, False, False, False, True] * 2)
|
||||
self.assertEqual(batch.policy_batches["p0"]["t"].tolist()[:10],
|
||||
[4, 9, 14, 19, 24, 5, 10, 15, 20, 25])
|
||||
|
||||
def testTrainMultiCartpoleSinglePolicy(self):
|
||||
n = 10
|
||||
@@ -289,11 +328,17 @@ class TestMultiAgentEnv(unittest.TestCase):
|
||||
policy_mapping_fn=lambda agent_id: ["p1", "p2"][agent_id % 2],
|
||||
batch_steps=50)
|
||||
if optimizer_cls == AsyncGradientsOptimizer:
|
||||
remote_evs = [PolicyEvaluator.as_remote().remote(
|
||||
env_creator=lambda _: MultiCartpole(n),
|
||||
policy_graph=policies,
|
||||
policy_mapping_fn=lambda agent_id: ["p1", "p2"][agent_id % 2],
|
||||
batch_steps=50)]
|
||||
|
||||
def policy_mapper(agent_id):
|
||||
return ["p1", "p2"][agent_id % 2]
|
||||
|
||||
remote_evs = [
|
||||
PolicyEvaluator.as_remote().remote(
|
||||
env_creator=lambda _: MultiCartpole(n),
|
||||
policy_graph=policies,
|
||||
policy_mapping_fn=policy_mapper,
|
||||
batch_steps=50)
|
||||
]
|
||||
else:
|
||||
remote_evs = []
|
||||
optimizer = optimizer_cls(ev, remote_evs, {})
|
||||
@@ -330,8 +375,8 @@ class TestMultiAgentEnv(unittest.TestCase):
|
||||
obs_space = env.observation_space
|
||||
policies = {}
|
||||
for i in range(20):
|
||||
policies["pg_{}".format(i)] = (
|
||||
PGPolicyGraph, obs_space, act_space, {})
|
||||
policies["pg_{}".format(i)] = (PGPolicyGraph, obs_space, act_space,
|
||||
{})
|
||||
policy_ids = list(policies.keys())
|
||||
ev = PolicyEvaluator(
|
||||
env_creator=lambda _: MultiCartpole(n),
|
||||
|
||||
@@ -21,8 +21,8 @@ class AsyncOptimizerTest(unittest.TestCase):
|
||||
local = _MockEvaluator()
|
||||
remotes = ray.remote(_MockEvaluator)
|
||||
remote_evaluators = [remotes.remote() for i in range(5)]
|
||||
test_optimizer = AsyncGradientsOptimizer(
|
||||
local, remote_evaluators, {"grads_per_step": 10})
|
||||
test_optimizer = AsyncGradientsOptimizer(local, remote_evaluators,
|
||||
{"grads_per_step": 10})
|
||||
test_optimizer.step()
|
||||
self.assertTrue(all(local.get_weights() == 0))
|
||||
|
||||
|
||||
@@ -66,8 +66,7 @@ class MockEnv2(gym.Env):
|
||||
|
||||
class MockVectorEnv(VectorEnv):
|
||||
def __init__(self, episode_length, num_envs):
|
||||
self.envs = [
|
||||
MockEnv(episode_length) for _ in range(num_envs)]
|
||||
self.envs = [MockEnv(episode_length) for _ in range(num_envs)]
|
||||
self.observation_space = gym.spaces.Discrete(1)
|
||||
self.action_space = gym.spaces.Discrete(2)
|
||||
self.num_envs = num_envs
|
||||
@@ -102,7 +101,10 @@ class TestPolicyEvaluator(unittest.TestCase):
|
||||
def testQueryEvaluators(self):
|
||||
register_env("test", lambda _: gym.make("CartPole-v0"))
|
||||
pg = PGAgent(
|
||||
env="test", config={"num_workers": 2, "sample_batch_size": 5})
|
||||
env="test", config={
|
||||
"num_workers": 2,
|
||||
"sample_batch_size": 5
|
||||
})
|
||||
results = pg.optimizer.foreach_evaluator(lambda ev: ev.batch_steps)
|
||||
results2 = pg.optimizer.foreach_evaluator_with_index(
|
||||
lambda ev, i: (i, ev.batch_steps))
|
||||
@@ -112,10 +114,12 @@ class TestPolicyEvaluator(unittest.TestCase):
|
||||
def testMetrics(self):
|
||||
ev = PolicyEvaluator(
|
||||
env_creator=lambda _: MockEnv(episode_length=10),
|
||||
policy_graph=MockPolicyGraph, batch_mode="complete_episodes")
|
||||
policy_graph=MockPolicyGraph,
|
||||
batch_mode="complete_episodes")
|
||||
remote_ev = PolicyEvaluator.as_remote().remote(
|
||||
env_creator=lambda _: MockEnv(episode_length=10),
|
||||
policy_graph=MockPolicyGraph, batch_mode="complete_episodes")
|
||||
policy_graph=MockPolicyGraph,
|
||||
batch_mode="complete_episodes")
|
||||
ev.sample()
|
||||
ray.get(remote_ev.sample.remote())
|
||||
result = collect_metrics(ev, [remote_ev])
|
||||
@@ -149,7 +153,8 @@ class TestPolicyEvaluator(unittest.TestCase):
|
||||
env_creator=lambda _: MockEnv(episode_length=20),
|
||||
policy_graph=MockPolicyGraph,
|
||||
batch_mode="truncate_episodes",
|
||||
batch_steps=16, num_envs=8)
|
||||
batch_steps=16,
|
||||
num_envs=8)
|
||||
for _ in range(8):
|
||||
batch = ev.sample()
|
||||
self.assertEqual(batch.count, 16)
|
||||
@@ -175,7 +180,8 @@ class TestPolicyEvaluator(unittest.TestCase):
|
||||
env_creator=lambda _: MockEnv(episode_length=8),
|
||||
policy_graph=MockPolicyGraph,
|
||||
batch_mode="truncate_episodes",
|
||||
batch_steps=16, num_envs=4)
|
||||
batch_steps=16,
|
||||
num_envs=4)
|
||||
batch = ev.sample()
|
||||
self.assertEqual(batch.count, 16)
|
||||
result = collect_metrics(ev, [])
|
||||
@@ -186,8 +192,7 @@ class TestPolicyEvaluator(unittest.TestCase):
|
||||
|
||||
def testVectorEnvSupport(self):
|
||||
ev = PolicyEvaluator(
|
||||
env_creator=lambda _: MockVectorEnv(
|
||||
episode_length=20, num_envs=8),
|
||||
env_creator=lambda _: MockVectorEnv(episode_length=20, num_envs=8),
|
||||
policy_graph=MockPolicyGraph,
|
||||
batch_mode="truncate_episodes",
|
||||
batch_steps=10)
|
||||
|
||||
@@ -83,8 +83,8 @@ class MultiServing(ServingEnv):
|
||||
def __init__(self, env_creator):
|
||||
self.env_creator = env_creator
|
||||
self.env = env_creator()
|
||||
ServingEnv.__init__(
|
||||
self, self.env.action_space, self.env.observation_space)
|
||||
ServingEnv.__init__(self, self.env.action_space,
|
||||
self.env.observation_space)
|
||||
|
||||
def run(self):
|
||||
envs = [self.env_creator() for _ in range(5)]
|
||||
@@ -97,8 +97,7 @@ class MultiServing(ServingEnv):
|
||||
eids[i] = uuid.uuid4().hex
|
||||
self.start_episode(episode_id=eids[i])
|
||||
cur_obs[i] = envs[i].reset()
|
||||
actions = [
|
||||
self.get_action(eids[i], cur_obs[i]) for i in active]
|
||||
actions = [self.get_action(eids[i], cur_obs[i]) for i in active]
|
||||
for i, action in zip(active, actions):
|
||||
obs, reward, done, _ = envs[i].step(action)
|
||||
cur_obs[i] = obs
|
||||
@@ -164,8 +163,7 @@ class TestServingEnv(unittest.TestCase):
|
||||
raise Exception("failed to improve reward")
|
||||
|
||||
def testTrainCartpole(self):
|
||||
register_env(
|
||||
"test", lambda _: SimpleServing(gym.make("CartPole-v0")))
|
||||
register_env("test", lambda _: SimpleServing(gym.make("CartPole-v0")))
|
||||
pg = PGAgent(env="test", config={"num_workers": 0})
|
||||
for i in range(100):
|
||||
result = pg.train()
|
||||
@@ -176,8 +174,8 @@ class TestServingEnv(unittest.TestCase):
|
||||
raise Exception("failed to improve reward")
|
||||
|
||||
def testTrainCartpoleMulti(self):
|
||||
register_env(
|
||||
"test2", lambda _: MultiServing(lambda: gym.make("CartPole-v0")))
|
||||
register_env("test2",
|
||||
lambda _: MultiServing(lambda: gym.make("CartPole-v0")))
|
||||
pg = PGAgent(env="test2", config={"num_workers": 0})
|
||||
for i in range(100):
|
||||
result = pg.train()
|
||||
|
||||
@@ -14,27 +14,29 @@ from ray.tune.registry import register_env
|
||||
|
||||
ACTION_SPACES_TO_TEST = {
|
||||
"discrete": Discrete(5),
|
||||
"vector": Box(0.0, 1.0, (5,), dtype=np.float32),
|
||||
"vector": Box(0.0, 1.0, (5, ), dtype=np.float32),
|
||||
"simple_tuple": Tuple([
|
||||
Box(0.0, 1.0, (5,), dtype=np.float32),
|
||||
Box(0.0, 1.0, (5,), dtype=np.float32)]),
|
||||
Box(0.0, 1.0, (5, ), dtype=np.float32),
|
||||
Box(0.0, 1.0, (5, ), dtype=np.float32)
|
||||
]),
|
||||
"implicit_tuple": [
|
||||
Box(0.0, 1.0, (5,), dtype=np.float32),
|
||||
Box(0.0, 1.0, (5,), dtype=np.float32)],
|
||||
Box(0.0, 1.0, (5, ), dtype=np.float32),
|
||||
Box(0.0, 1.0, (5, ), dtype=np.float32)
|
||||
],
|
||||
}
|
||||
|
||||
OBSERVATION_SPACES_TO_TEST = {
|
||||
"discrete": Discrete(5),
|
||||
"vector": Box(0.0, 1.0, (5,), dtype=np.float32),
|
||||
"vector": Box(0.0, 1.0, (5, ), dtype=np.float32),
|
||||
"image": Box(0.0, 1.0, (80, 80, 1), dtype=np.float32),
|
||||
"atari": Box(0.0, 1.0, (210, 160, 3), dtype=np.float32),
|
||||
"atari_ram": Box(0.0, 1.0, (128,), dtype=np.float32),
|
||||
"atari_ram": Box(0.0, 1.0, (128, ), dtype=np.float32),
|
||||
"simple_tuple": Tuple([
|
||||
Box(0.0, 1.0, (5,), dtype=np.float32),
|
||||
Box(0.0, 1.0, (5,), dtype=np.float32)]),
|
||||
"mixed_tuple": Tuple([
|
||||
Discrete(10),
|
||||
Box(0.0, 1.0, (5,), dtype=np.float32)]),
|
||||
Box(0.0, 1.0, (5, ), dtype=np.float32),
|
||||
Box(0.0, 1.0, (5, ), dtype=np.float32)
|
||||
]),
|
||||
"mixed_tuple": Tuple(
|
||||
[Discrete(10), Box(0.0, 1.0, (5, ), dtype=np.float32)]),
|
||||
}
|
||||
|
||||
|
||||
@@ -90,30 +92,33 @@ class ModelSupportedSpaces(unittest.TestCase):
|
||||
stats = {}
|
||||
check_support("DDPG", {"timesteps_per_iteration": 1}, stats)
|
||||
check_support("DQN", {"timesteps_per_iteration": 1}, stats)
|
||||
check_support("A3C", {
|
||||
"num_workers": 1,
|
||||
"optimizer": {
|
||||
"grads_per_step": 1
|
||||
}
|
||||
}, stats)
|
||||
check_support(
|
||||
"A3C", {"num_workers": 1, "optimizer": {"grads_per_step": 1}},
|
||||
stats)
|
||||
"PPO", {
|
||||
"num_workers": 1,
|
||||
"num_sgd_iter": 1,
|
||||
"timesteps_per_batch": 1,
|
||||
"sgd_batchsize": 1
|
||||
}, stats)
|
||||
check_support(
|
||||
"PPO",
|
||||
{"num_workers": 1, "num_sgd_iter": 1, "timesteps_per_batch": 1,
|
||||
"sgd_batchsize": 1},
|
||||
stats)
|
||||
check_support(
|
||||
"ES",
|
||||
{"num_workers": 1, "noise_size": 10000000,
|
||||
"episodes_per_batch": 1, "timesteps_per_batch": 1},
|
||||
stats)
|
||||
check_support(
|
||||
"PG",
|
||||
{"num_workers": 1, "optimizer": {}},
|
||||
stats)
|
||||
"ES", {
|
||||
"num_workers": 1,
|
||||
"noise_size": 10000000,
|
||||
"episodes_per_batch": 1,
|
||||
"timesteps_per_batch": 1
|
||||
}, stats)
|
||||
check_support("PG", {"num_workers": 1, "optimizer": {}}, stats)
|
||||
num_unexpected_errors = 0
|
||||
for (alg, a_name, o_name), stat in sorted(stats.items()):
|
||||
if stat not in ["ok", "unsupported"]:
|
||||
num_unexpected_errors += 1
|
||||
print(
|
||||
alg, "action_space", a_name, "obs_space", o_name,
|
||||
"result", stat)
|
||||
print(alg, "action_space", a_name, "obs_space", o_name, "result",
|
||||
stat)
|
||||
self.assertEqual(num_unexpected_errors, 0)
|
||||
|
||||
|
||||
@@ -123,7 +128,7 @@ if __name__ == "__main__":
|
||||
"discrete": Discrete(5),
|
||||
}
|
||||
OBSERVATION_SPACES_TO_TEST = {
|
||||
"vector": Box(0.0, 1.0, (5,), dtype=np.float32),
|
||||
"vector": Box(0.0, 1.0, (5, ), dtype=np.float32),
|
||||
"atari": Box(0.0, 1.0, (210, 160, 3), dtype=np.float32),
|
||||
}
|
||||
unittest.main(verbosity=2)
|
||||
|
||||
+24
-11
@@ -11,7 +11,6 @@ import ray
|
||||
from ray.tune.config_parser import make_parser, resources_to_json
|
||||
from ray.tune.tune import _make_scheduler, run_experiments
|
||||
|
||||
|
||||
EXAMPLE_USAGE = """
|
||||
Training example via RLlib CLI:
|
||||
rllib train --run DQN --env CartPole-v0
|
||||
@@ -35,29 +34,41 @@ def create_parser(parser_creator=None):
|
||||
|
||||
# See also the base parser definition in ray/tune/config_parser.py
|
||||
parser.add_argument(
|
||||
"--redis-address", default=None, type=str,
|
||||
"--redis-address",
|
||||
default=None,
|
||||
type=str,
|
||||
help="The Redis address of the cluster.")
|
||||
parser.add_argument(
|
||||
"--ray-num-cpus", default=None, type=int,
|
||||
"--ray-num-cpus",
|
||||
default=None,
|
||||
type=int,
|
||||
help="--num-cpus to pass to Ray."
|
||||
" This only has an affect in local mode.")
|
||||
" This only has an affect in local mode.")
|
||||
parser.add_argument(
|
||||
"--ray-num-gpus", default=None, type=int,
|
||||
"--ray-num-gpus",
|
||||
default=None,
|
||||
type=int,
|
||||
help="--num-gpus to pass to Ray."
|
||||
" This only has an affect in local mode.")
|
||||
" This only has an affect in local mode.")
|
||||
parser.add_argument(
|
||||
"--experiment-name", default="default", type=str,
|
||||
"--experiment-name",
|
||||
default="default",
|
||||
type=str,
|
||||
help="Name of the subdirectory under `local_dir` to put results in.")
|
||||
parser.add_argument(
|
||||
"--env", default=None, type=str, help="The gym environment to use.")
|
||||
parser.add_argument(
|
||||
"--queue-trials", action='store_true',
|
||||
"--queue-trials",
|
||||
action='store_true',
|
||||
help=(
|
||||
"Whether to queue trials when the cluster does not currently have "
|
||||
"enough resources to launch one. This should be set to True when "
|
||||
"running on an autoscaling cluster to enable automatic scale-up."))
|
||||
parser.add_argument(
|
||||
"-f", "--config-file", default=None, type=str,
|
||||
"-f",
|
||||
"--config-file",
|
||||
default=None,
|
||||
type=str,
|
||||
help="If specified, use config options from this file. Note that this "
|
||||
"overrides any trial-specific options set via flags above.")
|
||||
return parser
|
||||
@@ -93,9 +104,11 @@ def run(args, parser):
|
||||
|
||||
ray.init(
|
||||
redis_address=args.redis_address,
|
||||
num_cpus=args.ray_num_cpus, num_gpus=args.ray_num_gpus)
|
||||
num_cpus=args.ray_num_cpus,
|
||||
num_gpus=args.ray_num_gpus)
|
||||
run_experiments(
|
||||
experiments, scheduler=_make_scheduler(args),
|
||||
experiments,
|
||||
scheduler=_make_scheduler(args),
|
||||
queue_trials=args.queue_trials)
|
||||
|
||||
|
||||
|
||||
@@ -6,10 +6,8 @@ import re
|
||||
import os
|
||||
import os.path as osp
|
||||
|
||||
|
||||
CONFIG_DIR = osp.join(osp.dirname(osp.abspath(__file__)), "regression_tests")
|
||||
|
||||
|
||||
TEMPLATE = """
|
||||
class Test{name}(Regression):
|
||||
_file = "{filename}"
|
||||
|
||||
@@ -15,7 +15,6 @@ import yaml
|
||||
import ray
|
||||
from ray import tune
|
||||
|
||||
|
||||
CONFIG_DIR = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
|
||||
|
||||
@@ -8,7 +8,6 @@ import yaml
|
||||
import ray
|
||||
from ray.tune import run_experiments
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
experiments = {}
|
||||
|
||||
@@ -29,5 +28,4 @@ if __name__ == '__main__':
|
||||
num_failures += 1
|
||||
|
||||
if num_failures:
|
||||
raise Exception(
|
||||
"{} trials did not converge".format(num_failures))
|
||||
raise Exception("{} trials did not converge".format(num_failures))
|
||||
|
||||
@@ -11,10 +11,9 @@ try:
|
||||
import lz4.frame
|
||||
LZ4_ENABLED = True
|
||||
except ImportError:
|
||||
print(
|
||||
"WARNING: lz4 not available, disabling sample compression. "
|
||||
"This will significantly impact RLlib performance. "
|
||||
"To install lz4, run `pip install lz4`.")
|
||||
print("WARNING: lz4 not available, disabling sample compression. "
|
||||
"This will significantly impact RLlib performance. "
|
||||
"To install lz4, run `pip install lz4`.")
|
||||
LZ4_ENABLED = False
|
||||
|
||||
|
||||
|
||||
@@ -59,7 +59,6 @@ class NoFilter(Filter):
|
||||
|
||||
# http://www.johndcook.com/blog/standard_deviation/
|
||||
class RunningStat(object):
|
||||
|
||||
def __init__(self, shape=None):
|
||||
self._n = 0
|
||||
self._M = np.zeros(shape)
|
||||
@@ -227,8 +226,8 @@ class MeanStdFilter(Filter):
|
||||
|
||||
def __repr__(self):
|
||||
return 'MeanStdFilter({}, {}, {}, {}, {}, {})'.format(
|
||||
self.shape, self.demean, self.destd,
|
||||
self.clip, self.rs, self.buffer)
|
||||
self.shape, self.demean, self.destd, self.clip, self.rs,
|
||||
self.buffer)
|
||||
|
||||
|
||||
class ConcurrentMeanStdFilter(MeanStdFilter):
|
||||
@@ -242,6 +241,7 @@ class ConcurrentMeanStdFilter(MeanStdFilter):
|
||||
def wrapper(*args, **kwargs):
|
||||
with self._lock:
|
||||
return func(*args, **kwargs)
|
||||
|
||||
return wrapper
|
||||
|
||||
self.__getattribute__ = lock_wrap(self.__getattribute__)
|
||||
@@ -260,8 +260,8 @@ class ConcurrentMeanStdFilter(MeanStdFilter):
|
||||
|
||||
def __repr__(self):
|
||||
return 'ConcurrentMeanStdFilter({}, {}, {}, {}, {}, {})'.format(
|
||||
self.shape, self.demean, self.destd,
|
||||
self.clip, self.rs, self.buffer)
|
||||
self.shape, self.demean, self.destd, self.clip, self.rs,
|
||||
self.buffer)
|
||||
|
||||
|
||||
def get_filter(filter_config, shape):
|
||||
@@ -273,5 +273,4 @@ def get_filter(filter_config, shape):
|
||||
elif filter_config == "NoFilter":
|
||||
return NoFilter()
|
||||
else:
|
||||
raise Exception("Unknown observation_filter: " +
|
||||
str(filter_config))
|
||||
raise Exception("Unknown observation_filter: " + str(filter_config))
|
||||
|
||||
@@ -75,14 +75,14 @@ def _make_handler(serving_env):
|
||||
response["action"] = serving_env.get_action(
|
||||
args["episode_id"], args["observation"])
|
||||
elif command == PolicyClient.LOG_ACTION:
|
||||
serving_env.log_action(
|
||||
args["episode_id"], args["observation"], args["action"])
|
||||
serving_env.log_action(args["episode_id"], args["observation"],
|
||||
args["action"])
|
||||
elif command == PolicyClient.LOG_RETURNS:
|
||||
serving_env.log_returns(
|
||||
args["episode_id"], args["reward"], args["info"])
|
||||
serving_env.log_returns(args["episode_id"], args["reward"],
|
||||
args["info"])
|
||||
elif command == PolicyClient.END_EPISODE:
|
||||
serving_env.end_episode(
|
||||
args["episode_id"], args["observation"])
|
||||
serving_env.end_episode(args["episode_id"],
|
||||
args["observation"])
|
||||
else:
|
||||
raise Exception("Unknown command: {}".format(command))
|
||||
return response
|
||||
|
||||
@@ -7,6 +7,7 @@ class Reshaper(object):
|
||||
This class keeps track of where in the flattened observation space
|
||||
we should be slicing and what the new shapes should be
|
||||
"""
|
||||
|
||||
def __init__(self, env_space):
|
||||
self.shapes = []
|
||||
self.slice_positions = []
|
||||
@@ -24,8 +25,8 @@ class Reshaper(object):
|
||||
if len(self.slice_positions) == 0:
|
||||
self.slice_positions.append(np.product(arr_shape))
|
||||
else:
|
||||
self.slice_positions.append(np.product(arr_shape) +
|
||||
self.slice_positions[-1])
|
||||
self.slice_positions.append(
|
||||
np.product(arr_shape) + self.slice_positions[-1])
|
||||
else:
|
||||
self.shapes.append(np.asarray(env_space.shape))
|
||||
self.slice_positions.append(np.product(env_space.shape))
|
||||
@@ -38,11 +39,11 @@ class Reshaper(object):
|
||||
def split_tensor(self, tensor, axis=-1):
|
||||
# FIXME (ev) This won't work for mixed action distributions like
|
||||
# one agent Gaussian one agent discrete
|
||||
slice_rescale = int(tensor.shape.as_list()[axis] /
|
||||
int(np.sum(self.get_slice_lengths())))
|
||||
return tf.split(tensor, slice_rescale*self.get_slice_lengths(),
|
||||
axis=axis)
|
||||
slice_rescale = int(tensor.shape.as_list()[axis] / int(
|
||||
np.sum(self.get_slice_lengths())))
|
||||
return tf.split(
|
||||
tensor, slice_rescale * self.get_slice_lengths(), axis=axis)
|
||||
|
||||
def split_number(self, number):
|
||||
slice_rescale = int(number / int(np.sum(self.get_slice_lengths())))
|
||||
return slice_rescale*self.get_slice_lengths()
|
||||
return slice_rescale * self.get_slice_lengths()
|
||||
|
||||
@@ -39,10 +39,10 @@ def linear_interpolation(l, r, alpha):
|
||||
|
||||
|
||||
class PiecewiseSchedule(object):
|
||||
def __init__(
|
||||
self, endpoints, interpolation=linear_interpolation,
|
||||
outside_value=None):
|
||||
|
||||
def __init__(self,
|
||||
endpoints,
|
||||
interpolation=linear_interpolation,
|
||||
outside_value=None):
|
||||
"""Piecewise schedule.
|
||||
|
||||
endpoints: [(int, int)]
|
||||
|
||||
@@ -64,18 +64,19 @@ def run_timeline(sess, ops, debug_name, feed_dict={}, timeline_dir=None):
|
||||
run_metadata = tf.RunMetadata()
|
||||
start = time.time()
|
||||
fetches = sess.run(
|
||||
ops, options=run_options, run_metadata=run_metadata,
|
||||
ops,
|
||||
options=run_options,
|
||||
run_metadata=run_metadata,
|
||||
feed_dict=feed_dict)
|
||||
trace = timeline.Timeline(step_stats=run_metadata.step_stats)
|
||||
global _count
|
||||
outf = os.path.join(
|
||||
timeline_dir,
|
||||
"timeline-{}-{}-{}.json".format(debug_name, os.getpid(), _count))
|
||||
timeline_dir, "timeline-{}-{}-{}.json".format(
|
||||
debug_name, os.getpid(), _count))
|
||||
_count += 1
|
||||
trace_file = open(outf, "w")
|
||||
print(
|
||||
"Wrote tf timeline ({} s) to {}".format(
|
||||
time.time() - start, os.path.abspath(outf)))
|
||||
print("Wrote tf timeline ({} s) to {}".format(time.time() - start,
|
||||
os.path.abspath(outf)))
|
||||
trace_file.write(trace.generate_chrome_trace_format())
|
||||
else:
|
||||
fetches = sess.run(ops, feed_dict=feed_dict)
|
||||
|
||||
@@ -22,8 +22,8 @@ class WindowStat(object):
|
||||
if not self.count:
|
||||
quantiles = []
|
||||
else:
|
||||
quantiles = np.percentile(
|
||||
self.items[:self.count], [0, 10, 50, 90, 100]).tolist()
|
||||
quantiles = np.percentile(self.items[:self.count],
|
||||
[0, 10, 50, 90, 100]).tolist()
|
||||
return {
|
||||
self.name + "_count": int(self.count),
|
||||
self.name + "_mean": float(np.mean(self.items[:self.count])),
|
||||
|
||||
Reference in New Issue
Block a user