[rllib] format with yapf (#2427)

* initial yapf

* manual fix yapf bugs
This commit is contained in:
Eric Liang
2018-07-19 15:30:36 -07:00
committed by GitHub
parent 24eb140e07
commit d01dc9e22d
86 changed files with 1276 additions and 978 deletions
+12 -5
View File
@@ -17,9 +17,10 @@ from ray.rllib.evaluation.sample_batch import SampleBatch
def _register_all():
for key in ["PPO", "ES", "DQN", "APEX", "A3C", "BC", "PG", "DDPG",
"APEX_DDPG", "__fake", "__sigmoid_fake_data",
"__parameter_tuning"]:
for key in [
"PPO", "ES", "DQN", "APEX", "A3C", "BC", "PG", "DDPG", "APEX_DDPG",
"__fake", "__sigmoid_fake_data", "__parameter_tuning"
]:
from ray.rllib.agents.agent import get_agent_class
register_trainable(key, get_agent_class(key))
@@ -27,6 +28,12 @@ def _register_all():
_register_all()
__all__ = [
"PolicyGraph", "TFPolicyGraph", "PolicyEvaluator", "SampleBatch",
"AsyncVectorEnv", "MultiAgentEnv", "VectorEnv", "ServingEnv",
"PolicyGraph",
"TFPolicyGraph",
"PolicyEvaluator",
"SampleBatch",
"AsyncVectorEnv",
"MultiAgentEnv",
"VectorEnv",
"ServingEnv",
]
+5 -5
View File
@@ -92,15 +92,15 @@ class A3CAgent(Agent):
self.remote_evaluators = self.make_remote_evaluators(
self.env_creator, policy_cls, self.config["num_workers"],
{"num_gpus": 1 if self.config["use_gpu_for_workers"] else 0})
self.optimizer = AsyncGradientsOptimizer(
self.local_evaluator, self.remote_evaluators,
self.config["optimizer"])
self.optimizer = AsyncGradientsOptimizer(self.local_evaluator,
self.remote_evaluators,
self.config["optimizer"])
def _train(self):
prev_steps = self.optimizer.num_steps_sampled
self.optimizer.step()
FilterManager.synchronize(
self.local_evaluator.filters, self.remote_evaluators)
FilterManager.synchronize(self.local_evaluator.filters,
self.remote_evaluators)
result = self.optimizer.collect_metrics()
result = result._replace(
timesteps_this_iter=self.optimizer.num_steps_sampled - prev_steps)
@@ -14,19 +14,23 @@ from ray.rllib.models.catalog import ModelCatalog
class A3CLoss(object):
def __init__(
self, action_dist, actions, advantages, v_target, vf,
vf_loss_coeff=0.5, entropy_coeff=-0.01):
def __init__(self,
action_dist,
actions,
advantages,
v_target,
vf,
vf_loss_coeff=0.5,
entropy_coeff=-0.01):
log_prob = action_dist.logp(actions)
# The "policy gradients" loss
self.pi_loss = - tf.reduce_sum(log_prob * advantages)
self.pi_loss = -tf.reduce_sum(log_prob * advantages)
delta = vf - v_target
self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta))
self.entropy = tf.reduce_sum(action_dist.entropy())
self.total_loss = (self.pi_loss +
self.vf_loss * vf_loss_coeff +
self.total_loss = (self.pi_loss + self.vf_loss * vf_loss_coeff +
self.entropy * entropy_coeff)
@@ -41,8 +45,8 @@ class A3CPolicyGraph(TFPolicyGraph):
tf.float32, [None] + list(observation_space.shape))
dist_class, logit_dim = ModelCatalog.get_action_dist(
action_space, self.config["model"])
self.model = ModelCatalog.get_model(
self.observations, logit_dim, self.config["model"])
self.model = ModelCatalog.get_model(self.observations, logit_dim,
self.config["model"])
action_dist = dist_class(self.model.outputs)
self.vf = tf.reshape(
linear(self.model.last_layer, 1, "value", normc_initializer(1.0)),
@@ -62,9 +66,9 @@ class A3CPolicyGraph(TFPolicyGraph):
action_space))
advantages = tf.placeholder(tf.float32, [None], name="advantages")
v_target = tf.placeholder(tf.float32, [None], name="v_target")
self.loss = A3CLoss(
action_dist, actions, advantages, v_target, self.vf,
self.config["vf_loss_coeff"], self.config["entropy_coeff"])
self.loss = A3CLoss(action_dist, actions, advantages, v_target,
self.vf, self.config["vf_loss_coeff"],
self.config["entropy_coeff"])
# Initialize TFPolicyGraph
loss_in = [
@@ -76,10 +80,16 @@ class A3CPolicyGraph(TFPolicyGraph):
self.state_in = self.model.state_in
self.state_out = self.model.state_out
TFPolicyGraph.__init__(
self, observation_space, action_space, self.sess,
obs_input=self.observations, action_sampler=action_dist.sample(),
loss=self.loss.total_loss, loss_inputs=loss_in,
state_inputs=self.state_in, state_outputs=self.state_out,
self,
observation_space,
action_space,
self.sess,
obs_input=self.observations,
action_sampler=action_dist.sample(),
loss=self.loss.total_loss,
loss_inputs=loss_in,
state_inputs=self.state_in,
state_outputs=self.state_out,
seq_lens=self.model.seq_lens,
max_seq_len=self.config["model"]["max_seq_len"])
@@ -132,5 +142,5 @@ class A3CPolicyGraph(TFPolicyGraph):
for i in range(len(self.state_in)):
next_state.append([sample_batch["state_out_{}".format(i)][-1]])
last_r = self.value(sample_batch["new_obs"][-1], *next_state)
return compute_advantages(
sample_batch, last_r, self.config["gamma"], self.config["lambda"])
return compute_advantages(sample_batch, last_r, self.config["gamma"],
self.config["lambda"])
@@ -46,20 +46,21 @@ class A3CTorchPolicyGraph(TorchPolicyGraph):
action_space, self.config["model"])
self.model = ModelCatalog.get_torch_model(
obs_space.shape, self.logit_dim, self.config["model"])
loss = A3CLoss(
self.model, self.config["vf_loss_coeff"],
self.config["entropy_coeff"])
loss = A3CLoss(self.model, self.config["vf_loss_coeff"],
self.config["entropy_coeff"])
TorchPolicyGraph.__init__(
self, obs_space, action_space, self.model, loss,
loss_inputs=[
"obs", "actions", "advantages", "value_targets"])
self,
obs_space,
action_space,
self.model,
loss,
loss_inputs=["obs", "actions", "advantages", "value_targets"])
def extra_action_out(self, model_out):
return {"vf_preds": var_to_np(model_out[1])}
def optimizer(self):
return torch.optim.Adam(
self.model.parameters(), lr=self.config["lr"])
return torch.optim.Adam(self.model.parameters(), lr=self.config["lr"])
def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
completed = sample_batch["dones"][-1]
@@ -67,8 +68,8 @@ class A3CTorchPolicyGraph(TorchPolicyGraph):
last_r = 0.0
else:
last_r = self._value(sample_batch["new_obs"][-1])
return compute_advantages(
sample_batch, last_r, self.config["gamma"], self.config["lambda"])
return compute_advantages(sample_batch, last_r, self.config["gamma"],
self.config["lambda"])
def _value(self, obs):
with self.lock:
+33 -28
View File
@@ -47,7 +47,9 @@ COMMON_CONFIG = {
"allow_growth": True,
},
"log_device_placement": False,
"device_count": {"CPU": 1},
"device_count": {
"CPU": 1
},
"allow_soft_placement": True, # required by PPO multi-gpu
},
# Whether to LZ4 compress observations
@@ -86,8 +88,7 @@ def _deep_update(original, new_dict, new_keys_allowed, whitelist):
for k, value in new_dict.items():
if k not in original and k != "env":
if not new_keys_allowed:
raise Exception(
"Unknown config parameter `{}` ".format(k))
raise Exception("Unknown config parameter `{}` ".format(k))
if type(original.get(k)) is dict:
if k in whitelist:
_deep_update(original[k], value, True, [])
@@ -112,22 +113,24 @@ class Agent(Trainable):
_allow_unknown_configs = False
_allow_unknown_subkeys = [
"tf_session_args", "env_config", "model", "optimizer", "multiagent"]
"tf_session_args", "env_config", "model", "optimizer", "multiagent"
]
def make_local_evaluator(self, env_creator, policy_graph):
"""Convenience method to return configured local evaluator."""
return self._make_evaluator(
PolicyEvaluator, env_creator, policy_graph, 0)
return self._make_evaluator(PolicyEvaluator, env_creator, policy_graph,
0)
def make_remote_evaluators(
self, env_creator, policy_graph, count, remote_args):
def make_remote_evaluators(self, env_creator, policy_graph, count,
remote_args):
"""Convenience method to return a number of remote evaluators."""
cls = PolicyEvaluator.as_remote(**remote_args).remote
return [
self._make_evaluator(cls, env_creator, policy_graph, i+1)
for i in range(count)]
self._make_evaluator(cls, env_creator, policy_graph, i + 1)
for i in range(count)
]
def _make_evaluator(self, cls, env_creator, policy_graph, worker_index):
config = self.config
@@ -140,8 +143,8 @@ class Agent(Trainable):
env_creator,
self.config["multiagent"]["policy_graphs"] or policy_graph,
policy_mapping_fn=self.config["multiagent"]["policy_mapping_fn"],
tf_session_creator=(
session_creator if config["tf_session_args"] else None),
tf_session_creator=(session_creator
if config["tf_session_args"] else None),
batch_steps=config["sample_batch_size"],
batch_mode=config["batch_mode"],
episode_horizon=config["horizon"],
@@ -157,14 +160,12 @@ class Agent(Trainable):
@classmethod
def resource_help(cls, config):
return (
"\n\nYou can adjust the resource requests of RLlib agents by "
"setting `num_workers` and other configs. See the "
"DEFAULT_CONFIG defined by each agent for more info.\n\n"
"The config of this agent is: " + json.dumps(config))
return ("\n\nYou can adjust the resource requests of RLlib agents by "
"setting `num_workers` and other configs. See the "
"DEFAULT_CONFIG defined by each agent for more info.\n\n"
"The config of this agent is: " + json.dumps(config))
def __init__(
self, config=None, env=None, logger_creator=None):
def __init__(self, config=None, env=None, logger_creator=None):
"""Initialize an RLLib agent.
Args:
@@ -235,8 +236,8 @@ class Agent(Trainable):
obs = self.local_evaluator.filters["default"](
observation, update=False)
return self.local_evaluator.for_policy(
lambda p: p.compute_single_action(
obs, state, is_training=False)[0])
lambda p: p.compute_single_action(obs, state, is_training=False)[0]
)
class _MockAgent(Agent):
@@ -257,8 +258,10 @@ class _MockAgent(Agent):
and (self.config["persistent_error"] or not self.restored):
raise Exception("mock error")
return TrainingResult(
episode_reward_mean=10, episode_len_mean=10,
timesteps_this_iter=10, info={})
episode_reward_mean=10,
episode_len_mean=10,
timesteps_this_iter=10,
info={})
def _save(self, checkpoint_dir):
path = os.path.join(checkpoint_dir, "mock_agent.pkl")
@@ -299,9 +302,11 @@ class _SigmoidFakeData(_MockAgent):
v = np.tanh(float(i) / self.config["width"])
v *= self.config["height"]
return TrainingResult(
episode_reward_mean=v, episode_len_mean=v,
episode_reward_mean=v,
episode_len_mean=v,
timesteps_this_iter=self.config["iter_timesteps"],
time_this_iter_s=self.config["iter_time"], info={})
time_this_iter_s=self.config["iter_time"],
info={})
class _ParameterTuningAgent(_MockAgent):
@@ -320,7 +325,8 @@ class _ParameterTuningAgent(_MockAgent):
episode_reward_mean=self.config["reward_amt"] * self.iteration,
episode_len_mean=self.config["reward_amt"],
timesteps_this_iter=self.config["iter_timesteps"],
time_this_iter_s=self.config["iter_time"], info={})
time_this_iter_s=self.config["iter_time"],
info={})
def get_agent_class(alg):
@@ -363,5 +369,4 @@ def get_agent_class(alg):
elif alg == "__parameter_tuning":
return _ParameterTuningAgent
else:
raise Exception(
("Unknown algorithm {}.").format(alg))
raise Exception(("Unknown algorithm {}.").format(alg))
+12 -9
View File
@@ -57,28 +57,31 @@ class BCAgent(Agent):
else:
num_gpus_per_worker = 0
return Resources(
cpu=1, gpu=cf["gpu"] and 1 or 0,
cpu=1,
gpu=cf["gpu"] and 1 or 0,
extra_cpu=cf["num_workers"],
extra_gpu=num_gpus_per_worker * cf["num_workers"])
def _init(self):
self.local_evaluator = BCEvaluator(
self.env_creator, self.config, self.logdir)
self.local_evaluator = BCEvaluator(self.env_creator, self.config,
self.logdir)
if self.config["use_gpu_for_workers"]:
remote_cls = GPURemoteBCEvaluator
else:
remote_cls = RemoteBCEvaluator
self.remote_evaluators = [
remote_cls.remote(self.env_creator, self.config, self.logdir)
for _ in range(self.config["num_workers"])]
self.optimizer = AsyncGradientsOptimizer(
self.local_evaluator, self.remote_evaluators,
self.config["optimizer"])
for _ in range(self.config["num_workers"])
]
self.optimizer = AsyncGradientsOptimizer(self.local_evaluator,
self.remote_evaluators,
self.config["optimizer"])
def _train(self):
self.optimizer.step()
metric_lists = [re.get_metrics.remote() for re in
self.remote_evaluators]
metric_lists = [
re.get_metrics.remote() for re in self.remote_evaluators
]
total_samples = 0
total_loss = 0
for metrics in metric_lists:
+7 -6
View File
@@ -14,8 +14,8 @@ from ray.rllib.models import ModelCatalog
class BCEvaluator(EvaluatorInterface):
def __init__(self, env_creator, config, logdir):
env = ModelCatalog.get_preprocessor_as_wrapper(env_creator(
config["env_config"]), config["model"])
env = ModelCatalog.get_preprocessor_as_wrapper(
env_creator(config["env_config"]), config["model"])
self.dataset = ExperienceDataset(config["dataset_path"])
self.policy = BCPolicy(env.observation_space, env.action_space, config)
self.config = config
@@ -27,8 +27,10 @@ class BCEvaluator(EvaluatorInterface):
def compute_gradients(self, samples):
gradient, info = self.policy.compute_gradients(samples)
self.metrics_queue.put(
{"num_samples": info["num_samples"], "loss": info["loss"]})
self.metrics_queue.put({
"num_samples": info["num_samples"],
"loss": info["loss"]
})
return gradient, {}
def apply_gradients(self, grads):
@@ -42,8 +44,7 @@ class BCEvaluator(EvaluatorInterface):
def save(self):
weights = self.get_weights()
return pickle.dumps({
"weights": weights})
return pickle.dumps({"weights": weights})
def restore(self, objs):
objs = pickle.loads(objs)
@@ -21,8 +21,9 @@ class ExperienceDataset(object):
elements.
The file must be available on each machine used by a BCEvaluator.
"""
self._dataset = list(itertools.chain.from_iterable(
pickle.load(open(dataset_path, "rb"))))
self._dataset = list(
itertools.chain.from_iterable(
pickle.load(open(dataset_path, "rb"))))
def sample(self, batch_size):
indexes = np.random.choice(len(self._dataset), batch_size)
+19 -18
View File
@@ -23,8 +23,8 @@ class BCPolicy(object):
self.x = tf.placeholder(tf.float32, [None] + list(obs_space.shape))
dist_class, self.logit_dim = ModelCatalog.get_action_dist(
ac_space, self.config["model"])
self._model = ModelCatalog.get_model(
self.x, self.logit_dim, self.config["model"])
self._model = ModelCatalog.get_model(self.x, self.logit_dim,
self.config["model"])
self.logits = self._model.outputs
self.curr_dist = dist_class(self.logits)
self.sample = self.curr_dist.sample()
@@ -33,17 +33,16 @@ class BCPolicy(object):
def setup_loss(self, action_space):
if isinstance(action_space, gym.spaces.Box):
self.ac = tf.placeholder(tf.float32,
[None] + list(action_space.shape),
name="ac")
self.ac = tf.placeholder(
tf.float32, [None] + list(action_space.shape), name="ac")
elif isinstance(action_space, gym.spaces.Discrete):
self.ac = tf.placeholder(tf.int64, [None], name="ac")
else:
raise NotImplementedError(
"action space" + str(type(action_space)) +
"currently not supported")
raise NotImplementedError("action space" +
str(type(action_space)) +
"currently not supported")
log_prob = self.curr_dist.logp(self.ac)
self.pi_loss = - tf.reduce_sum(log_prob)
self.pi_loss = -tf.reduce_sum(log_prob)
self.loss = self.pi_loss
def setup_gradients(self):
@@ -62,11 +61,14 @@ class BCPolicy(object):
self.summary_op = tf.summary.merge_all()
# TODO(rliaw): Can consider exposing these parameters
self.sess = tf.Session(graph=self.g, config=tf.ConfigProto(
intra_op_parallelism_threads=1, inter_op_parallelism_threads=2,
gpu_options=tf.GPUOptions(allow_growth=True)))
self.variables = ray.experimental.TensorFlowVariables(self.loss,
self.sess)
self.sess = tf.Session(
graph=self.g,
config=tf.ConfigProto(
intra_op_parallelism_threads=1,
inter_op_parallelism_threads=2,
gpu_options=tf.GPUOptions(allow_growth=True)))
self.variables = ray.experimental.TensorFlowVariables(
self.loss, self.sess)
self.sess.run(tf.global_variables_initializer())
def compute_gradients(self, samples):
@@ -82,15 +84,14 @@ class BCPolicy(object):
[self.loss, self.grads, self.summary_op], feed_dict=feed_dict)
info["summary"] = summ
else:
loss, grad = self.sess.run([self.loss, self.grads],
feed_dict=feed_dict)
loss, grad = self.sess.run(
[self.loss, self.grads], feed_dict=feed_dict)
info["num_samples"] = len(samples)
info["loss"] = loss
return grad, info
def apply_gradients(self, grads):
feed_dict = {self.grads[i]: grads[i]
for i in range(len(grads))}
feed_dict = {self.grads[i]: grads[i] for i in range(len(grads))}
self.sess.run(self._apply_gradients, feed_dict=feed_dict)
def get_weights(self):
+6 -7
View File
@@ -9,13 +9,12 @@ APEX_DDPG_DEFAULT_CONFIG = merge_dicts(
DDPG_CONFIG,
{
"optimizer_class": "AsyncSamplesOptimizer",
"optimizer":
merge_dicts(
DDPG_CONFIG["optimizer"], {
"max_weight_sync_delay": 400,
"num_replay_buffer_shards": 4,
"debug": False
}),
"optimizer": merge_dicts(
DDPG_CONFIG["optimizer"], {
"max_weight_sync_delay": 400,
"num_replay_buffer_shards": 4,
"debug": False
}),
"n_step": 3,
"num_workers": 32,
"buffer_size": 2000000,
+3 -3
View File
@@ -118,9 +118,9 @@ class DDPGAgent(DQNAgent):
if self.config["per_worker_exploration"]:
assert self.config["num_workers"] > 1, \
"This requires multiple workers"
return ConstantSchedule(
self.config["noise_scale"] * 0.4 **
(1 + worker_index / float(self.config["num_workers"] - 1) * 7))
exponent = (
1 + worker_index / float(self.config["num_workers"] - 1) * 7)
return ConstantSchedule(self.config["noise_scale"] * 0.4**exponent)
else:
return LinearSchedule(
schedule_timesteps=int(self.config["exploration_fraction"] *
@@ -14,7 +14,6 @@ from ray.rllib.models import ModelCatalog
from ray.rllib.utils.error import UnsupportedSpaceException
from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph
A_SCOPE = "a_func"
P_SCOPE = "p_func"
P_TARGET_SCOPE = "target_p_func"
@@ -26,8 +25,8 @@ class PNetwork(object):
"""Maps an observations (i.e., state) to an action where each entry takes
value from (0, 1) due to the sigmoid function."""
def __init__(
self, model, dim_actions, hiddens=[64, 64], activation="relu"):
def __init__(self, model, dim_actions, hiddens=[64, 64],
activation="relu"):
action_out = model.last_layer
activation = tf.nn.__dict__[activation]
for hidden in hiddens:
@@ -44,9 +43,14 @@ class ActionNetwork(object):
for training, thus ignoring the batch_size issue when constructing a
stochastic action."""
def __init__(
self, p_values, low_action, high_action, stochastic, eps,
theta=0.15, sigma=0.2):
def __init__(self,
p_values,
low_action,
high_action,
stochastic,
eps,
theta=0.15,
sigma=0.2):
# shape is [None, dim_action]
deterministic_actions = (
@@ -65,15 +69,16 @@ class ActionNetwork(object):
stochastic_actions = deterministic_actions + eps * (
high_action - low_action) * exploration_value
self.actions = tf.cond(
stochastic, lambda: stochastic_actions,
lambda: deterministic_actions)
self.actions = tf.cond(stochastic, lambda: stochastic_actions,
lambda: deterministic_actions)
class QNetwork(object):
def __init__(
self, model, action_inputs,
hiddens=[64, 64], activation="relu"):
def __init__(self,
model,
action_inputs,
hiddens=[64, 64],
activation="relu"):
q_out = tf.concat([model.last_layer, action_inputs], axis=1)
activation = tf.nn.__dict__[activation]
for hidden in hiddens:
@@ -84,14 +89,21 @@ class QNetwork(object):
class ActorCriticLoss(object):
def __init__(
self, q_t, q_tp1, q_tp0, importance_weights, rewards, done_mask,
gamma=0.99, n_step=1, use_huber=False, huber_threshold=1.0):
def __init__(self,
q_t,
q_tp1,
q_tp0,
importance_weights,
rewards,
done_mask,
gamma=0.99,
n_step=1,
use_huber=False,
huber_threshold=1.0):
q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1)
q_tp1_best = tf.squeeze(
input=q_tp1, axis=len(q_tp1.shape) - 1)
q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1)
q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best
# compute RHS of bellman equation
@@ -131,27 +143,20 @@ class DDPGPolicyGraph(TFPolicyGraph):
def _build_q_network(obs, actions):
return QNetwork(
ModelCatalog.get_model(obs, 1, config["model"]),
actions,
ModelCatalog.get_model(obs, 1, config["model"]), actions,
config["critic_hiddens"],
config["critic_hidden_activation"]).value
def _build_p_network(obs):
return PNetwork(
ModelCatalog.get_model(obs, 1, config["model"]),
dim_actions,
ModelCatalog.get_model(obs, 1, config["model"]), dim_actions,
config["actor_hiddens"],
config["actor_hidden_activation"]).action_scores
def _build_action_network(p_values, stochastic, eps):
return ActionNetwork(
p_values,
low_action,
high_action,
stochastic,
eps,
config["exploration_theta"],
config["exploration_sigma"]).actions
return ActionNetwork(p_values, low_action, high_action, stochastic,
eps, config["exploration_theta"],
config["exploration_sigma"]).actions
# Action inputs
self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
@@ -263,9 +268,13 @@ class DDPGPolicyGraph(TFPolicyGraph):
("weights", self.importance_weights),
]
TFPolicyGraph.__init__(
self, observation_space, action_space, self.sess,
self,
observation_space,
action_space,
self.sess,
obs_input=self.cur_observations,
action_sampler=self.output_actions, loss=self.loss.total_loss,
action_sampler=self.output_actions,
loss=self.loss.total_loss,
loss_inputs=self.loss_inputs)
self.sess.run(tf.global_variables_initializer())
@@ -294,10 +303,10 @@ class DDPGPolicyGraph(TFPolicyGraph):
self.loss.actor_loss, var_list=self.p_func_vars)
critic_grads_and_vars = self.critic_optimizer.compute_gradients(
self.loss.critic_loss, var_list=self.q_func_vars)
actor_grads_and_vars = [
(g, v) for (g, v) in actor_grads_and_vars if g is not None]
critic_grads_and_vars = [
(g, v) for (g, v) in critic_grads_and_vars if g is not None]
actor_grads_and_vars = [(g, v) for (g, v) in actor_grads_and_vars
if g is not None]
critic_grads_and_vars = [(g, v) for (g, v) in critic_grads_and_vars
if g is not None]
grads_and_vars = actor_grads_and_vars + critic_grads_and_vars
return grads_and_vars
+6 -7
View File
@@ -10,13 +10,12 @@ APEX_DEFAULT_CONFIG = merge_dicts(
DQN_CONFIG,
{
"optimizer_class": "AsyncSamplesOptimizer",
"optimizer":
merge_dicts(
DQN_CONFIG["optimizer"], {
"max_weight_sync_delay": 400,
"num_replay_buffer_shards": 4,
"debug": False
}),
"optimizer": merge_dicts(
DQN_CONFIG["optimizer"], {
"max_weight_sync_delay": 400,
"num_replay_buffer_shards": 4,
"debug": False
}),
"n_step": 3,
"gpu": True,
"num_workers": 32,
+25 -23
View File
@@ -13,11 +13,11 @@ from ray.rllib.evaluation.metrics import collect_metrics
from ray.rllib.utils.schedules import ConstantSchedule, LinearSchedule
from ray.tune.trial import Resources
OPTIMIZER_SHARED_CONFIGS = [
"buffer_size", "prioritized_replay", "prioritized_replay_alpha",
"prioritized_replay_beta", "prioritized_replay_eps", "sample_batch_size",
"train_batch_size", "learning_starts", "clip_rewards"]
"train_batch_size", "learning_starts", "clip_rewards"
]
DEFAULT_CONFIG = with_common_config({
# === Model ===
@@ -110,7 +110,8 @@ class DQNAgent(Agent):
def default_resource_request(cls, config):
cf = dict(cls._default_config, **config)
return Resources(
cpu=1, gpu=cf["gpu"] and 1 or 0,
cpu=1,
gpu=cf["gpu"] and 1 or 0,
extra_cpu=cf["num_cpus_per_worker"] * cf["num_workers"],
extra_gpu=cf["num_gpus_per_worker"] * cf["num_workers"])
@@ -123,7 +124,8 @@ class DQNAgent(Agent):
self.exploration0 = self._make_exploration_schedule(0)
self.explorations = [
self._make_exploration_schedule(i)
for i in range(self.config["num_workers"])]
for i in range(self.config["num_workers"])
]
for k in OPTIMIZER_SHARED_CONFIGS:
if k not in self.config["optimizer"]:
@@ -132,9 +134,10 @@ class DQNAgent(Agent):
self.local_evaluator = self.make_local_evaluator(
self.env_creator, self._policy_graph)
self.remote_evaluators = self.make_remote_evaluators(
self.env_creator, self._policy_graph, self.config["num_workers"],
{"num_cpus": self.config["num_cpus_per_worker"],
"num_gpus": self.config["num_gpus_per_worker"]})
self.env_creator, self._policy_graph, self.config["num_workers"], {
"num_cpus": self.config["num_cpus_per_worker"],
"num_gpus": self.config["num_gpus_per_worker"]
})
self.optimizer = getattr(optimizers, self.config["optimizer_class"])(
self.local_evaluator, self.remote_evaluators,
self.config["optimizer"])
@@ -147,14 +150,12 @@ class DQNAgent(Agent):
if self.config["per_worker_exploration"]:
assert self.config["num_workers"] > 1, \
"This requires multiple workers"
return ConstantSchedule(
0.4 ** (
1 + worker_index / float(
self.config["num_workers"] - 1) * 7))
exponent = (
1 + worker_index / float(self.config["num_workers"] - 1) * 7)
return ConstantSchedule(0.4**exponent)
return LinearSchedule(
schedule_timesteps=int(
self.config["exploration_fraction"] *
self.config["schedule_max_timesteps"]),
schedule_timesteps=int(self.config["exploration_fraction"] *
self.config["schedule_max_timesteps"]),
initial_p=1.0,
final_p=self.config["exploration_final_eps"])
@@ -191,8 +192,8 @@ class DQNAgent(Agent):
self.local_evaluator,
self.remote_evaluators[-len(self.remote_evaluators) // 3:])
else:
result = collect_metrics(
self.local_evaluator, self.remote_evaluators)
result = collect_metrics(self.local_evaluator,
self.remote_evaluators)
return result._replace(
timesteps_this_iter=self.global_timestep - start_timestep,
@@ -208,14 +209,14 @@ class DQNAgent(Agent):
ev.__ray_terminate__.remote()
def _save(self, checkpoint_dir):
checkpoint_path = os.path.join(
checkpoint_dir, "checkpoint-{}".format(self.iteration))
checkpoint_path = os.path.join(checkpoint_dir,
"checkpoint-{}".format(self.iteration))
extra_data = [
self.local_evaluator.save(),
ray.get([e.save.remote() for e in self.remote_evaluators]),
self.optimizer.save(),
self.num_target_updates,
self.last_target_update_ts]
self.optimizer.save(), self.num_target_updates,
self.last_target_update_ts
]
pickle.dump(extra_data, open(checkpoint_path + ".extra_data", "wb"))
return checkpoint_path
@@ -223,8 +224,9 @@ class DQNAgent(Agent):
extra_data = pickle.load(open(checkpoint_path + ".extra_data", "rb"))
self.local_evaluator.restore(extra_data[0])
ray.get([
e.restore.remote(d) for (d, e)
in zip(extra_data[1], self.remote_evaluators)])
e.restore.remote(d)
for (d, e) in zip(extra_data[1], self.remote_evaluators)
])
self.optimizer.restore(extra_data[2])
self.num_target_updates = extra_data[3]
self.last_target_update_ts = extra_data[4]
+62 -50
View File
@@ -13,7 +13,6 @@ from ray.rllib.evaluation.sample_batch import SampleBatch
from ray.rllib.utils.error import UnsupportedSpaceException
from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph
Q_SCOPE = "q_func"
Q_TARGET_SCOPE = "target_q_func"
@@ -33,7 +32,8 @@ class QNetwork(object):
state_out = model.last_layer
for hidden in hiddens:
state_out = layers.fully_connected(
state_out, num_outputs=hidden,
state_out,
num_outputs=hidden,
activation_fn=tf.nn.relu)
state_score = layers.fully_connected(
state_out, num_outputs=1, activation_fn=None)
@@ -50,26 +50,32 @@ class QValuePolicy(object):
deterministic_actions = tf.argmax(q_values, axis=1)
batch_size = tf.shape(observations)[0]
random_actions = tf.random_uniform(
tf.stack([batch_size]), minval=0, maxval=num_actions,
tf.stack([batch_size]),
minval=0,
maxval=num_actions,
dtype=tf.int64)
chose_random = tf.random_uniform(
tf.stack([batch_size]), minval=0, maxval=1, dtype=tf.float32) < eps
stochastic_actions = tf.where(
chose_random, random_actions, deterministic_actions)
self.action = tf.cond(
stochastic, lambda: stochastic_actions,
lambda: deterministic_actions)
stochastic_actions = tf.where(chose_random, random_actions,
deterministic_actions)
self.action = tf.cond(stochastic, lambda: stochastic_actions,
lambda: deterministic_actions)
class QLoss(object):
def __init__(
self, q_t_selected, q_tp1_best, importance_weights, rewards,
done_mask, gamma=0.99, n_step=1):
def __init__(self,
q_t_selected,
q_tp1_best,
importance_weights,
rewards,
done_mask,
gamma=0.99,
n_step=1):
q_tp1_best_masked = (1.0 - done_mask) * q_tp1_best
# compute RHS of bellman equation
q_t_selected_target = rewards + gamma ** n_step * q_tp1_best_masked
q_t_selected_target = rewards + gamma**n_step * q_tp1_best_masked
# compute the error (potentially clipped)
self.td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
@@ -91,14 +97,14 @@ class DQNPolicyGraph(TFPolicyGraph):
def _build_q_network(obs):
return QNetwork(
ModelCatalog.get_model(obs, 1, config["model"]),
num_actions, config["dueling"], config["hiddens"]).value
ModelCatalog.get_model(obs, 1, config["model"]), num_actions,
config["dueling"], config["hiddens"]).value
# Action inputs
self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
self.eps = tf.placeholder(tf.float32, (), name="eps")
self.cur_observations = tf.placeholder(
tf.float32, shape=(None,) + observation_space.shape)
tf.float32, shape=(None, ) + observation_space.shape)
# Action Q network
with tf.variable_scope(Q_SCOPE) as scope:
@@ -106,20 +112,17 @@ class DQNPolicyGraph(TFPolicyGraph):
self.q_func_vars = _scope_vars(scope.name)
# Action outputs
self.output_actions = QValuePolicy(
q_values,
self.cur_observations,
num_actions,
self.stochastic,
self.eps).action
self.output_actions = QValuePolicy(q_values, self.cur_observations,
num_actions, self.stochastic,
self.eps).action
# Replay inputs
self.obs_t = tf.placeholder(
tf.float32, shape=(None,) + observation_space.shape)
tf.float32, shape=(None, ) + observation_space.shape)
self.act_t = tf.placeholder(tf.int32, [None], name="action")
self.rew_t = tf.placeholder(tf.float32, [None], name="reward")
self.obs_tp1 = tf.placeholder(
tf.float32, shape=(None,) + observation_space.shape)
tf.float32, shape=(None, ) + observation_space.shape)
self.done_mask = tf.placeholder(tf.float32, [None], name="done")
self.importance_weights = tf.placeholder(
tf.float32, [None], name="weight")
@@ -134,8 +137,8 @@ class DQNPolicyGraph(TFPolicyGraph):
self.target_q_func_vars = _scope_vars(scope.name)
# q scores for actions which we know were selected in the given state.
q_t_selected = tf.reduce_sum(
q_t * tf.one_hot(self.act_t, num_actions), 1)
q_t_selected = tf.reduce_sum(q_t * tf.one_hot(self.act_t, num_actions),
1)
# compute estimate of best possible value starting from state at t + 1
if config["double_q"]:
@@ -143,20 +146,20 @@ class DQNPolicyGraph(TFPolicyGraph):
q_tp1_using_online_net = _build_q_network(self.obs_tp1)
q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
q_tp1_best = tf.reduce_sum(
q_tp1 * tf.one_hot(
q_tp1_best_using_online_net, num_actions), 1)
q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions),
1)
else:
q_tp1_best = tf.reduce_max(q_tp1, 1)
self.loss = QLoss(
q_t_selected, q_tp1_best, self.importance_weights,
self.rew_t, self.done_mask, config["gamma"], config["n_step"])
self.loss = QLoss(q_t_selected, q_tp1_best, self.importance_weights,
self.rew_t, self.done_mask, config["gamma"],
config["n_step"])
# update_target_fn will be called periodically to copy Q network to
# target Q network
update_target_expr = []
for var, var_target in zip(
sorted(self.q_func_vars, key=lambda v: v.name),
sorted(self.q_func_vars, key=lambda v: v.name),
sorted(self.target_q_func_vars, key=lambda v: v.name)):
update_target_expr.append(var_target.assign(var))
self.update_target_expr = tf.group(*update_target_expr)
@@ -172,9 +175,13 @@ class DQNPolicyGraph(TFPolicyGraph):
("weights", self.importance_weights),
]
TFPolicyGraph.__init__(
self, observation_space, action_space, self.sess,
self,
observation_space,
action_space,
self.sess,
obs_input=self.cur_observations,
action_sampler=self.output_actions, loss=self.loss.loss,
action_sampler=self.output_actions,
loss=self.loss.loss,
loss_inputs=self.loss_inputs)
self.sess.run(tf.global_variables_initializer())
@@ -184,13 +191,14 @@ class DQNPolicyGraph(TFPolicyGraph):
def gradients(self, optimizer):
if self.config["grad_norm_clipping"] is not None:
grads_and_vars = _minimize_and_clip(
optimizer, self.loss.loss, var_list=self.q_func_vars,
optimizer,
self.loss.loss,
var_list=self.q_func_vars,
clip_val=self.config["grad_norm_clipping"])
else:
grads_and_vars = optimizer.compute_gradients(
self.loss.loss, var_list=self.q_func_vars)
grads_and_vars = [
(g, v) for (g, v) in grads_and_vars if g is not None]
grads_and_vars = [(g, v) for (g, v) in grads_and_vars if g is not None]
return grads_and_vars
def extra_compute_action_feed_dict(self):
@@ -207,8 +215,8 @@ class DQNPolicyGraph(TFPolicyGraph):
def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
return _postprocess_dqn(self, sample_batch)
def compute_td_error(
self, obs_t, act_t, rew_t, obs_tp1, done_mask, importance_weights):
def compute_td_error(self, obs_t, act_t, rew_t, obs_tp1, done_mask,
importance_weights):
td_err = self.sess.run(
self.loss.td_error,
feed_dict={
@@ -254,7 +262,7 @@ def adjust_nstep(n_step, gamma, obs, actions, rewards, new_obs, dones):
continue # episode end
for j in range(1, n_step):
new_obs[i] = new_obs[i + j]
rewards[i] += gamma ** j * rewards[i + j]
rewards[i] += gamma**j * rewards[i + j]
if dones[i + j]:
break # episode end
# truncate ends of the trajectory
@@ -266,24 +274,29 @@ def adjust_nstep(n_step, gamma, obs, actions, rewards, new_obs, dones):
def _postprocess_dqn(policy_graph, sample_batch):
obs, actions, rewards, new_obs, dones = [
list(x) for x in sample_batch.columns(
["obs", "actions", "rewards", "new_obs", "dones"])]
["obs", "actions", "rewards", "new_obs", "dones"])
]
# N-step Q adjustments
if policy_graph.config["n_step"] > 1:
adjust_nstep(
policy_graph.config["n_step"], policy_graph.config["gamma"],
obs, actions, rewards, new_obs, dones)
adjust_nstep(policy_graph.config["n_step"],
policy_graph.config["gamma"], obs, actions, rewards,
new_obs, dones)
batch = SampleBatch({
"obs": obs, "actions": actions, "rewards": rewards,
"new_obs": new_obs, "dones": dones,
"weights": np.ones_like(rewards)})
"obs": obs,
"actions": actions,
"rewards": rewards,
"new_obs": new_obs,
"dones": dones,
"weights": np.ones_like(rewards)
})
# Prioritize on the worker side
if batch.count > 0 and policy_graph.config["worker_side_prioritization"]:
td_errors = policy_graph.compute_td_error(
batch["obs"], batch["actions"], batch["rewards"],
batch["new_obs"], batch["dones"], batch["weights"])
batch["obs"], batch["actions"], batch["rewards"], batch["new_obs"],
batch["dones"], batch["weights"])
new_priorities = (
np.abs(td_errors) + policy_graph.config["prioritized_replay_eps"])
batch.data["weights"] = new_priorities
@@ -295,8 +308,7 @@ def _huber_loss(x, delta=1.0):
"""Reference: https://en.wikipedia.org/wiki/Huber_loss"""
return tf.where(
tf.abs(x) < delta,
tf.square(x) * 0.5,
delta * (tf.abs(x) - 0.5 * delta))
tf.square(x) * 0.5, delta * (tf.abs(x) - 0.5 * delta))
def _minimize_and_clip(optimizer, objective, var_list, clip_val=10):
+32 -36
View File
@@ -20,13 +20,11 @@ from ray.rllib.agents.es import policies
from ray.rllib.agents.es import tabular_logger as tlogger
from ray.rllib.agents.es import utils
Result = namedtuple("Result", [
"noise_indices", "noisy_returns", "sign_noisy_returns", "noisy_lengths",
"eval_returns", "eval_lengths"
])
DEFAULT_CONFIG = {
'l2_coeff': 0.005,
'noise_stdev': 0.02,
@@ -64,7 +62,11 @@ class SharedNoiseTable(object):
@ray.remote
class Worker(object):
def __init__(self, config, policy_params, env_creator, noise,
def __init__(self,
config,
policy_params,
env_creator,
noise,
min_task_runtime=0.2):
self.min_task_runtime = min_task_runtime
self.config = config
@@ -82,7 +84,9 @@ class Worker(object):
def rollout(self, timestep_limit, add_noise=True):
rollout_rewards, rollout_length = policies.rollout(
self.policy, self.env, timestep_limit=timestep_limit,
self.policy,
self.env,
timestep_limit=timestep_limit,
add_noise=add_noise)
return rollout_rewards, rollout_length
@@ -95,8 +99,8 @@ class Worker(object):
# Perform some rollouts with noise.
task_tstart = time.time()
while (len(noise_indices) == 0 or
time.time() - task_tstart < self.min_task_runtime):
while (len(noise_indices) == 0
or time.time() - task_tstart < self.min_task_runtime):
if np.random.uniform() < self.config["eval_prob"]:
# Do an evaluation run with no perturbation.
@@ -122,7 +126,8 @@ class Worker(object):
noise_indices.append(noise_index)
returns.append([rewards_pos.sum(), rewards_neg.sum()])
sign_returns.append(
[np.sign(rewards_pos).sum(), np.sign(rewards_neg).sum()])
[np.sign(rewards_pos).sum(),
np.sign(rewards_neg).sum()])
lengths.append([lengths_pos, lengths_neg])
return Result(
@@ -146,9 +151,7 @@ class ESAgent(Agent):
return Resources(cpu=1, gpu=0, extra_cpu=cf["num_workers"])
def _init(self):
policy_params = {
"action_noise_std": 0.01
}
policy_params = {"action_noise_std": 0.01}
env = self.env_creator(self.config["env_config"])
from ray.rllib import models
@@ -168,9 +171,9 @@ class ESAgent(Agent):
# Create the actors.
print("Creating actors.")
self.workers = [
Worker.remote(
self.config, policy_params, self.env_creator, noise_id)
for _ in range(self.config["num_workers"])]
Worker.remote(self.config, policy_params, self.env_creator,
noise_id) for _ in range(self.config["num_workers"])
]
self.episodes_so_far = 0
self.timesteps_so_far = 0
@@ -180,21 +183,20 @@ class ESAgent(Agent):
num_episodes, num_timesteps = 0, 0
results = []
while num_episodes < min_episodes or num_timesteps < min_timesteps:
print(
"Collected {} episodes {} timesteps so far this iter".format(
num_episodes, num_timesteps))
rollout_ids = [worker.do_rollouts.remote(theta_id)
for worker in self.workers]
print("Collected {} episodes {} timesteps so far this iter".format(
num_episodes, num_timesteps))
rollout_ids = [
worker.do_rollouts.remote(theta_id) for worker in self.workers
]
# Get the results of the rollouts.
for result in ray.get(rollout_ids):
results.append(result)
# Update the number of episodes and the number of timesteps
# keeping in mind that result.noisy_lengths is a list of lists,
# where the inner lists have length 2.
num_episodes += sum(len(pair) for pair
in result.noisy_lengths)
num_timesteps += sum(sum(pair) for pair
in result.noisy_lengths)
num_episodes += sum(len(pair) for pair in result.noisy_lengths)
num_timesteps += sum(
sum(pair) for pair in result.noisy_lengths)
return results, num_episodes, num_timesteps
def _train(self):
@@ -209,8 +211,7 @@ class ESAgent(Agent):
# Use the actors to do rollouts, note that we pass in the ID of the
# policy weights.
results, num_episodes, num_timesteps = self._collect_results(
theta_id,
config["episodes_per_batch"],
theta_id, config["episodes_per_batch"],
config["timesteps_per_batch"])
all_noise_indices = []
@@ -255,13 +256,11 @@ class ESAgent(Agent):
for index in noise_indices),
batch_size=500)
g /= noisy_returns.size
assert (
g.shape == (self.policy.num_params,) and
g.dtype == np.float32 and
count == len(noise_indices))
assert (g.shape == (self.policy.num_params, ) and g.dtype == np.float32
and count == len(noise_indices))
# Compute the new weights theta.
theta, update_ratio = self.optimizer.update(
-g + config["l2_coeff"] * theta)
theta, update_ratio = self.optimizer.update(-g +
config["l2_coeff"] * theta)
# Set the new weights in the local copy of the policy.
self.policy.set_weights(theta)
@@ -313,13 +312,10 @@ class ESAgent(Agent):
w.__ray_terminate__.remote()
def _save(self, checkpoint_dir):
checkpoint_path = os.path.join(
checkpoint_dir, "checkpoint-{}".format(self.iteration))
checkpoint_path = os.path.join(checkpoint_dir,
"checkpoint-{}".format(self.iteration))
weights = self.policy.get_weights()
objects = [
weights,
self.episodes_so_far,
self.timesteps_so_far]
objects = [weights, self.episodes_so_far, self.timesteps_so_far]
pickle.dump(objects, open(checkpoint_path, "wb"))
return checkpoint_path
+2 -2
View File
@@ -48,8 +48,8 @@ class Adam(Optimizer):
self.v = np.zeros(self.dim, dtype=np.float32)
def _compute_step(self, globalg):
a = self.stepsize * (np.sqrt(1 - self.beta2 ** self.t) /
(1 - self.beta1 ** self.t))
a = self.stepsize * (np.sqrt(1 - self.beta2**self.t) /
(1 - self.beta1**self.t))
self.m = self.beta1 * self.m + (1 - self.beta1) * globalg
self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg)
step = -a * self.m / (np.sqrt(self.v) + self.epsilon)
+13 -13
View File
@@ -21,8 +21,8 @@ def rollout(policy, env, timestep_limit=None, add_noise=False):
noise drawn from that stream. Otherwise, no action noise will be added.
"""
env_timestep_limit = env.spec.max_episode_steps
timestep_limit = (env_timestep_limit if timestep_limit is None
else min(timestep_limit, env_timestep_limit))
timestep_limit = (env_timestep_limit if timestep_limit is None else min(
timestep_limit, env_timestep_limit))
rews = []
t = 0
observation = env.reset()
@@ -38,16 +38,16 @@ def rollout(policy, env, timestep_limit=None, add_noise=False):
class GenericPolicy(object):
def __init__(self, sess, action_space, preprocessor,
observation_filter, action_noise_std):
def __init__(self, sess, action_space, preprocessor, observation_filter,
action_noise_std):
self.sess = sess
self.action_space = action_space
self.action_noise_std = action_noise_std
self.preprocessor = preprocessor
self.observation_filter = get_filter(
observation_filter, self.preprocessor.shape)
self.inputs = tf.placeholder(
tf.float32, [None] + list(self.preprocessor.shape))
self.observation_filter = get_filter(observation_filter,
self.preprocessor.shape)
self.inputs = tf.placeholder(tf.float32,
[None] + list(self.preprocessor.shape))
# Policy network.
dist_class, dist_dim = ModelCatalog.get_action_dist(
@@ -59,16 +59,16 @@ class GenericPolicy(object):
self.variables = ray.experimental.TensorFlowVariables(
model.outputs, self.sess)
self.num_params = sum(np.prod(variable.shape.as_list())
for _, variable
in self.variables.variables.items())
self.num_params = sum(
np.prod(variable.shape.as_list())
for _, variable in self.variables.variables.items())
self.sess.run(tf.global_variables_initializer())
def compute(self, observation, add_noise=False, update=True):
observation = self.preprocessor.transform(observation)
observation = self.observation_filter(observation[None], update=update)
action = self.sess.run(self.sampler,
feed_dict={self.inputs: observation})
action = self.sess.run(
self.sampler, feed_dict={self.inputs: observation})
if add_noise and isinstance(self.action_space, gym.spaces.Box):
action += np.random.randn(*action.shape) * self.action_noise_std
return action
+10 -6
View File
@@ -25,6 +25,7 @@ DISABLED = 50
class TbWriter(object):
"""Based on SummaryWriter, but changed to allow for a different prefix."""
def __init__(self, dir, prefix):
self.dir = dir
# Start at 1, because EvWriter automatically generates an object with
@@ -34,9 +35,10 @@ class TbWriter(object):
compat.as_bytes(os.path.join(dir, prefix)))
def write_values(self, key2val):
summary = tf.Summary(value=[tf.Summary.Value(tag=k,
simple_value=float(v))
for (k, v) in key2val.items()])
summary = tf.Summary(value=[
tf.Summary.Value(tag=k, simple_value=float(v))
for (k, v) in key2val.items()
])
event = event_pb2.Event(wall_time=time.time(), summary=summary)
event.step = self.step
self.evwriter.WriteEvent(event)
@@ -46,6 +48,7 @@ class TbWriter(object):
def close(self):
self.evwriter.Close()
# API
@@ -126,6 +129,7 @@ def get_expt_dir():
sys.stderr.write("get_expt_dir() is Deprecated. Switch to get_dir()\n")
return get_dir()
# Backend
@@ -167,8 +171,8 @@ class _Logger(object):
# Write to all text outputs
self._write_text("-" * (keywidth + valwidth + 7), "\n")
for (key, val) in key2str.items():
self._write_text("| ", key, " " * (keywidth - len(key)),
" | ", val, " " * (valwidth - len(val)), " |\n")
self._write_text("| ", key, " " * (keywidth - len(key)), " | ",
val, " " * (valwidth - len(val)), " |\n")
self._write_text("-" * (keywidth + valwidth + 7), "\n")
for f in self.text_outputs:
try:
@@ -202,7 +206,7 @@ class _Logger(object):
# Misc
def _do_log(self, *args):
self._write_text(*args + ('\n',))
self._write_text(*args + ('\n', ))
for f in self.text_outputs:
try:
f.flush()
+8 -6
View File
@@ -31,8 +31,9 @@ def compute_centered_ranks(x):
def make_session(single_threaded):
if not single_threaded:
return tf.Session()
return tf.Session(config=tf.ConfigProto(inter_op_parallelism_threads=1,
intra_op_parallelism_threads=1))
return tf.Session(
config=tf.ConfigProto(
inter_op_parallelism_threads=1, intra_op_parallelism_threads=1))
def itergroups(items, group_size):
@@ -50,10 +51,11 @@ def itergroups(items, group_size):
def batched_weighted_sum(weights, vecs, batch_size):
total = 0
num_items_summed = 0
for batch_weights, batch_vecs in zip(itergroups(weights, batch_size),
itergroups(vecs, batch_size)):
for batch_weights, batch_vecs in zip(
itergroups(weights, batch_size), itergroups(vecs, batch_size)):
assert len(batch_weights) == len(batch_vecs) <= batch_size
total += np.dot(np.asarray(batch_weights, dtype=np.float32),
np.asarray(batch_vecs, dtype=np.float32))
total += np.dot(
np.asarray(batch_weights, dtype=np.float32),
np.asarray(batch_vecs, dtype=np.float32))
num_items_summed += len(batch_weights)
return total, num_items_summed
+3 -4
View File
@@ -7,7 +7,6 @@ from ray.rllib.agents.pg.pg_policy_graph import PGPolicyGraph
from ray.rllib.optimizers import SyncSamplesOptimizer
from ray.tune.trial import Resources
DEFAULT_CONFIG = with_common_config({
# No remote workers by default
"num_workers": 0,
@@ -43,9 +42,9 @@ class PGAgent(Agent):
self.env_creator, PGPolicyGraph)
self.remote_evaluators = self.make_remote_evaluators(
self.env_creator, PGPolicyGraph, self.config["num_workers"], {})
self.optimizer = SyncSamplesOptimizer(
self.local_evaluator, self.remote_evaluators,
self.config["optimizer"])
self.optimizer = SyncSamplesOptimizer(self.local_evaluator,
self.remote_evaluators,
self.config["optimizer"])
def _train(self):
prev_steps = self.optimizer.num_steps_sampled
@@ -42,9 +42,15 @@ class PGPolicyGraph(TFPolicyGraph):
]
TFPolicyGraph.__init__(
self, obs_space, action_space, sess, obs_input=obs,
action_sampler=action_dist.sample(), loss=loss,
loss_inputs=loss_in, state_inputs=self.model.state_in,
self,
obs_space,
action_space,
sess,
obs_input=obs,
action_sampler=action_dist.sample(),
loss=loss,
loss_inputs=loss_in,
state_inputs=self.model.state_in,
state_outputs=self.model.state_out,
seq_lens=self.model.seq_lens,
max_seq_len=config["model"]["max_seq_len"])
+16 -15
View File
@@ -77,28 +77,30 @@ class PPOAgent(Agent):
self.local_evaluator = self.make_local_evaluator(
self.env_creator, PPOPolicyGraph)
self.remote_evaluators = self.make_remote_evaluators(
self.env_creator, PPOPolicyGraph, self.config["num_workers"],
{"num_cpus": self.config["num_cpus_per_worker"],
"num_gpus": self.config["num_gpus_per_worker"]})
self.env_creator, PPOPolicyGraph, self.config["num_workers"], {
"num_cpus": self.config["num_cpus_per_worker"],
"num_gpus": self.config["num_gpus_per_worker"]
})
if self.config["simple_optimizer"]:
self.optimizer = SyncSamplesOptimizer(
self.local_evaluator, self.remote_evaluators,
{"num_sgd_iter": self.config["num_sgd_iter"]})
else:
self.optimizer = LocalMultiGPUOptimizer(
self.local_evaluator, self.remote_evaluators,
{"sgd_batch_size": self.config["sgd_batchsize"],
"sgd_stepsize": self.config["sgd_stepsize"],
"num_sgd_iter": self.config["num_sgd_iter"],
"timesteps_per_batch": self.config["timesteps_per_batch"],
"standardize_fields": ["advantages"]})
self.local_evaluator, self.remote_evaluators, {
"sgd_batch_size": self.config["sgd_batchsize"],
"sgd_stepsize": self.config["sgd_stepsize"],
"num_sgd_iter": self.config["num_sgd_iter"],
"timesteps_per_batch": self.config["timesteps_per_batch"],
"standardize_fields": ["advantages"]
})
def _train(self):
prev_steps = self.optimizer.num_steps_sampled
fetches = self.optimizer.step()
self.local_evaluator.for_policy(lambda pi: pi.update_kl(fetches["kl"]))
FilterManager.synchronize(
self.local_evaluator.filters, self.remote_evaluators)
FilterManager.synchronize(self.local_evaluator.filters,
self.remote_evaluators)
res = self.optimizer.collect_metrics()
res = res._replace(
timesteps_this_iter=self.optimizer.num_steps_sampled - prev_steps,
@@ -115,9 +117,7 @@ class PPOAgent(Agent):
"checkpoint-{}".format(self.iteration))
agent_state = ray.get(
[a.save.remote() for a in self.remote_evaluators])
extra_data = [
self.local_evaluator.save(),
agent_state]
extra_data = [self.local_evaluator.save(), agent_state]
pickle.dump(extra_data, open(checkpoint_path + ".extra_data", "wb"))
return checkpoint_path
@@ -126,4 +126,5 @@ class PPOAgent(Agent):
self.local_evaluator.restore(extra_data[0])
ray.get([
a.restore.remote(o)
for (a, o) in zip(self.remote_evaluators, extra_data[1])])
for (a, o) in zip(self.remote_evaluators, extra_data[1])
])
+71 -34
View File
@@ -10,10 +10,20 @@ from ray.rllib.models.catalog import ModelCatalog
class PPOLoss(object):
def __init__(
self, action_space, value_targets, advantages, actions, logits,
vf_preds, curr_action_dist, value_fn, cur_kl_coeff,
entropy_coeff=0, clip_param=0.1, vf_loss_coeff=1.0, use_gae=True):
def __init__(self,
action_space,
value_targets,
advantages,
actions,
logits,
vf_preds,
curr_action_dist,
value_fn,
cur_kl_coeff,
entropy_coeff=0,
clip_param=0.1,
vf_loss_coeff=1.0,
use_gae=True):
"""Constructs the loss for Proximal Policy Objective.
Arguments:
@@ -51,31 +61,33 @@ class PPOLoss(object):
surrogate_loss = tf.minimum(
advantages * logp_ratio,
advantages * tf.clip_by_value(
logp_ratio, 1 - clip_param, 1 + clip_param))
advantages * tf.clip_by_value(logp_ratio, 1 - clip_param,
1 + clip_param))
self.mean_policy_loss = tf.reduce_mean(-surrogate_loss)
if use_gae:
vf_loss1 = tf.square(value_fn - value_targets)
vf_clipped = vf_preds + tf.clip_by_value(
value_fn - vf_preds, -clip_param, clip_param)
vf_clipped = vf_preds + tf.clip_by_value(value_fn - vf_preds,
-clip_param, clip_param)
vf_loss2 = tf.square(vf_clipped - value_targets)
vf_loss = tf.maximum(vf_loss1, vf_loss2)
self.mean_vf_loss = tf.reduce_mean(vf_loss)
loss = tf.reduce_mean(
-surrogate_loss + cur_kl_coeff*action_kl +
vf_loss_coeff*vf_loss - entropy_coeff*curr_entropy)
loss = tf.reduce_mean(-surrogate_loss + cur_kl_coeff * action_kl +
vf_loss_coeff * vf_loss -
entropy_coeff * curr_entropy)
else:
self.mean_vf_loss = tf.constant(0.0)
loss = tf.reduce_mean(
-surrogate_loss + cur_kl_coeff*action_kl -
entropy_coeff*curr_entropy)
loss = tf.reduce_mean(-surrogate_loss + cur_kl_coeff * action_kl -
entropy_coeff * curr_entropy)
self.loss = loss
class PPOPolicyGraph(TFPolicyGraph):
def __init__(self, observation_space, action_space,
config, existing_inputs=None):
def __init__(self,
observation_space,
action_space,
config,
existing_inputs=None):
"""
Arguments:
observation_space: Environment observation space specification.
@@ -98,16 +110,18 @@ class PPOPolicyGraph(TFPolicyGraph):
existing_seq_lens = existing_inputs[-1]
else:
obs_ph = tf.placeholder(
tf.float32, name="obs", shape=(None,)+observation_space.shape)
tf.float32,
name="obs",
shape=(None, ) + observation_space.shape)
adv_ph = tf.placeholder(
tf.float32, name="advantages", shape=(None,))
tf.float32, name="advantages", shape=(None, ))
act_ph = ModelCatalog.get_action_placeholder(action_space)
logits_ph = tf.placeholder(
tf.float32, name="logits", shape=(None, logit_dim))
vf_preds_ph = tf.placeholder(
tf.float32, name="vf_preds", shape=(None,))
tf.float32, name="vf_preds", shape=(None, ))
value_targets_ph = tf.placeholder(
tf.float32, name="value_targets", shape=(None,))
tf.float32, name="value_targets", shape=(None, ))
existing_state_in = None
existing_seq_lens = None
@@ -120,13 +134,19 @@ class PPOPolicyGraph(TFPolicyGraph):
("vf_preds", vf_preds_ph),
]
self.model = ModelCatalog.get_model(
obs_ph, logit_dim, self.config["model"],
state_in=existing_state_in, seq_lens=existing_seq_lens)
obs_ph,
logit_dim,
self.config["model"],
state_in=existing_state_in,
seq_lens=existing_seq_lens)
# KL Coefficient
self.kl_coeff = tf.get_variable(
initializer=tf.constant_initializer(self.kl_coeff_val),
name="kl_coeff", shape=(), trainable=False, dtype=tf.float32)
name="kl_coeff",
shape=(),
trainable=False,
dtype=tf.float32)
self.logits = self.model.outputs
curr_action_dist = dist_cls(self.logits)
@@ -146,20 +166,32 @@ class PPOPolicyGraph(TFPolicyGraph):
self.value_function = tf.zeros(shape=tf.shape(obs_ph)[:1])
self.loss_obj = PPOLoss(
action_space, value_targets_ph, adv_ph, act_ph,
logits_ph, vf_preds_ph,
curr_action_dist, self.value_function, self.kl_coeff,
action_space,
value_targets_ph,
adv_ph,
act_ph,
logits_ph,
vf_preds_ph,
curr_action_dist,
self.value_function,
self.kl_coeff,
entropy_coeff=self.config["entropy_coeff"],
clip_param=self.config["clip_param"],
vf_loss_coeff=self.config["kl_target"],
use_gae=self.config["use_gae"])
TFPolicyGraph.__init__(
self, observation_space, action_space,
self.sess, obs_input=obs_ph,
action_sampler=self.sampler, loss=self.loss_obj.loss,
loss_inputs=self.loss_in, state_inputs=self.model.state_in,
state_outputs=self.model.state_out, seq_lens=self.model.seq_lens,
self,
observation_space,
action_space,
self.sess,
obs_input=obs_ph,
action_sampler=self.sampler,
loss=self.loss_obj.loss,
loss_inputs=self.loss_in,
state_inputs=self.model.state_in,
state_outputs=self.model.state_out,
seq_lens=self.model.seq_lens,
max_seq_len=config["model"]["max_seq_len"])
self.sess.run(tf.global_variables_initializer())
@@ -167,7 +199,9 @@ class PPOPolicyGraph(TFPolicyGraph):
def copy(self, existing_inputs):
"""Creates a copy of self using existing input placeholders."""
return PPOPolicyGraph(
None, self.action_space, self.config,
None,
self.action_space,
self.config,
existing_inputs=existing_inputs)
def extra_compute_action_fetches(self):
@@ -193,8 +227,11 @@ class PPOPolicyGraph(TFPolicyGraph):
def postprocess_trajectory(self, sample_batch, other_agent_batches=None):
last_r = 0.0
batch = compute_advantages(
sample_batch, last_r, self.config["gamma"],
self.config["lambda"], use_gae=self.config["use_gae"])
sample_batch,
last_r,
self.config["gamma"],
self.config["lambda"],
use_gae=self.config["use_gae"])
return batch
def optimizer(self):
+4 -4
View File
@@ -13,7 +13,6 @@ from ray.rllib.agents.ppo.utils import flatten, concatenate
# TODO(ekl): move to rllib/models dir
class DistributionsTest(unittest.TestCase):
def testCategorical(self):
num_samples = 100000
logits = tf.placeholder(tf.float32, shape=(None, 10))
@@ -32,10 +31,11 @@ class DistributionsTest(unittest.TestCase):
class UtilsTest(unittest.TestCase):
def testFlatten(self):
d = {"s": np.array([[[1, -1], [2, -2]], [[3, -3], [4, -4]]]),
"a": np.array([[[5], [-5]], [[6], [-6]]])}
d = {
"s": np.array([[[1, -1], [2, -2]], [[3, -3], [4, -4]]]),
"a": np.array([[[5], [-5]], [[6], [-6]]])
}
flat = flatten(d.copy(), start=0, stop=2)
assert_allclose(d["s"][0][0][:], flat["s"][0][:])
assert_allclose(d["s"][0][1][:], flat["s"][1][:])
+1 -1
View File
@@ -16,7 +16,7 @@ def flatten(weights, start=0, stop=2):
stop: The ending index.
"""
for key, val in weights.items():
new_shape = val.shape[0:start] + (-1,) + val.shape[stop:]
new_shape = val.shape[0:start] + (-1, ) + val.shape[stop:]
weights[key] = val.reshape(new_shape)
return weights
+9 -6
View File
@@ -286,8 +286,8 @@ class _MultiAgentEnvState(object):
self.reset()
def poll(self):
obs, rew, dones, info = (
self.last_obs, self.last_rewards, self.last_dones, self.last_infos)
obs, rew, dones, info = (self.last_obs, self.last_rewards,
self.last_dones, self.last_infos)
self.last_obs = {}
self.last_rewards = {}
self.last_dones = {"__all__": False}
@@ -303,10 +303,13 @@ class _MultiAgentEnvState(object):
def reset(self):
self.last_obs = self.env.reset()
self.last_rewards = {
agent_id: None for agent_id in self.last_obs.keys()}
agent_id: None
for agent_id in self.last_obs.keys()
}
self.last_dones = {
agent_id: False for agent_id in self.last_obs.keys()}
self.last_infos = {
agent_id: {} for agent_id in self.last_obs.keys()}
agent_id: False
for agent_id in self.last_obs.keys()
}
self.last_infos = {agent_id: {} for agent_id in self.last_obs.keys()}
self.last_dones["__all__"] = False
return self.last_obs
+2 -3
View File
@@ -28,8 +28,7 @@ class NoopResetEnv(gym.Wrapper):
if self.override_num_noops is not None:
noops = self.override_num_noops
else:
noops = self.unwrapped.np_random.randint(
1, self.noop_max + 1)
noops = self.unwrapped.np_random.randint(1, self.noop_max + 1)
assert noops > 0
obs = None
for _ in range(noops):
@@ -121,7 +120,7 @@ class MaxAndSkipEnv(gym.Wrapper):
gym.Wrapper.__init__(self, env)
# most recent raw observations (for max pooling across time steps)
self._obs_buffer = np.zeros(
(2,)+env.observation_space.shape, dtype=np.uint8)
(2, ) + env.observation_space.shape, dtype=np.uint8)
self._skip = skip
def step(self, action):
+1 -2
View File
@@ -71,8 +71,7 @@ class _VectorizedGymEnv(VectorEnv):
self.envs = existing_envs
self.num_envs = num_envs
if make_env and num_envs > 1:
self.resetter = _AsyncResetter(
make_env, int(self.num_envs ** 0.5))
self.resetter = _AsyncResetter(make_env, int(self.num_envs**0.5))
else:
self.resetter = _SimpleResetter(make_env)
while len(self.envs) < self.num_envs:
+4 -3
View File
@@ -15,9 +15,10 @@ def collect_metrics(local_evaluator, remote_evaluators=[]):
episode_rewards = []
episode_lengths = []
policy_rewards = collections.defaultdict(list)
metric_lists = ray.get(
[a.apply.remote(lambda ev: ev.sampler.get_metrics())
for a in remote_evaluators])
metric_lists = ray.get([
a.apply.remote(lambda ev: ev.sampler.get_metrics())
for a in remote_evaluators
])
metric_lists.append(local_evaluator.sampler.get_metrics())
for metrics in metric_lists:
for episode in metrics:
+56 -42
View File
@@ -82,24 +82,23 @@ class PolicyEvaluator(EvaluatorInterface):
def as_remote(cls, num_cpus=None, num_gpus=None):
return ray.remote(num_cpus=num_cpus, num_gpus=num_gpus)(cls)
def __init__(
self,
env_creator,
policy_graph,
policy_mapping_fn=None,
tf_session_creator=None,
batch_steps=100,
batch_mode="truncate_episodes",
episode_horizon=None,
preprocessor_pref="rllib",
sample_async=False,
compress_observations=False,
num_envs=1,
observation_filter="NoFilter",
env_config=None,
model_config=None,
policy_config=None,
worker_index=0):
def __init__(self,
env_creator,
policy_graph,
policy_mapping_fn=None,
tf_session_creator=None,
batch_steps=100,
batch_mode="truncate_episodes",
episode_horizon=None,
preprocessor_pref="rllib",
sample_async=False,
compress_observations=False,
num_envs=1,
observation_filter="NoFilter",
env_config=None,
model_config=None,
policy_config=None,
worker_index=0):
"""Initialize a policy evaluator.
Arguments:
@@ -157,8 +156,8 @@ class PolicyEvaluator(EvaluatorInterface):
policy_config = policy_config or {}
self.policy_config = policy_config
model_config = model_config or {}
policy_mapping_fn = (
policy_mapping_fn or (lambda agent_id: DEFAULT_POLICY_ID))
policy_mapping_fn = (policy_mapping_fn
or (lambda agent_id: DEFAULT_POLICY_ID))
self.env_creator = env_creator
self.policy_graph = policy_graph
self.batch_steps = batch_steps
@@ -170,17 +169,21 @@ class PolicyEvaluator(EvaluatorInterface):
isinstance(self.env, ServingEnv) or \
isinstance(self.env, MultiAgentEnv) or \
isinstance(self.env, AsyncVectorEnv):
def wrap(env):
return env # we can't auto-wrap these env types
elif is_atari(self.env) and \
"custom_preprocessor" not in model_config and \
preprocessor_pref == "deepmind":
def wrap(env):
return wrap_deepmind(env, dim=model_config.get("dim", 80))
else:
def wrap(env):
return ModelCatalog.get_preprocessor_as_wrapper(
env, model_config)
self.env = wrap(self.env)
def make_env():
@@ -193,20 +196,21 @@ class PolicyEvaluator(EvaluatorInterface):
if tf_session_creator:
self.tf_sess = tf_session_creator()
else:
self.tf_sess = tf.Session(config=tf.ConfigProto(
gpu_options=tf.GPUOptions(allow_growth=True)))
self.tf_sess = tf.Session(
config=tf.ConfigProto(
gpu_options=tf.GPUOptions(allow_growth=True)))
with self.tf_sess.as_default():
self.policy_map = self._build_policy_map(
policy_dict, policy_config)
else:
self.policy_map = self._build_policy_map(
policy_dict, policy_config)
self.policy_map = self._build_policy_map(policy_dict,
policy_config)
self.multiagent = self.policy_map.keys() != set(DEFAULT_POLICY_ID)
self.filters = {
policy_id: get_filter(
observation_filter, policy.observation_space.shape)
policy_id: get_filter(observation_filter,
policy.observation_space.shape)
for (policy_id, policy) in self.policy_map.items()
}
@@ -226,24 +230,34 @@ class PolicyEvaluator(EvaluatorInterface):
batch_steps = float("inf") # never cut episodes
pack_episodes = False # sampler will return 1 episode per poll
else:
raise ValueError(
"Unsupported batch mode: {}".format(self.batch_mode))
raise ValueError("Unsupported batch mode: {}".format(
self.batch_mode))
if sample_async:
self.sampler = AsyncSampler(
self.async_env, self.policy_map, policy_mapping_fn,
self.filters, batch_steps, horizon=episode_horizon,
pack=pack_episodes, tf_sess=self.tf_sess)
self.async_env,
self.policy_map,
policy_mapping_fn,
self.filters,
batch_steps,
horizon=episode_horizon,
pack=pack_episodes,
tf_sess=self.tf_sess)
self.sampler.start()
else:
self.sampler = SyncSampler(
self.async_env, self.policy_map, policy_mapping_fn,
self.filters, batch_steps, horizon=episode_horizon,
pack=pack_episodes, tf_sess=self.tf_sess)
self.async_env,
self.policy_map,
policy_mapping_fn,
self.filters,
batch_steps,
horizon=episode_horizon,
pack=pack_episodes,
tf_sess=self.tf_sess)
def _build_policy_map(self, policy_dict, policy_config):
policy_map = {}
for name, (cls, obs_space, act_space, conf) in sorted(
policy_dict.items()):
for name, (cls, obs_space, act_space,
conf) in sorted(policy_dict.items()):
merged_conf = policy_config.copy()
merged_conf.update(conf)
with tf.variable_scope(name):
@@ -315,7 +329,8 @@ class PolicyEvaluator(EvaluatorInterface):
def get_weights(self):
return {
pid: policy.get_weights()
for pid, policy in self.policy_map.items()}
for pid, policy in self.policy_map.items()
}
def set_weights(self, weights):
for pid, w in weights.items():
@@ -351,9 +366,7 @@ class PolicyEvaluator(EvaluatorInterface):
builder, grad)
for pid, grad in grads.items()
}
return {
k: builder.get(v) for k, v in outputs.items()
}
return {k: builder.get(v) for k, v in outputs.items()}
else:
return {
pid: self.policy_map[pid].apply_gradients(g)
@@ -428,8 +441,9 @@ def _validate_and_canonicalize(policy_graph, env):
raise ValueError("policy_graph must be a rllib.PolicyGraph class")
else:
return {
DEFAULT_POLICY_ID: (
policy_graph, env.observation_space, env.action_space, {})}
DEFAULT_POLICY_ID: (policy_graph, env.observation_space,
env.action_space, {})
}
def _has_tensorflow_graph(policy_dict):
+5 -2
View File
@@ -45,7 +45,8 @@ class SampleBatchBuilder(object):
"""Returns a sample batch including all previously added values."""
batch = SampleBatch(
{k: to_float_array(v) for k, v in self.buffers.items()})
{k: to_float_array(v)
for k, v in self.buffers.items()})
self.buffers.clear()
self.count = 0
return batch
@@ -69,7 +70,9 @@ class MultiAgentSampleBatchBuilder(object):
self.policy_map = policy_map
self.policy_builders = {
k: SampleBatchBuilder() for k in policy_map.keys()}
k: SampleBatchBuilder()
for k in policy_map.keys()
}
self.agent_builders = {}
self.agent_to_policy = {}
self.count = 0 # increment this manually
+48 -32
View File
@@ -12,12 +12,11 @@ from ray.rllib.evaluation.sample_batch import MultiAgentSampleBatchBuilder, \
from ray.rllib.env.async_vector_env import AsyncVectorEnv
from ray.rllib.utils.tf_run_builder import TFRunBuilder
RolloutMetrics = namedtuple(
"RolloutMetrics", ["episode_length", "episode_reward", "agent_rewards"])
PolicyEvalData = namedtuple(
"PolicyEvalData", ["env_id", "agent_id", "obs", "rnn_state"])
PolicyEvalData = namedtuple("PolicyEvalData",
["env_id", "agent_id", "obs", "rnn_state"])
class SyncSampler(object):
@@ -29,9 +28,15 @@ class SyncSampler(object):
This class provides data on invocation, rather than on a separate
thread."""
def __init__(
self, env, policies, policy_mapping_fn, obs_filters,
num_local_steps, horizon=None, pack=False, tf_sess=None):
def __init__(self,
env,
policies,
policy_mapping_fn,
obs_filters,
num_local_steps,
horizon=None,
pack=False,
tf_sess=None):
self.async_vector_env = AsyncVectorEnv.wrap_async(env)
self.num_local_steps = num_local_steps
self.horizon = horizon
@@ -68,9 +73,15 @@ class AsyncSampler(threading.Thread):
Note that batch_size is only a unit of measure here. Batches can
accumulate and the gradient can be calculated on up to 5 batches."""
def __init__(
self, env, policies, policy_mapping_fn, obs_filters,
num_local_steps, horizon=None, pack=False, tf_sess=None):
def __init__(self,
env,
policies,
policy_mapping_fn,
obs_filters,
num_local_steps,
horizon=None,
pack=False,
tf_sess=None):
for _, f in obs_filters.items():
assert getattr(f, "is_concurrent", False), \
"Observation Filter must support concurrent updates."
@@ -142,9 +153,14 @@ class AsyncSampler(threading.Thread):
return completed
def _env_runner(
async_vector_env, policies, policy_mapping_fn, num_local_steps,
horizon, obs_filters, pack, tf_sess=None):
def _env_runner(async_vector_env,
policies,
policy_mapping_fn,
num_local_steps,
horizon,
obs_filters,
pack,
tf_sess=None):
"""This implements the common experience collection logic.
Args:
@@ -186,9 +202,11 @@ def _env_runner(
else:
return MultiAgentSampleBatchBuilder(policies)
active_episodes = defaultdict(
lambda: _MultiAgentEpisode(
policies, policy_mapping_fn, get_batch_builder))
def new_episode():
return _MultiAgentEpisode(policies, policy_mapping_fn,
get_batch_builder)
active_episodes = defaultdict(new_episode)
while True:
# Get observations from all ready agents
@@ -213,9 +231,8 @@ def _env_runner(
# Check episode termination conditions
if dones[env_id]["__all__"] or episode.length >= horizon:
all_done = True
yield RolloutMetrics(
episode.length, episode.total_reward,
dict(episode.agent_rewards))
yield RolloutMetrics(episode.length, episode.total_reward,
dict(episode.agent_rewards))
else:
all_done = False
# At least send an empty dict if not done
@@ -228,9 +245,8 @@ def _env_runner(
agent_done = bool(all_done or dones[env_id].get(agent_id))
if not agent_done:
to_eval[policy_id].append(
PolicyEvalData(
env_id, agent_id, filtered_obs,
episode.rnn_state_for(agent_id)))
PolicyEvalData(env_id, agent_id, filtered_obs,
episode.rnn_state_for(agent_id)))
last_observation = episode.last_observation_for(agent_id)
episode.set_last_observation(agent_id, filtered_obs)
@@ -274,13 +290,12 @@ def _env_runner(
episode = active_episodes[env_id]
for agent_id, raw_obs in resetted_obs.items():
policy_id = episode.policy_for(agent_id)
filtered_obs = _get_or_raise(
obs_filters, policy_id)(raw_obs)
filtered_obs = _get_or_raise(obs_filters,
policy_id)(raw_obs)
episode.set_last_observation(agent_id, filtered_obs)
to_eval[policy_id].append(
PolicyEvalData(
env_id, agent_id, filtered_obs,
episode.rnn_state_for(agent_id)))
PolicyEvalData(env_id, agent_id, filtered_obs,
episode.rnn_state_for(agent_id)))
# Batch eval policy actions if possible
if tf_sess:
@@ -295,7 +310,8 @@ def _env_runner(
policy = _get_or_raise(policies, policy_id)
if builder:
eval_results[policy_id] = policy.build_compute_actions(
builder, [t.obs for t in eval_data], rnn_in,
builder, [t.obs for t in eval_data],
rnn_in,
is_training=True)
else:
eval_results[policy_id] = policy.compute_actions(
@@ -319,7 +335,8 @@ def _env_runner(
episode = active_episodes[env_id]
episode.set_rnn_state(agent_id, [c[i] for c in rnn_out_cols])
episode.set_last_pi_info(
agent_id, {k: v[i] for k, v in pi_info_cols.items()})
agent_id, {k: v[i]
for k, v in pi_info_cols.items()})
if env_id in off_policy_actions and \
agent_id in off_policy_actions[env_id]:
episode.set_last_action(
@@ -334,8 +351,7 @@ def _env_runner(
def _to_column_format(rnn_state_rows):
num_cols = len(rnn_state_rows[0])
return [
[row[i] for row in rnn_state_rows] for i in range(num_cols)]
return [[row[i] for row in rnn_state_rows] for i in range(num_cols)]
def _get_or_raise(mapping, policy_id):
@@ -363,8 +379,8 @@ class _MultiAgentEpisode(object):
def add_agent_rewards(self, reward_dict):
for agent_id, reward in reward_dict.items():
if reward is not None:
self.agent_rewards[
agent_id, self.policy_for(agent_id)] += reward
self.agent_rewards[agent_id,
self.policy_for(agent_id)] += reward
self.total_reward += reward
def policy_for(self, agent_id):
+35 -24
View File
@@ -35,10 +35,18 @@ class TFPolicyGraph(PolicyGraph):
SampleBatch({"action": ..., "advantages": ..., ...})
"""
def __init__(
self, observation_space, action_space, sess, obs_input,
action_sampler, loss, loss_inputs, state_inputs=None,
state_outputs=None, seq_lens=None, max_seq_len=20):
def __init__(self,
observation_space,
action_space,
sess,
obs_input,
action_sampler,
loss,
loss_inputs,
state_inputs=None,
state_outputs=None,
seq_lens=None,
max_seq_len=20):
"""Initialize the policy graph.
Arguments:
@@ -78,9 +86,9 @@ class TFPolicyGraph(PolicyGraph):
self._seq_lens = seq_lens
self._max_seq_len = max_seq_len
self._optimizer = self.optimizer()
self._grads_and_vars = [
(g, v) for (g, v) in self.gradients(self._optimizer)
if g is not None]
self._grads_and_vars = [(g, v)
for (g, v) in self.gradients(self._optimizer)
if g is not None]
self._grads = [g for (g, v) in self._grads_and_vars]
self._apply_op = self._optimizer.apply_gradients(self._grads_and_vars)
self._variables = ray.experimental.TensorFlowVariables(
@@ -92,8 +100,11 @@ class TFPolicyGraph(PolicyGraph):
if self._state_inputs:
assert self._seq_lens is not None
def build_compute_actions(
self, builder, obs_batch, state_batches=None, is_training=False):
def build_compute_actions(self,
builder,
obs_batch,
state_batches=None,
is_training=False):
state_batches = state_batches or []
assert len(self._state_inputs) == len(state_batches), \
(self._state_inputs, state_batches)
@@ -103,16 +114,15 @@ class TFPolicyGraph(PolicyGraph):
builder.add_feed_dict({self._seq_lens: np.ones(len(obs_batch))})
builder.add_feed_dict({self._is_training: is_training})
builder.add_feed_dict(dict(zip(self._state_inputs, state_batches)))
fetches = builder.add_fetches(
[self._sampler] + self._state_outputs +
[self.extra_compute_action_fetches()])
fetches = builder.add_fetches([self._sampler] + self._state_outputs +
[self.extra_compute_action_fetches()])
return fetches[0], fetches[1:-1], fetches[-1]
def compute_actions(
self, obs_batch, state_batches=None, is_training=False):
def compute_actions(self, obs_batch, state_batches=None,
is_training=False):
builder = TFRunBuilder(self._sess, "compute_actions")
fetches = self.build_compute_actions(
builder, obs_batch, state_batches, is_training)
fetches = self.build_compute_actions(builder, obs_batch, state_batches,
is_training)
return builder.get(fetches)
def _get_loss_inputs_dict(self, batch):
@@ -127,12 +137,11 @@ class TFPolicyGraph(PolicyGraph):
# RNN case
feature_keys = [k for k, v in self._loss_inputs]
state_keys = [
"state_in_{}".format(i) for i in range(len(self._state_inputs))]
"state_in_{}".format(i) for i in range(len(self._state_inputs))
]
feature_sequences, initial_states, seq_lens = chop_into_sequences(
batch["t"],
[batch[k] for k in feature_keys],
[batch[k] for k in state_keys],
self._max_seq_len)
batch["t"], [batch[k] for k in feature_keys],
[batch[k] for k in state_keys], self._max_seq_len)
for k, v in zip(feature_keys, feature_sequences):
feed_dict[self._loss_input_dict[k]] = v
for k, v in zip(state_keys, initial_states):
@@ -172,9 +181,11 @@ class TFPolicyGraph(PolicyGraph):
builder.add_feed_dict(self.extra_apply_grad_feed_dict())
builder.add_feed_dict(self._get_loss_inputs_dict(postprocessed_batch))
builder.add_feed_dict({self._is_training: True})
fetches = builder.add_fetches(
[self._apply_op, self.extra_compute_grad_fetches(),
self.extra_apply_grad_fetches()])
fetches = builder.add_fetches([
self._apply_op,
self.extra_compute_grad_fetches(),
self.extra_apply_grad_fetches()
])
return fetches[1], fetches[2]
def compute_apply(self, postprocessed_batch):
@@ -27,8 +27,8 @@ class TorchPolicyGraph(PolicyGraph):
This is necessary when using the async sampler.
"""
def __init__(
self, observation_space, action_space, model, loss, loss_inputs):
def __init__(self, observation_space, action_space, model, loss,
loss_inputs):
"""Build a policy graph from policy and loss torch modules.
Note that module inputs will be CPU tensors. The model and loss modules
@@ -67,8 +67,8 @@ class TorchPolicyGraph(PolicyGraph):
"""Custom PyTorch optimizer to use."""
return torch.optim.Adam(self._model.parameters())
def compute_actions(
self, obs_batch, state_batches=None, is_training=False):
def compute_actions(self, obs_batch, state_batches=None,
is_training=False):
if state_batches:
raise NotImplementedError("Torch RNN support")
with self.lock:
@@ -20,13 +20,12 @@ def pass_params_to_gym(env_name):
global env_version_num
register(
id=env_name,
entry_point=(
"ray.rllib.examples.legacy_multiagent.multiagent_mountaincar_env:"
"MultiAgentMountainCarEnv"),
max_episode_steps=200,
kwargs={}
)
id=env_name,
entry_point=(
"ray.rllib.examples.legacy_multiagent.multiagent_mountaincar_env:"
"MultiAgentMountainCarEnv"),
max_episode_steps=200,
kwargs={})
def create_env(env_config):
@@ -48,10 +47,12 @@ if __name__ == '__main__':
config["horizon"] = horizon
config["use_gae"] = False
config["model"].update({"fcnet_hiddens": [256, 256]})
options = {"multiagent_obs_shapes": [2, 2],
"multiagent_act_shapes": [1, 1],
"multiagent_shared_model": False,
"multiagent_fcnet_hiddens": [[32, 32]] * 2}
options = {
"multiagent_obs_shapes": [2, 2],
"multiagent_act_shapes": [1, 1],
"multiagent_shared_model": False,
"multiagent_fcnet_hiddens": [[32, 32]] * 2
}
config["model"].update({"custom_options": options})
alg = ppo.PPOAgent(env=env_name, config=config)
for i in range(1):
@@ -2,7 +2,6 @@ from math import cos
from gym.spaces import Box, Tuple, Discrete
import numpy as np
from gym.envs.classic_control.mountain_car import MountainCarEnv
"""
Multiagent mountain car that sums and then
averages its actions to produce the velocity
@@ -22,8 +21,8 @@ class MultiAgentMountainCarEnv(MountainCarEnv):
self.viewer = None
self.action_space = [Discrete(3) for _ in range(2)]
self.observation_space = Tuple([
Box(self.low, self.high, dtype=np.float32) for _ in range(2)])
self.observation_space = Tuple(
[Box(self.low, self.high, dtype=np.float32) for _ in range(2)])
self.seed()
self.reset()
@@ -20,13 +20,12 @@ def pass_params_to_gym(env_name):
global env_version_num
register(
id=env_name,
entry_point=(
"ray.rllib.examples.legacy_multiagent.multiagent_pendulum_env:"
"MultiAgentPendulumEnv"),
max_episode_steps=100,
kwargs={}
)
id=env_name,
entry_point=(
"ray.rllib.examples.legacy_multiagent.multiagent_pendulum_env:"
"MultiAgentPendulumEnv"),
max_episode_steps=100,
kwargs={})
def create_env(env_config):
@@ -49,10 +48,12 @@ if __name__ == '__main__':
config["horizon"] = horizon
config["use_gae"] = True
config["model"].update({"fcnet_hiddens": [256, 256]})
options = {"multiagent_obs_shapes": [3, 3],
"multiagent_act_shapes": [1, 1],
"multiagent_shared_model": True,
"multiagent_fcnet_hiddens": [[32, 32]] * 2}
options = {
"multiagent_obs_shapes": [3, 3],
"multiagent_act_shapes": [1, 1],
"multiagent_shared_model": True,
"multiagent_fcnet_hiddens": [[32, 32]] * 2
}
config["model"].update({"custom_options": options})
alg = ppo.PPOAgent(env=env_name, config=config)
for i in range(1):
@@ -2,7 +2,6 @@ from gym.spaces import Box, Tuple
from gym.utils import seeding
from gym.envs.classic_control.pendulum import PendulumEnv
import numpy as np
"""
Multiagent pendulum that sums its torques to generate an action
"""
@@ -10,8 +9,8 @@ import numpy as np
class MultiAgentPendulumEnv(PendulumEnv):
metadata = {
'render.modes': ['human', 'rgb_array'],
'video.frames_per_second': 30
'render.modes': ['human', 'rgb_array'],
'video.frames_per_second': 30
}
def __init__(self):
@@ -21,13 +20,14 @@ class MultiAgentPendulumEnv(PendulumEnv):
self.viewer = None
high = np.array([1., 1., self.max_speed])
self.action_space = [Box(low=-self.max_torque / 2,
high=self.max_torque / 2,
shape=(1,),
dtype=np.float32)
for _ in range(2)]
self.observation_space = Tuple([
Box(low=-high, high=high, dtype=np.float32) for _ in range(2)])
self.action_space = [
Box(low=-self.max_torque / 2,
high=self.max_torque / 2,
shape=(1, ),
dtype=np.float32) for _ in range(2)
]
self.observation_space = Tuple(
[Box(low=-high, high=high, dtype=np.float32) for _ in range(2)])
self.seed()
@@ -49,8 +49,8 @@ class MultiAgentPendulumEnv(PendulumEnv):
costs = self.angle_normalize(th) ** 2 + .1 * thdot ** 2 + \
.001 * (summed_u ** 2)
newthdot = thdot + (-3 * g / (2 * length) * np.sin(th + np.pi) +
3. / (m * length ** 2) * summed_u) * dt
newthdot = thdot + (-3 * g / (2 * length) * np.sin(th + np.pi) + 3. /
(m * length**2) * summed_u) * dt
newth = th + newthdot * dt
newthdot = np.clip(newthdot, -self.max_speed, self.max_speed)
@@ -65,8 +65,10 @@ class MultiAgentPendulumEnv(PendulumEnv):
def _get_obs(self):
theta, thetadot = self.state
return [np.array([np.cos(theta), np.sin(theta), thetadot])
for _ in range(2)]
return [
np.array([np.cos(theta), np.sin(theta), thetadot])
for _ in range(2)
]
def angle_normalize(self, x):
return (((x + np.pi) % (2 * np.pi)) - np.pi)
@@ -1,7 +1,6 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Simple example of setting up a multi-agent policy mapping.
Control the number of agents and policies via --num-agents and --num-policies.
@@ -24,14 +23,12 @@ from ray.rllib.test.test_multi_agent_env import MultiCartpole
from ray.tune.logger import pretty_print
from ray.tune.registry import register_env
parser = argparse.ArgumentParser()
parser.add_argument("--num-agents", type=int, default=4)
parser.add_argument("--num-policies", type=int, default=2)
parser.add_argument("--num-iters", type=int, default=20)
if __name__ == "__main__":
args = parser.parse_args()
ray.init()
@@ -51,7 +48,8 @@ if __name__ == "__main__":
# Setup PG with an ensemble of `num_policies` different policy graphs
policy_graphs = {
"policy_{}".format(i): gen_policy() for i in range(args.num_policies)
"policy_{}".format(i): gen_policy()
for i in range(args.num_policies)
}
policy_ids = list(policy_graphs.keys())
@@ -1,7 +1,6 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Example of querying a policy server. Copy this file for your use case.
To try this out, in two separate shells run:
@@ -14,18 +13,19 @@ import gym
from ray.rllib.utils.policy_client import PolicyClient
parser = argparse.ArgumentParser()
parser.add_argument(
"--no-train", action="store_true", help="Whether to disable training.")
parser.add_argument(
"--off-policy", action="store_true",
"--off-policy",
action="store_true",
help="Whether to take random instead of on-policy actions.")
parser.add_argument(
"--stop-at-reward", type=int, default=9999,
"--stop-at-reward",
type=int,
default=9999,
help="Stop once the specified reward is reached.")
if __name__ == "__main__":
args = parser.parse_args()
env = gym.make("CartPole-v0")
@@ -1,7 +1,6 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""Example of running a policy server. Copy this file for your use case.
To try this out, in two separate shells run:
@@ -26,12 +25,12 @@ CHECKPOINT_FILE = "last_checkpoint.out"
class CartpoleServing(ServingEnv):
def __init__(self):
ServingEnv.__init__(
self, spaces.Discrete(2), spaces.Box(low=-10, high=10, shape=(4,)))
ServingEnv.__init__(self, spaces.Discrete(2),
spaces.Box(low=-10, high=10, shape=(4, )))
def run(self):
print("Starting policy server at {}:{}".format(
SERVER_ADDRESS, SERVER_PORT))
print("Starting policy server at {}:{}".format(SERVER_ADDRESS,
SERVER_PORT))
server = PolicyServer(self, SERVER_ADDRESS, SERVER_PORT)
server.serve_forever()
@@ -42,14 +41,16 @@ if __name__ == "__main__":
# We use DQN since it supports off-policy actions, but you can choose and
# configure any agent.
dqn = DQNAgent(env="srv", config={
# Use a single process to avoid needing to set up a load balancer
"num_workers": 0,
# Configure the agent to run short iterations for debugging
"exploration_fraction": 0.01,
"learning_starts": 100,
"timesteps_per_iteration": 200,
})
dqn = DQNAgent(
env="srv",
config={
# Use a single process to avoid needing to set up a load balancer
"num_workers": 0,
# Configure the agent to run short iterations for debugging
"exploration_fraction": 0.01,
"learning_starts": 100,
"timesteps_per_iteration": 200,
})
# Attempt to restore from checkpoint if possible.
if os.path.exists(CHECKPOINT_FILE):
+4 -4
View File
@@ -6,7 +6,7 @@ from ray.rllib.models.preprocessors import Preprocessor
from ray.rllib.models.fcnet import FullyConnectedNetwork
from ray.rllib.models.lstm import LSTM
__all__ = ["ActionDistribution", "Categorical",
"DiagGaussian", "Deterministic", "ModelCatalog", "Model",
"Preprocessor", "FullyConnectedNetwork", "LSTM"]
__all__ = [
"ActionDistribution", "Categorical", "DiagGaussian", "Deterministic",
"ModelCatalog", "Model", "Preprocessor", "FullyConnectedNetwork", "LSTM"
]
+30 -26
View File
@@ -42,25 +42,25 @@ class Categorical(ActionDistribution):
logits=self.inputs, labels=x)
def entropy(self):
a0 = self.inputs - tf.reduce_max(self.inputs, reduction_indices=[1],
keepdims=True)
a0 = self.inputs - tf.reduce_max(
self.inputs, reduction_indices=[1], keepdims=True)
ea0 = tf.exp(a0)
z0 = tf.reduce_sum(ea0, reduction_indices=[1], keepdims=True)
p0 = ea0 / z0
return tf.reduce_sum(p0 * (tf.log(z0) - a0), reduction_indices=[1])
def kl(self, other):
a0 = self.inputs - tf.reduce_max(self.inputs, reduction_indices=[1],
keepdims=True)
a1 = other.inputs - tf.reduce_max(other.inputs, reduction_indices=[1],
keepdims=True)
a0 = self.inputs - tf.reduce_max(
self.inputs, reduction_indices=[1], keepdims=True)
a1 = other.inputs - tf.reduce_max(
other.inputs, reduction_indices=[1], keepdims=True)
ea0 = tf.exp(a0)
ea1 = tf.exp(a1)
z0 = tf.reduce_sum(ea0, reduction_indices=[1], keepdims=True)
z1 = tf.reduce_sum(ea1, reduction_indices=[1], keepdims=True)
p0 = ea0 / z0
return tf.reduce_sum(p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)),
reduction_indices=[1])
return tf.reduce_sum(
p0 * (a0 - tf.log(z0) - a1 + tf.log(z1)), reduction_indices=[1])
def sample(self):
return tf.squeeze(tf.multinomial(self.inputs, 1), axis=1)
@@ -90,22 +90,23 @@ class DiagGaussian(ActionDistribution):
self.std = tf.exp(log_std)
def logp(self, x):
return (-0.5 * tf.reduce_sum(tf.square((x - self.mean) / self.std),
reduction_indices=[1]) -
return (-0.5 * tf.reduce_sum(
tf.square((x - self.mean) / self.std), reduction_indices=[1]) -
0.5 * np.log(2.0 * np.pi) * tf.to_float(tf.shape(x)[1]) -
tf.reduce_sum(self.log_std, reduction_indices=[1]))
def kl(self, other):
assert isinstance(other, DiagGaussian)
return tf.reduce_sum(other.log_std - self.log_std +
(tf.square(self.std) +
tf.square(self.mean - other.mean)) /
(2.0 * tf.square(other.std)) - 0.5,
reduction_indices=[1])
return tf.reduce_sum(
other.log_std - self.log_std +
(tf.square(self.std) + tf.square(self.mean - other.mean)) /
(2.0 * tf.square(other.std)) - 0.5,
reduction_indices=[1])
def entropy(self):
return tf.reduce_sum(self.log_std + .5 * np.log(2.0 * np.pi * np.e),
reduction_indices=[1])
return tf.reduce_sum(
self.log_std + .5 * np.log(2.0 * np.pi * np.e),
reduction_indices=[1])
def sample(self):
out = self.mean + self.std * tf.random_normal(tf.shape(self.mean))
@@ -158,6 +159,7 @@ class MultiActionDistribution(ActionDistribution):
Args:
inputs (Tensor list): A list of tensors from which to compute samples.
"""
def __init__(self, inputs, action_space, child_distributions):
# you actually have to instantiate the child distributions
self.reshaper = Reshaper(action_space.spaces)
@@ -174,23 +176,25 @@ class MultiActionDistribution(ActionDistribution):
# Remove extra categorical dimension
if isinstance(distribution, Categorical):
split_list[i] = tf.squeeze(split_list[i], axis=-1)
log_list = np.asarray([distribution.logp(split_x) for
distribution, split_x in
zip(self.child_distributions, split_list)])
log_list = np.asarray([
distribution.logp(split_x) for distribution, split_x in zip(
self.child_distributions, split_list)
])
return np.sum(log_list)
def kl(self, other):
"""The KL-divergence between two action distributions."""
kl_list = np.asarray([distribution.kl(other_distribution) for
distribution, other_distribution in
zip(self.child_distributions,
other.child_distributions)])
kl_list = np.asarray([
distribution.kl(other_distribution)
for distribution, other_distribution in zip(
self.child_distributions, other.child_distributions)
])
return np.sum(kl_list)
def entropy(self):
"""The entropy of the action distribution."""
entropy_list = np.array([s.entropy() for s in
self.child_distributions])
entropy_list = np.array(
[s.entropy() for s in self.child_distributions])
return np.sum(entropy_list)
def sample(self):
+38 -33
View File
@@ -19,7 +19,6 @@ from ray.rllib.models.visionnet import VisionNetwork
from ray.rllib.models.lstm import LSTM
from ray.rllib.models.multiagentfcnet import MultiAgentFullyConnectedNetwork
MODEL_CONFIGS = [
# === Built-in options ===
"conv_filters", # Filter configuration
@@ -30,11 +29,9 @@ MODEL_CONFIGS = [
"grayscale", # Converts ATARI frame to 1 Channel Grayscale image
"zero_mean", # Changes frame to range from [-1, 1] if true
"extra_frameskip", # (int) for number of frames to skip
"free_log_std", # Documented in ray.rllib.models.Model
"channel_major", # Pytorch conv requires images to be channel-major
"squash_to_range", # Whether to squash the action output to space range
"use_lstm", # Whether to wrap the model with a LSTM
"max_seq_len", # Max seq len for training the LSTM, defaults to 20
"lstm_cell_size", # Size of the LSTM cell
@@ -81,8 +78,8 @@ class ModelCatalog(object):
if dist_type is None:
dist = DiagGaussian
if config.get("squash_to_range"):
dist = squash_to_range(
dist, action_space.low, action_space.high)
dist = squash_to_range(dist, action_space.low,
action_space.high)
return dist, action_space.shape[0] * 2
elif dist_type == 'deterministic':
return Deterministic, action_space.shape[0]
@@ -95,12 +92,13 @@ class ModelCatalog(object):
dist, action_size = ModelCatalog.get_action_dist(action)
child_dist.append(dist)
size += action_size
return partial(MultiActionDistribution,
child_distributions=child_dist,
action_space=action_space), size
return partial(
MultiActionDistribution,
child_distributions=child_dist,
action_space=action_space), size
raise NotImplementedError(
"Unsupported args: {} {}".format(action_space, dist_type))
raise NotImplementedError("Unsupported args: {} {}".format(
action_space, dist_type))
@staticmethod
def get_action_placeholder(action_space):
@@ -120,7 +118,7 @@ class ModelCatalog(object):
return tf.placeholder(
tf.float32, shape=(None, action_space.shape[0]), name="action")
elif isinstance(action_space, gym.spaces.Discrete):
return tf.placeholder(tf.int64, shape=(None,), name="action")
return tf.placeholder(tf.int64, shape=(None, ), name="action")
elif isinstance(action_space, gym.spaces.Tuple):
size = 0
all_discrete = True
@@ -131,15 +129,19 @@ class ModelCatalog(object):
all_discrete = False
size += np.product(action_space.spaces[i].shape)
return tf.placeholder(
tf.int64 if all_discrete else tf.float32, shape=(None, size),
tf.int64 if all_discrete else tf.float32,
shape=(None, size),
name="action")
else:
raise NotImplementedError("action space {}"
" not supported".format(action_space))
@staticmethod
def get_model(
inputs, num_outputs, options=None, state_in=None, seq_lens=None):
def get_model(inputs,
num_outputs,
options=None,
state_in=None,
seq_lens=None):
"""Returns a suitable model conforming to given input and output specs.
Args:
@@ -154,12 +156,12 @@ class ModelCatalog(object):
"""
options = options or {}
model = ModelCatalog._get_model(
inputs, num_outputs, options, state_in, seq_lens)
model = ModelCatalog._get_model(inputs, num_outputs, options, state_in,
seq_lens)
if options.get("use_lstm"):
model = LSTM(
model.last_layer, num_outputs, options, state_in, seq_lens)
model = LSTM(model.last_layer, num_outputs, options, state_in,
seq_lens)
return model
@@ -169,16 +171,20 @@ class ModelCatalog(object):
model = options["custom_model"]
print("Using custom model {}".format(model))
return _global_registry.get(RLLIB_MODEL, model)(
inputs, num_outputs, options,
state_in=state_in, seq_lens=seq_lens)
inputs,
num_outputs,
options,
state_in=state_in,
seq_lens=seq_lens)
obs_rank = len(inputs.shape) - 1
# num_outputs > 1 used to avoid hitting this with the value function
if isinstance(options.get("custom_options", {}).get(
"multiagent_fcnet_hiddens", 1), list) and num_outputs > 1:
return MultiAgentFullyConnectedNetwork(
inputs, num_outputs, options)
if isinstance(
options.get("custom_options", {}).get(
"multiagent_fcnet_hiddens", 1), list) and num_outputs > 1:
return MultiAgentFullyConnectedNetwork(inputs, num_outputs,
options)
if obs_rank > 1:
return VisionNetwork(inputs, num_outputs, options)
@@ -198,10 +204,10 @@ class ModelCatalog(object):
Returns:
model (Model): Neural network model.
"""
from ray.rllib.models.pytorch.fcnet import (
FullyConnectedNetwork as PyTorchFCNet)
from ray.rllib.models.pytorch.visionnet import (
VisionNetwork as PyTorchVisionNet)
from ray.rllib.models.pytorch.fcnet import (FullyConnectedNetwork as
PyTorchFCNet)
from ray.rllib.models.pytorch.visionnet import (VisionNetwork as
PyTorchVisionNet)
if "custom_model" in options:
model = options["custom_model"]
@@ -232,9 +238,8 @@ class ModelCatalog(object):
"""
for k in options.keys():
if k not in MODEL_CONFIGS:
raise Exception(
"Unknown config key `{}`, all keys: {}".format(
k, MODEL_CONFIGS))
raise Exception("Unknown config key `{}`, all keys: {}".format(
k, MODEL_CONFIGS))
if "custom_preprocessor" in options:
preprocessor = options["custom_preprocessor"]
@@ -271,8 +276,8 @@ class ModelCatalog(object):
preprocessor_name (str): Name to register the preprocessor under.
preprocessor_class (type): Python class of the preprocessor.
"""
_global_registry.register(
RLLIB_PREPROCESSOR, preprocessor_name, preprocessor_class)
_global_registry.register(RLLIB_PREPROCESSOR, preprocessor_name,
preprocessor_class)
@staticmethod
def register_custom_model(model_name, model_class):
+6 -3
View File
@@ -22,14 +22,17 @@ class FullyConnectedNetwork(Model):
for size in hiddens:
label = "fc{}".format(i)
last_layer = slim.fully_connected(
last_layer, size,
last_layer,
size,
weights_initializer=normc_initializer(1.0),
activation_fn=activation,
scope=label)
i += 1
label = "fc_out"
output = slim.fully_connected(
last_layer, num_outputs,
last_layer,
num_outputs,
weights_initializer=normc_initializer(0.01),
activation_fn=None, scope=label)
activation_fn=None,
scope=label)
return output, last_layer
+14 -13
View File
@@ -1,7 +1,6 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
"""LSTM support for RLlib.
The main trick here is that we add the time dimension at the last moment.
@@ -14,7 +13,6 @@ See the add_time_dimension() and chop_into_sequences() functions below for
more info.
"""
import numpy as np
import tensorflow as tf
import tensorflow.contrib.rnn as rnn
@@ -46,14 +44,13 @@ def add_time_dimension(padded_inputs, seq_lens):
# Dynamically reshape the padded batch to introduce a time dimension.
new_batch_size = padded_batch_size // max_seq_len
new_shape = (
[new_batch_size, max_seq_len] +
padded_inputs.get_shape().as_list()[1:])
new_shape = ([new_batch_size, max_seq_len] +
padded_inputs.get_shape().as_list()[1:])
return tf.reshape(padded_inputs, new_shape)
def chop_into_sequences(
time_column, feature_columns, state_columns, max_seq_len):
def chop_into_sequences(time_column, feature_columns, state_columns,
max_seq_len):
"""Truncate and pad experiences into fixed-length sequences.
Arguments:
@@ -106,7 +103,7 @@ def chop_into_sequences(
feature_sequences = []
for f in feature_columns:
f = np.array(f)
f_pad = np.zeros((len(seq_lens) * max_seq_len,) + np.shape(f)[1:])
f_pad = np.zeros((len(seq_lens) * max_seq_len, ) + np.shape(f)[1:])
seq_base = 0
i = 0
for l in seq_lens:
@@ -152,7 +149,8 @@ class LSTM(Model):
lstm = rnn.rnn_cell.BasicLSTMCell(cell_size, state_is_tuple=True)
self.state_init = [
np.zeros(lstm.state_size.c, np.float32),
np.zeros(lstm.state_size.h, np.float32)]
np.zeros(lstm.state_size.h, np.float32)
]
# Setup LSTM inputs
if self.state_in:
@@ -170,12 +168,15 @@ class LSTM(Model):
else:
state_in = rnn.rnn_cell.LSTMStateTuple(c_in, h_in)
lstm_out, lstm_state = tf.nn.dynamic_rnn(
lstm, last_layer, initial_state=state_in,
sequence_length=self.seq_lens, time_major=False)
lstm,
last_layer,
initial_state=state_in,
sequence_length=self.seq_lens,
time_major=False)
self.state_out = list(lstm_state)
# Compute outputs
last_layer = tf.reshape(lstm_out, [-1, cell_size])
logits = linear(
last_layer, num_outputs, "action", normc_initializer(0.01))
logits = linear(last_layer, num_outputs, "action",
normc_initializer(0.01))
return logits, last_layer
+27 -14
View File
@@ -11,6 +11,7 @@ def normc_initializer(std=1.0):
out = np.random.randn(*shape).astype(np.float32)
out *= std / np.sqrt(np.square(out).sum(axis=0, keepdims=True))
return tf.constant(out)
return _initializer
@@ -18,12 +19,20 @@ def get_activation_fn(name):
return getattr(tf.nn, name)
def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME",
dtype=tf.float32, collections=None):
def conv2d(x,
num_filters,
name,
filter_size=(3, 3),
stride=(1, 1),
pad="SAME",
dtype=tf.float32,
collections=None):
with tf.variable_scope(name):
stride_shape = [1, stride[0], stride[1], 1]
filter_shape = [filter_size[0], filter_size[1], int(x.get_shape()[3]),
num_filters]
filter_shape = [
filter_size[0], filter_size[1],
int(x.get_shape()[3]), num_filters
]
# There are "num input feature maps * filter height * filter width"
# inputs to each hidden unit.
@@ -34,20 +43,24 @@ def conv2d(x, num_filters, name, filter_size=(3, 3), stride=(1, 1), pad="SAME",
# Initialize weights with random weights.
w_bound = np.sqrt(6 / (fan_in + fan_out))
w = tf.get_variable("W", filter_shape, dtype,
tf.random_uniform_initializer(-w_bound, w_bound),
collections=collections)
b = tf.get_variable("b", [1, 1, 1, num_filters],
initializer=tf.constant_initializer(0.0),
collections=collections)
w = tf.get_variable(
"W",
filter_shape,
dtype,
tf.random_uniform_initializer(-w_bound, w_bound),
collections=collections)
b = tf.get_variable(
"b", [1, 1, 1, num_filters],
initializer=tf.constant_initializer(0.0),
collections=collections)
return tf.nn.conv2d(x, w, stride_shape, pad) + b
def linear(x, size, name, initializer=None, bias_init=0):
w = tf.get_variable(name + "/w", [x.get_shape()[1], size],
initializer=initializer)
b = tf.get_variable(name + "/b", [size],
initializer=tf.constant_initializer(bias_init))
w = tf.get_variable(
name + "/w", [x.get_shape()[1], size], initializer=initializer)
b = tf.get_variable(
name + "/b", [size], initializer=tf.constant_initializer(bias_init))
return tf.matmul(x, w) + b
+10 -4
View File
@@ -37,8 +37,12 @@ class Model(object):
a scale parameter (like a standard deviation).
"""
def __init__(
self, inputs, num_outputs, options, state_in=None, seq_lens=None):
def __init__(self,
inputs,
num_outputs,
options,
state_in=None,
seq_lens=None):
self.inputs = inputs
# Default attribute values for the non-RNN case
@@ -57,8 +61,10 @@ class Model(object):
self.outputs, self.last_layer = self._build_layers(
inputs, num_outputs, options)
if options.get("free_log_std", False):
log_std = tf.get_variable(name="log_std", shape=[num_outputs],
initializer=tf.zeros_initializer)
log_std = tf.get_variable(
name="log_std",
shape=[num_outputs],
initializer=tf.zeros_initializer)
self.outputs = tf.concat(
[self.outputs, 0.0 * self.outputs + log_std], 1)
+3 -3
View File
@@ -23,7 +23,7 @@ class MultiAgentFullyConnectedNetwork(Model):
custom_options = options["custom_options"]
hiddens = custom_options.get("multiagent_fcnet_hiddens",
[[256, 256]]*1)
[[256, 256]] * 1)
# check for a shared model
shared_model = custom_options.get("multiagent_shared_model", 0)
@@ -35,8 +35,8 @@ class MultiAgentFullyConnectedNetwork(Model):
sub_options = options.copy()
sub_options.update({"fcnet_hiddens": hiddens[i]})
# TODO(ev) make this support arbitrary networks
fcnet = FullyConnectedNetwork(
split_inputs[i], int(num_actions[i]), sub_options)
fcnet = FullyConnectedNetwork(split_inputs[i],
int(num_actions[i]), sub_options)
output = fcnet.outputs
outputs.append(output)
overall_output = tf.concat(outputs, axis=1)
+6 -5
View File
@@ -6,7 +6,7 @@ import numpy as np
import gym
ATARI_OBS_SHAPE = (210, 160, 3)
ATARI_RAM_OBS_SHAPE = (128,)
ATARI_RAM_OBS_SHAPE = (128, )
class Preprocessor(object):
@@ -70,7 +70,7 @@ class AtariPixelPreprocessor(Preprocessor):
class AtariRamPreprocessor(Preprocessor):
def _init(self):
self.shape = (128,)
self.shape = (128, )
def transform(self, observation):
return (observation - 128) / 128
@@ -78,7 +78,7 @@ class AtariRamPreprocessor(Preprocessor):
class OneHotPreprocessor(Preprocessor):
def _init(self):
self.shape = (self._obs_space.n,)
self.shape = (self._obs_space.n, )
def transform(self, observation):
arr = np.zeros(self._obs_space.n)
@@ -111,13 +111,14 @@ class TupleFlatteningPreprocessor(Preprocessor):
preprocessor = get_preprocessor(space)(space, self._options)
self.preprocessors.append(preprocessor)
size += np.product(preprocessor.shape)
self.shape = (size,)
self.shape = (size, )
def transform(self, observation):
assert len(observation) == len(self.preprocessors), observation
return np.concatenate([
np.reshape(p.transform(o), [np.product(p.shape)])
for (o, p) in zip(observation, self.preprocessors)])
for (o, p) in zip(observation, self.preprocessors)
])
def get_preprocessor(space):
+19 -6
View File
@@ -22,14 +22,27 @@ class VisionNetwork(Model):
with tf.name_scope("vision_net"):
for i, (out_size, kernel, stride) in enumerate(filters[:-1], 1):
inputs = slim.conv2d(
inputs, out_size, kernel, stride,
activation_fn=activation, scope="conv{}".format(i))
inputs,
out_size,
kernel,
stride,
activation_fn=activation,
scope="conv{}".format(i))
out_size, kernel, stride = filters[-1]
fc1 = slim.conv2d(
inputs, out_size, kernel, stride,
activation_fn=activation, padding="VALID", scope="fc1")
fc2 = slim.conv2d(fc1, num_outputs, [1, 1], activation_fn=None,
normalizer_fn=None, scope="fc2")
inputs,
out_size,
kernel,
stride,
activation_fn=activation,
padding="VALID",
scope="fc1")
fc2 = slim.conv2d(
fc1,
num_outputs, [1, 1],
activation_fn=None,
normalizer_fn=None,
scope="fc2")
return flatten(fc2), flatten(fc1)
-1
View File
@@ -6,7 +6,6 @@ from ray.rllib.optimizers.sync_samples_optimizer import SyncSamplesOptimizer
from ray.rllib.optimizers.sync_replay_optimizer import SyncReplayOptimizer
from ray.rllib.optimizers.multi_gpu_optimizer import LocalMultiGPUOptimizer
__all__ = [
"PolicyOptimizer", "AsyncSamplesOptimizer", "AsyncGradientsOptimizer",
"SyncSamplesOptimizer", "SyncReplayOptimizer", "LocalMultiGPUOptimizer"
@@ -14,6 +14,7 @@ class AsyncGradientsOptimizer(PolicyOptimizer):
evaluators, sending updated weights back as needed. This pipelines the
gradient computations on the remote workers.
"""
def _init(self, grads_per_step=100):
self.apply_timer = TimerStat()
self.wait_timer = TimerStat()
@@ -55,8 +56,9 @@ class AsyncGradientsOptimizer(PolicyOptimizer):
num_gradients += 1
def stats(self):
return dict(PolicyOptimizer.stats(self), **{
"wait_time_ms": round(1000 * self.wait_timer.mean, 3),
"apply_time_ms": round(1000 * self.apply_timer.mean, 3),
"dispatch_time_ms": round(1000 * self.dispatch_timer.mean, 3),
})
return dict(
PolicyOptimizer.stats(self), **{
"wait_time_ms": round(1000 * self.wait_timer.mean, 3),
"apply_time_ms": round(1000 * self.apply_timer.mean, 3),
"dispatch_time_ms": round(1000 * self.dispatch_timer.mean, 3),
})
@@ -22,7 +22,6 @@ from ray.rllib.utils.actors import TaskPool, create_colocated
from ray.rllib.utils.timer import TimerStat
from ray.rllib.utils.window_stat import WindowStat
SAMPLE_QUEUE_DEPTH = 2
REPLAY_QUEUE_DEPTH = 4
LEARNER_QUEUE_MAX_SIZE = 16
@@ -35,10 +34,10 @@ class ReplayActor(object):
Ray actors are single-threaded, so for scalability multiple replay actors
may be created to increase parallelism."""
def __init__(
self, num_shards, learning_starts, buffer_size, train_batch_size,
prioritized_replay_alpha, prioritized_replay_beta,
prioritized_replay_eps, clip_rewards):
def __init__(self, num_shards, learning_starts, buffer_size,
train_batch_size, prioritized_replay_alpha,
prioritized_replay_beta, prioritized_replay_eps,
clip_rewards):
self.replay_starts = learning_starts // num_shards
self.buffer_size = buffer_size // num_shards
self.train_batch_size = train_batch_size
@@ -46,7 +45,8 @@ class ReplayActor(object):
self.prioritized_replay_eps = prioritized_replay_eps
self.replay_buffer = PrioritizedReplayBuffer(
self.buffer_size, alpha=prioritized_replay_alpha,
self.buffer_size,
alpha=prioritized_replay_alpha,
clip_rewards=clip_rewards)
# Metrics
@@ -60,38 +60,39 @@ class ReplayActor(object):
def add_batch(self, batch):
with self.add_batch_timer:
for row in batch.rows():
self.replay_buffer.add(
row["obs"], row["actions"], row["rewards"], row["new_obs"],
row["dones"], row["weights"])
self.replay_buffer.add(row["obs"], row["actions"],
row["rewards"], row["new_obs"],
row["dones"], row["weights"])
def replay(self):
with self.replay_timer:
if len(self.replay_buffer) < self.replay_starts:
return None
(obses_t, actions, rewards, obses_tp1,
dones, weights, batch_indexes) = self.replay_buffer.sample(
self.train_batch_size,
beta=self.prioritized_replay_beta)
(obses_t, actions, rewards, obses_tp1, dones, weights,
batch_indexes) = self.replay_buffer.sample(
self.train_batch_size, beta=self.prioritized_replay_beta)
batch = SampleBatch({
"obs": obses_t, "actions": actions, "rewards": rewards,
"new_obs": obses_tp1, "dones": dones, "weights": weights,
"batch_indexes": batch_indexes})
"obs": obses_t,
"actions": actions,
"rewards": rewards,
"new_obs": obses_tp1,
"dones": dones,
"weights": weights,
"batch_indexes": batch_indexes
})
return batch
def update_priorities(self, batch_indexes, td_errors):
with self.update_priorities_timer:
new_priorities = (
np.abs(td_errors) + self.prioritized_replay_eps)
new_priorities = (np.abs(td_errors) + self.prioritized_replay_eps)
self.replay_buffer.update_priorities(batch_indexes, new_priorities)
def stats(self):
stat = {
"add_batch_time_ms": round(
1000 * self.add_batch_timer.mean, 3),
"replay_time_ms": round(
1000 * self.replay_timer.mean, 3),
"add_batch_time_ms": round(1000 * self.add_batch_timer.mean, 3),
"replay_time_ms": round(1000 * self.replay_timer.mean, 3),
"update_priorities_time_ms": round(
1000 * self.update_priorities_timer.mean, 3),
}
@@ -145,13 +146,19 @@ class AsyncSamplesOptimizer(PolicyOptimizer):
"td_error" array in the info return of compute_gradients(). This error
term will be used for sample prioritization."""
def _init(
self, learning_starts=1000, buffer_size=10000,
prioritized_replay=True, prioritized_replay_alpha=0.6,
prioritized_replay_beta=0.4, prioritized_replay_eps=1e-6,
train_batch_size=512, sample_batch_size=50,
num_replay_buffer_shards=1, max_weight_sync_delay=400,
clip_rewards=True, debug=False):
def _init(self,
learning_starts=1000,
buffer_size=10000,
prioritized_replay=True,
prioritized_replay_alpha=0.6,
prioritized_replay_beta=0.4,
prioritized_replay_eps=1e-6,
train_batch_size=512,
sample_batch_size=50,
num_replay_buffer_shards=1,
max_weight_sync_delay=400,
clip_rewards=True,
debug=False):
self.debug = debug
self.replay_starts = learning_starts
@@ -164,18 +171,21 @@ class AsyncSamplesOptimizer(PolicyOptimizer):
self.learner = LearnerThread(self.local_evaluator)
self.learner.start()
self.replay_actors = create_colocated(
ReplayActor,
[num_replay_buffer_shards, learning_starts, buffer_size,
train_batch_size, prioritized_replay_alpha,
prioritized_replay_beta, prioritized_replay_eps, clip_rewards],
num_replay_buffer_shards)
self.replay_actors = create_colocated(ReplayActor, [
num_replay_buffer_shards, learning_starts, buffer_size,
train_batch_size, prioritized_replay_alpha,
prioritized_replay_beta, prioritized_replay_eps, clip_rewards
], num_replay_buffer_shards)
assert len(self.remote_evaluators) > 0
# Stats
self.timers = {k: TimerStat() for k in [
"put_weights", "get_samples", "enqueue", "sample_processing",
"replay_processing", "update_priorities", "train", "sample"]}
self.timers = {
k: TimerStat()
for k in [
"put_weights", "get_samples", "enqueue", "sample_processing",
"replay_processing", "update_priorities", "train", "sample"
]
}
self.num_weight_syncs = 0
self.learning_started = False
@@ -221,8 +231,8 @@ class AsyncSamplesOptimizer(PolicyOptimizer):
sample_timesteps += self.sample_batch_size
# Send the data to the replay buffer
random.choice(self.replay_actors).add_batch.remote(
sample_batch)
random.choice(
self.replay_actors).add_batch.remote(sample_batch)
# Update weights if needed
self.steps_since_update[ev] += self.sample_batch_size
@@ -268,8 +278,8 @@ class AsyncSamplesOptimizer(PolicyOptimizer):
timing["learner_dequeue_time_ms"] = round(
1000 * self.learner.queue_timer.mean, 3)
stats = {
"sample_throughput": round(
self.timers["sample"].mean_throughput, 3),
"sample_throughput": round(self.timers["sample"].mean_throughput,
3),
"train_throughput": round(self.timers["train"].mean_throughput, 3),
"num_weight_syncs": self.num_weight_syncs,
}
+21 -20
View File
@@ -6,7 +6,6 @@ from collections import namedtuple
import tensorflow as tf
# Variable scope in which created variables will be placed under
TOWER_SCOPE_NAME = "tower"
@@ -47,8 +46,14 @@ class LocalSyncParallelOptimizer(object):
grad_norm_clipping: None or int stdev to clip grad norms by
"""
def __init__(self, optimizer, devices, input_placeholders, rnn_inputs,
per_device_batch_size, build_graph, logdir,
def __init__(self,
optimizer,
devices,
input_placeholders,
rnn_inputs,
per_device_batch_size,
build_graph,
logdir,
grad_norm_clipping=None):
# TODO(rliaw): remove logdir
self.optimizer = optimizer
@@ -78,8 +83,8 @@ class LocalSyncParallelOptimizer(object):
self._towers = []
for device, device_placeholders in zip(self.devices, data_splits):
self._towers.append(
self._setup_device(
device, device_placeholders, len(input_placeholders)))
self._setup_device(device, device_placeholders,
len(input_placeholders)))
avg = average_gradients([t.grads for t in self._towers])
if grad_norm_clipping:
@@ -119,14 +124,10 @@ class LocalSyncParallelOptimizer(object):
assert len(state_inputs[0]) * seq_len == len(inputs[0])
# Make sure the shorter state inputs arrays are evenly divisible
state_inputs = [
make_divisible_by(arr, self.batch_size)
for arr in state_inputs
make_divisible_by(arr, self.batch_size) for arr in state_inputs
]
# Then truncate the data inputs to match
inputs = [
arr[:len(state_inputs[0]) * seq_len]
for arr in inputs
]
inputs = [arr[:len(state_inputs[0]) * seq_len] for arr in inputs]
assert len(state_inputs[0]) * seq_len == len(inputs[0])
assert len(state_inputs[0]) % self.batch_size == 0
for ph, arr in zip(self.loss_inputs, inputs + state_inputs):
@@ -138,8 +139,7 @@ class LocalSyncParallelOptimizer(object):
feed_dict[ph] = truncated_arr
truncated_len = len(truncated_arr)
sess.run(
[t.init_op for t in self._towers], feed_dict=feed_dict)
sess.run([t.init_op for t in self._towers], feed_dict=feed_dict)
tuples_per_device = truncated_len / len(self.devices)
assert tuples_per_device > 0, \
@@ -198,7 +198,9 @@ class LocalSyncParallelOptimizer(object):
device_input_slices = []
for i, ph in enumerate(device_input_placeholders):
current_batch = tf.Variable(
ph, trainable=False, validate_shape=False,
ph,
trainable=False,
validate_shape=False,
collections=[])
device_input_batches.append(current_batch)
if i < num_data_in:
@@ -210,18 +212,17 @@ class LocalSyncParallelOptimizer(object):
current_slice = tf.slice(
current_batch,
([self._batch_index // scale * granularity] +
[0] * len(ph.shape[1:])),
[0] * len(ph.shape[1:])),
([self.per_device_batch_size // scale * granularity] +
[-1] * len(ph.shape[1:])))
[-1] * len(ph.shape[1:])))
current_slice.set_shape(ph.shape)
device_input_slices.append(current_slice)
graph_obj = self.build_graph(device_input_slices)
device_grads = graph_obj.gradients(self.optimizer)
return Tower(
tf.group(*[batch.initializer
for batch in device_input_batches]),
device_grads,
graph_obj)
tf.group(
*[batch.initializer for batch in device_input_batches]),
device_grads, graph_obj)
# Each tower is a copy of the loss graph pinned to a specific device.
@@ -30,8 +30,12 @@ class LocalMultiGPUOptimizer(PolicyOptimizer):
may result in unexpected behavior.
"""
def _init(self, sgd_batch_size=128, sgd_stepsize=5e-5, num_sgd_iter=10,
timesteps_per_batch=1024, standardize_fields=[]):
def _init(self,
sgd_batch_size=128,
sgd_stepsize=5e-5,
num_sgd_iter=10,
timesteps_per_batch=1024,
standardize_fields=[]):
self.batch_size = sgd_batch_size
self.sgd_stepsize = sgd_stepsize
self.num_sgd_iter = num_sgd_iter
@@ -41,8 +45,8 @@ class LocalMultiGPUOptimizer(PolicyOptimizer):
self.devices = ["/cpu:0"]
else:
self.devices = ["/gpu:{}".format(i) for i in range(len(gpu_ids))]
self.batch_size = int(
sgd_batch_size / len(self.devices)) * len(self.devices)
self.batch_size = int(sgd_batch_size / len(self.devices)) * len(
self.devices)
assert self.batch_size % len(self.devices) == 0
assert self.batch_size >= len(self.devices), "batch size too small"
self.per_device_batch_size = int(self.batch_size / len(self.devices))
@@ -70,16 +74,15 @@ class LocalMultiGPUOptimizer(PolicyOptimizer):
with tf.variable_scope("default", reuse=tf.AUTO_REUSE):
if self.policy._state_inputs:
rnn_inputs = self.policy._state_inputs + [
self.policy._seq_lens]
self.policy._seq_lens
]
else:
rnn_inputs = []
self.par_opt = LocalSyncParallelOptimizer(
tf.train.AdamOptimizer(self.sgd_stepsize),
self.devices,
[v for _, v in self.policy.loss_inputs()],
rnn_inputs,
self.per_device_batch_size,
self.policy.copy,
tf.train.AdamOptimizer(
self.sgd_stepsize), self.devices,
[v for _, v in self.policy.loss_inputs()], rnn_inputs,
self.per_device_batch_size, self.policy.copy,
os.getcwd())
self.sess = self.local_evaluator.tf_sess
@@ -117,8 +120,7 @@ class LocalMultiGPUOptimizer(PolicyOptimizer):
else:
state_keys = []
tuples_per_device = self.par_opt.load_data(
self.sess,
[tuples[k] for k in data_keys],
self.sess, [tuples[k] for k in data_keys],
[tuples[k] for k in state_keys])
with self.grad_timer:
@@ -141,12 +143,14 @@ class LocalMultiGPUOptimizer(PolicyOptimizer):
return _averaged(iter_extra_fetches)
def stats(self):
return dict(PolicyOptimizer.stats(self), **{
"sample_time_ms": round(1000 * self.sample_timer.mean, 3),
"load_time_ms": round(1000 * self.load_timer.mean, 3),
"grad_time_ms": round(1000 * self.grad_timer.mean, 3),
"update_time_ms": round(1000 * self.update_weights_timer.mean, 3),
})
return dict(
PolicyOptimizer.stats(self), **{
"sample_time_ms": round(1000 * self.sample_timer.mean, 3),
"load_time_ms": round(1000 * self.load_timer.mean, 3),
"grad_time_ms": round(1000 * self.grad_timer.mean, 3),
"update_time_ms": round(1000 * self.update_weights_timer.mean,
3),
})
def _averaged(kv):
@@ -103,9 +103,10 @@ class PolicyOptimizer(object):
"""
local_result = [func(self.local_evaluator, 0)]
remote_results = ray.get(
[ev.apply.remote(func, i + 1)
for i, ev in enumerate(self.remote_evaluators)])
remote_results = ray.get([
ev.apply.remote(func, i + 1)
for i, ev in enumerate(self.remote_evaluators)
])
return local_result + remote_results
def collect_metrics(self):
+13 -11
View File
@@ -90,8 +90,10 @@ class ReplayBuffer(object):
done_mask[i] = 1 if executing act_batch[i] resulted in
the end of an episode and 0 otherwise.
"""
idxes = [random.randint(0, len(self._storage) - 1)
for _ in range(batch_size)]
idxes = [
random.randint(0,
len(self._storage) - 1) for _ in range(batch_size)
]
self._num_sampled += batch_size
return self._encode_sample(idxes)
@@ -142,12 +144,12 @@ class PrioritizedReplayBuffer(ReplayBuffer):
reward = np.sign(reward)
idx = self._next_idx
super(PrioritizedReplayBuffer, self).add(
obs_t, action, reward, obs_tp1, done, weight)
super(PrioritizedReplayBuffer, self).add(obs_t, action, reward,
obs_tp1, done, weight)
if weight is None:
weight = self._max_priority
self._it_sum[idx] = weight ** self._alpha
self._it_min[idx] = weight ** self._alpha
self._it_sum[idx] = weight**self._alpha
self._it_min[idx] = weight**self._alpha
def _sample_proportional(self, batch_size):
res = []
@@ -202,11 +204,11 @@ class PrioritizedReplayBuffer(ReplayBuffer):
weights = []
p_min = self._it_min.min() / self._it_sum.sum()
max_weight = (p_min * len(self._storage)) ** (-beta)
max_weight = (p_min * len(self._storage))**(-beta)
for idx in idxes:
p_sample = self._it_sum[idx] / self._it_sum.sum()
weight = (p_sample * len(self._storage)) ** (-beta)
weight = (p_sample * len(self._storage))**(-beta)
weights.append(weight / max_weight)
weights = np.array(weights)
encoded_sample = self._encode_sample(idxes)
@@ -231,10 +233,10 @@ class PrioritizedReplayBuffer(ReplayBuffer):
for idx, priority in zip(idxes, priorities):
assert priority > 0
assert 0 <= idx < len(self._storage)
delta = priority ** self._alpha - self._it_sum[idx]
delta = priority**self._alpha - self._it_sum[idx]
self._prio_change_stats.push(delta)
self._it_sum[idx] = priority ** self._alpha
self._it_min[idx] = priority ** self._alpha
self._it_sum[idx] = priority**self._alpha
self._it_min[idx] = priority**self._alpha
self._max_priority = max(self._max_priority, priority)
+5 -11
View File
@@ -54,8 +54,7 @@ class SegmentTree(object):
return self._operation(
self._reduce_helper(start, mid, 2 * node, node_start, mid),
self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1,
node_end)
)
node_end))
def reduce(self, start=0, end=None):
"""Returns result of applying `self.operation`
@@ -89,9 +88,8 @@ class SegmentTree(object):
self._value[idx] = val
idx //= 2
while idx >= 1:
self._value[idx] = self._operation(
self._value[2 * idx],
self._value[2 * idx + 1])
self._value[idx] = self._operation(self._value[2 * idx],
self._value[2 * idx + 1])
idx //= 2
def __getitem__(self, idx):
@@ -102,9 +100,7 @@ class SegmentTree(object):
class SumSegmentTree(SegmentTree):
def __init__(self, capacity):
super(SumSegmentTree, self).__init__(
capacity=capacity,
operation=operator.add,
neutral_element=0.0)
capacity=capacity, operation=operator.add, neutral_element=0.0)
def sum(self, start=0, end=None):
"""Returns arr[start] + ... + arr[end]"""
@@ -142,9 +138,7 @@ class SumSegmentTree(SegmentTree):
class MinSegmentTree(SegmentTree):
def __init__(self, capacity):
super(MinSegmentTree, self).__init__(
capacity=capacity,
operation=min,
neutral_element=float('inf'))
capacity=capacity, operation=min, neutral_element=float('inf'))
def min(self, start=0, end=None):
"""Returns min(arr[start], ..., arr[end])"""
@@ -23,11 +23,16 @@ class SyncReplayOptimizer(PolicyOptimizer):
"td_error" array in the info return of compute_gradients(). This error
term will be used for sample prioritization."""
def _init(
self, learning_starts=1000, buffer_size=10000,
prioritized_replay=True, prioritized_replay_alpha=0.6,
prioritized_replay_beta=0.4, prioritized_replay_eps=1e-6,
train_batch_size=32, sample_batch_size=4, clip_rewards=True):
def _init(self,
learning_starts=1000,
buffer_size=10000,
prioritized_replay=True,
prioritized_replay_alpha=0.6,
prioritized_replay_beta=0.4,
prioritized_replay_eps=1e-6,
train_batch_size=32,
sample_batch_size=4,
clip_rewards=True):
self.replay_starts = learning_starts
self.prioritized_replay_beta = prioritized_replay_beta
@@ -43,11 +48,14 @@ class SyncReplayOptimizer(PolicyOptimizer):
# Set up replay buffer
if prioritized_replay:
def new_buffer():
return PrioritizedReplayBuffer(
buffer_size, alpha=prioritized_replay_alpha,
buffer_size,
alpha=prioritized_replay_alpha,
clip_rewards=clip_rewards)
else:
def new_buffer():
return ReplayBuffer(buffer_size, clip_rewards)
@@ -72,17 +80,19 @@ class SyncReplayOptimizer(PolicyOptimizer):
# Handle everything as if multiagent
if isinstance(batch, SampleBatch):
batch = MultiAgentBatch(
{DEFAULT_POLICY_ID: batch}, batch.count)
batch = MultiAgentBatch({
DEFAULT_POLICY_ID: batch
}, batch.count)
for policy_id, s in batch.policy_batches.items():
for row in s.rows():
if "weights" not in row:
row["weights"] = np.ones_like(row["rewards"])
self.replay_buffers[policy_id].add(
pack_if_needed(row["obs"]), row["actions"],
row["rewards"], pack_if_needed(row["new_obs"]),
row["dones"], row["weights"])
pack_if_needed(row["obs"]),
row["actions"], row["rewards"],
pack_if_needed(row["new_obs"]), row["dones"],
row["weights"])
if self.num_steps_sampled >= self.replay_starts:
self._optimize()
@@ -112,27 +122,35 @@ class SyncReplayOptimizer(PolicyOptimizer):
with self.replay_timer:
for policy_id, replay_buffer in self.replay_buffers.items():
if isinstance(replay_buffer, PrioritizedReplayBuffer):
(obses_t, actions, rewards, obses_tp1,
dones, weights, batch_indexes) = replay_buffer.sample(
self.train_batch_size,
beta=self.prioritized_replay_beta)
(obses_t, actions, rewards, obses_tp1, dones, weights,
batch_indexes) = replay_buffer.sample(
self.train_batch_size,
beta=self.prioritized_replay_beta)
else:
(obses_t, actions, rewards, obses_tp1,
dones) = replay_buffer.sample(self.train_batch_size)
dones) = replay_buffer.sample(self.train_batch_size)
weights = np.ones_like(rewards)
batch_indexes = - np.ones_like(rewards)
batch_indexes = -np.ones_like(rewards)
samples[policy_id] = SampleBatch({
"obs": obses_t, "actions": actions, "rewards": rewards,
"new_obs": obses_tp1, "dones": dones, "weights": weights,
"batch_indexes": batch_indexes})
"obs": obses_t,
"actions": actions,
"rewards": rewards,
"new_obs": obses_tp1,
"dones": dones,
"weights": weights,
"batch_indexes": batch_indexes
})
return MultiAgentBatch(samples, self.train_batch_size)
def stats(self):
return dict(PolicyOptimizer.stats(self), **{
"sample_time_ms": round(1000 * self.sample_timer.mean, 3),
"replay_time_ms": round(1000 * self.replay_timer.mean, 3),
"grad_time_ms": round(1000 * self.grad_timer.mean, 3),
"update_time_ms": round(1000 * self.update_weights_timer.mean, 3),
"opt_peak_throughput": round(self.grad_timer.mean_throughput, 3),
"opt_samples": round(self.grad_timer.mean_units_processed, 3),
})
return dict(
PolicyOptimizer.stats(self), **{
"sample_time_ms": round(1000 * self.sample_timer.mean, 3),
"replay_time_ms": round(1000 * self.replay_timer.mean, 3),
"grad_time_ms": round(1000 * self.grad_timer.mean, 3),
"update_time_ms": round(1000 * self.update_weights_timer.mean,
3),
"opt_peak_throughput": round(self.grad_timer.mean_throughput,
3),
"opt_samples": round(self.grad_timer.mean_units_processed, 3),
})
@@ -51,10 +51,13 @@ class SyncSamplesOptimizer(PolicyOptimizer):
return fetches
def stats(self):
return dict(PolicyOptimizer.stats(self), **{
"sample_time_ms": round(1000 * self.sample_timer.mean, 3),
"grad_time_ms": round(1000 * self.grad_timer.mean, 3),
"update_time_ms": round(1000 * self.update_weights_timer.mean, 3),
"opt_peak_throughput": round(self.grad_timer.mean_throughput, 3),
"opt_samples": round(self.grad_timer.mean_units_processed, 3),
})
return dict(
PolicyOptimizer.stats(self), **{
"sample_time_ms": round(1000 * self.sample_timer.mean, 3),
"grad_time_ms": round(1000 * self.grad_timer.mean, 3),
"update_time_ms": round(1000 * self.update_weights_timer.mean,
3),
"opt_peak_throughput": round(self.grad_timer.mean_throughput,
3),
"opt_samples": round(self.grad_timer.mean_units_processed, 3),
})
+17 -11
View File
@@ -15,7 +15,6 @@ from ray.rllib.agents.agent import get_agent_class
from ray.rllib.agents.dqn.common.wrappers import wrap_dqn
from ray.rllib.models import ModelCatalog
EXAMPLE_USAGE = """
Example Usage via RLlib CLI:
rllib rollout /tmp/ray/checkpoint_dir/checkpoint-0 --run DQN
@@ -32,30 +31,37 @@ def create_parser(parser_creator=None):
parser = parser_creator(
formatter_class=argparse.RawDescriptionHelpFormatter,
description="Roll out a reinforcement learning agent "
"given a checkpoint.", epilog=EXAMPLE_USAGE)
"given a checkpoint.",
epilog=EXAMPLE_USAGE)
parser.add_argument(
"checkpoint", type=str, help="Checkpoint from which to roll out.")
required_named = parser.add_argument_group("required named arguments")
required_named.add_argument(
"--run", type=str, required=True,
"--run",
type=str,
required=True,
help="The algorithm or model to train. This may refer to the name "
"of a built-on algorithm (e.g. RLLib's DQN or PPO), or a "
"user-defined trainable function or class registered in the "
"tune registry.")
"of a built-on algorithm (e.g. RLLib's DQN or PPO), or a "
"user-defined trainable function or class registered in the "
"tune registry.")
required_named.add_argument(
"--env", type=str, help="The gym environment to use.")
parser.add_argument(
"--no-render", default=False, action="store_const", const=True,
"--no-render",
default=False,
action="store_const",
const=True,
help="Surpress rendering of the environment.")
parser.add_argument(
"--steps", default=None, help="Number of steps to roll out.")
parser.add_argument("--out", default=None, help="Output filename.")
parser.add_argument(
"--out", default=None, help="Output filename.")
parser.add_argument(
"--config", default="{}", type=json.loads,
"--config",
default="{}",
type=json.loads,
help="Algorithm-specific configuration (e.g. env, hyperparams). "
"Surpresses loading of configuration from checkpoint.")
"Surpresses loading of configuration from checkpoint.")
return parser
-1
View File
@@ -9,7 +9,6 @@ import argparse
from ray.rllib import train
from ray.rllib import rollout
EXAMPLE_USAGE = """
Example usage for training:
rllib train --run DQN --env CartPole-v0
+5 -4
View File
@@ -15,16 +15,17 @@ class _MockEvaluator(object):
self._sample_count = sample_count
self.obs_filter = MeanStdFilter(())
self.rew_filter = MeanStdFilter(())
self.filters = {"obs_filter": self.obs_filter,
"rew_filter": self.rew_filter}
self.filters = {
"obs_filter": self.obs_filter,
"rew_filter": self.rew_filter
}
def sample(self):
samples_dict = {"observations": [], "rewards": []}
for i in range(self._sample_count):
samples_dict["observations"].append(
self.obs_filter(np.random.randn()))
samples_dict["rewards"].append(
self.rew_filter(np.random.randn()))
samples_dict["rewards"].append(self.rew_filter(np.random.randn()))
return SampleBatch(samples_dict)
def compute_gradients(self, samples):
+6 -4
View File
@@ -8,8 +8,8 @@ import ray
from ray.rllib.models import ModelCatalog
from ray.rllib.models.model import Model
from ray.rllib.models.preprocessors import (
NoPreprocessor, OneHotPreprocessor, Preprocessor)
from ray.rllib.models.preprocessors import (NoPreprocessor, OneHotPreprocessor,
Preprocessor)
from ray.rllib.models.fcnet import FullyConnectedNetwork
from ray.rllib.models.visionnet import VisionNetwork
@@ -44,9 +44,11 @@ class ModelCatalogTest(unittest.TestCase):
class TupleEnv(object):
def __init__(self):
self.observation_space = Tuple(
[Discrete(5), Box(0, 1, shape=(3,), dtype=np.float32)])
[Discrete(5),
Box(0, 1, shape=(3, ), dtype=np.float32)])
p1 = ModelCatalog.get_preprocessor(TupleEnv())
self.assertEqual(p1.shape, (8,))
self.assertEqual(p1.shape, (8, ))
self.assertEqual(
list(p1.transform((0, [1, 2, 3]))),
[float(x) for x in [1, 0, 0, 0, 0, 1, 2, 3]])
@@ -20,12 +20,24 @@ def get_mean_action(alg, obs):
ray.init(num_cpus=10)
CONFIGS = {
"ES": {"episodes_per_batch": 10, "timesteps_per_batch": 100,
"num_workers": 2},
"ES": {
"episodes_per_batch": 10,
"timesteps_per_batch": 100,
"num_workers": 2
},
"DQN": {},
"DDPG": {"noise_scale": 0.0, "timesteps_per_iteration": 100},
"PPO": {"num_sgd_iter": 5, "timesteps_per_batch": 1000, "num_workers": 2},
"A3C": {"num_workers": 1},
"DDPG": {
"noise_scale": 0.0,
"timesteps_per_iteration": 100
},
"PPO": {
"num_sgd_iter": 5,
"timesteps_per_batch": 1000,
"num_workers": 2
},
"A3C": {
"num_workers": 1
},
}
+9 -7
View File
@@ -13,7 +13,7 @@ from ray.rllib.test.mock_evaluator import _MockEvaluator
class RunningStatTest(unittest.TestCase):
def testRunningStat(self):
for shp in ((), (3,), (3, 4)):
for shp in ((), (3, ), (3, 4)):
li = []
rs = RunningStat(shp)
for _ in range(5):
@@ -22,12 +22,12 @@ class RunningStatTest(unittest.TestCase):
li.append(val)
m = np.mean(li, axis=0)
self.assertTrue(np.allclose(rs.mean, m))
v = (np.square(m) if (len(li) == 1)
else np.var(li, ddof=1, axis=0))
v = (np.square(m)
if (len(li) == 1) else np.var(li, ddof=1, axis=0))
self.assertTrue(np.allclose(rs.var, v))
def testCombiningStat(self):
for shape in [(), (3,), (3, 4)]:
for shape in [(), (3, ), (3, 4)]:
li = []
rs1 = RunningStat(shape)
rs2 = RunningStat(shape)
@@ -48,7 +48,7 @@ class RunningStatTest(unittest.TestCase):
class MSFTest(unittest.TestCase):
def testBasic(self):
for shape in [(), (3,), (3, 4, 4)]:
for shape in [(), (3, ), (3, 4, 4)]:
filt = MeanStdFilter(shape)
for i in range(5):
filt(np.ones(shape))
@@ -93,8 +93,10 @@ class FilterManagerTest(unittest.TestCase):
remote_e = RemoteEvaluator.remote(sample_count=10)
remote_e.sample.remote()
FilterManager.synchronize(
{"obs_filter": filt1, "rew_filter": filt1.copy()}, [remote_e])
FilterManager.synchronize({
"obs_filter": filt1,
"rew_filter": filt1.copy()
}, [remote_e])
filters = ray.get(remote_e.get_filters.remote())
obs_f = filters["obs_filter"]
+7 -14
View File
@@ -10,22 +10,15 @@ from ray.rllib.models.lstm import chop_into_sequences
class LSTMUtilsTest(unittest.TestCase):
def testBasic(self):
t = [1, 2, 3, 1, 2, 3, 4, 5]
f = [
[101, 102, 103, 201, 202, 203, 204, 205],
[[101], [102], [103], [201], [202], [203], [204], [205]]
]
f = [[101, 102, 103, 201, 202, 203, 204, 205],
[[101], [102], [103], [201], [202], [203], [204], [205]]]
s = [[209, 208, 207, 109, 108, 107, 106, 105]]
f_pad, s_init, seq_lens = chop_into_sequences(t, f, s, 4)
self.assertEqual(
[f.tolist() for f in f_pad],
[
[101, 102, 103, 0,
201, 202, 203, 204,
205, 0, 0, 0],
[[101], [102], [103], [0],
[201], [202], [203], [204],
[205], [0], [0], [0]],
])
self.assertEqual([f.tolist() for f in f_pad], [
[101, 102, 103, 0, 201, 202, 203, 204, 205, 0, 0, 0],
[[101], [102], [103], [0], [201], [202], [203], [204], [205], [0],
[0], [0]],
])
self.assertEqual([s.tolist() for s in s_init], [[209, 109, 105]])
self.assertEqual(seq_lens.tolist(), [3, 4, 1])
+87 -42
View File
@@ -129,12 +129,21 @@ class TestMultiAgentEnv(unittest.TestCase):
obs, rew, done, info = env.step({0: 0, 1: 0, 2: 0, 3: 0})
self.assertEqual(obs, {0: 0, 1: 0, 2: 0, 3: 0})
self.assertEqual(rew, {0: 1, 1: 1, 2: 1, 3: 1})
self.assertEqual(
done,
{0: False, 1: False, 2: False, 3: False, "__all__": False})
self.assertEqual(done, {
0: False,
1: False,
2: False,
3: False,
"__all__": False
})
obs, rew, done, info = env.step({0: 0, 1: 0, 2: 0, 3: 0})
self.assertEqual(
done, {0: True, 1: True, 2: True, 3: True, "__all__": True})
self.assertEqual(done, {
0: True,
1: True,
2: True,
3: True,
"__all__": True
})
def testRoundRobinMock(self):
env = RoundRobinMultiAgent(2)
@@ -156,24 +165,51 @@ class TestMultiAgentEnv(unittest.TestCase):
self.assertEqual(obs, {0: {0: 0, 1: 0}, 1: {0: 0, 1: 0}})
self.assertEqual(rew, {0: {0: None, 1: None}, 1: {0: None, 1: None}})
self.assertEqual(
dones,
{0: {0: False, 1: False, "__all__": False},
1: {0: False, 1: False, "__all__": False}})
dones, {
0: {
0: False,
1: False,
"__all__": False
},
1: {
0: False,
1: False,
"__all__": False
}
})
for _ in range(24):
env.send_actions({0: {0: 0, 1: 0}, 1: {0: 0, 1: 0}})
obs, rew, dones, _, _ = env.poll()
self.assertEqual(obs, {0: {0: 0, 1: 0}, 1: {0: 0, 1: 0}})
self.assertEqual(rew, {0: {0: 1, 1: 1}, 1: {0: 1, 1: 1}})
self.assertEqual(
dones,
{0: {0: False, 1: False, "__all__": False},
1: {0: False, 1: False, "__all__": False}})
dones, {
0: {
0: False,
1: False,
"__all__": False
},
1: {
0: False,
1: False,
"__all__": False
}
})
env.send_actions({0: {0: 0, 1: 0}, 1: {0: 0, 1: 0}})
obs, rew, dones, _, _ = env.poll()
self.assertEqual(
dones,
{0: {0: True, 1: True, "__all__": True},
1: {0: True, 1: True, "__all__": True}})
dones, {
0: {
0: True,
1: True,
"__all__": True
},
1: {
0: True,
1: True,
"__all__": True
}
})
# Reset processing
self.assertRaises(
@@ -186,9 +222,18 @@ class TestMultiAgentEnv(unittest.TestCase):
self.assertEqual(obs, {0: {0: 0, 1: 0}, 1: {0: 0, 1: 0}})
self.assertEqual(rew, {0: {0: 1, 1: 1}, 1: {0: 1, 1: 1}})
self.assertEqual(
dones,
{0: {0: False, 1: False, "__all__": False},
1: {0: False, 1: False, "__all__": False}})
dones, {
0: {
0: False,
1: False,
"__all__": False
},
1: {
0: False,
1: False,
"__all__": False
}
})
def testVectorizeRoundRobin(self):
env = _MultiAgentEnvToAsync(lambda: RoundRobinMultiAgent(2), [], 2)
@@ -217,9 +262,8 @@ class TestMultiAgentEnv(unittest.TestCase):
self.assertEqual(batch.count, 50)
self.assertEqual(batch.policy_batches["p0"].count, 150)
self.assertEqual(batch.policy_batches["p1"].count, 100)
self.assertEqual(
batch.policy_batches["p0"]["t"].tolist(),
list(range(25)) * 6)
self.assertEqual(batch.policy_batches["p0"]["t"].tolist(),
list(range(25)) * 6)
def testMultiAgentSampleRoundRobin(self):
act_space = gym.spaces.Discrete(2)
@@ -236,21 +280,16 @@ class TestMultiAgentEnv(unittest.TestCase):
# since we round robin introduce agents into the env, some of the env
# steps don't count as proper transitions
self.assertEqual(batch.policy_batches["p0"].count, 42)
self.assertEqual(
batch.policy_batches["p0"]["obs"].tolist()[:10],
[0, 1, 2, 3, 4] * 2)
self.assertEqual(
batch.policy_batches["p0"]["new_obs"].tolist()[:10],
[1, 2, 3, 4, 5] * 2)
self.assertEqual(
batch.policy_batches["p0"]["rewards"].tolist()[:10],
[100, 100, 100, 100, 0] * 2)
self.assertEqual(
batch.policy_batches["p0"]["dones"].tolist()[:10],
[False, False, False, False, True] * 2)
self.assertEqual(
batch.policy_batches["p0"]["t"].tolist()[:10],
[4, 9, 14, 19, 24, 5, 10, 15, 20, 25])
self.assertEqual(batch.policy_batches["p0"]["obs"].tolist()[:10],
[0, 1, 2, 3, 4] * 2)
self.assertEqual(batch.policy_batches["p0"]["new_obs"].tolist()[:10],
[1, 2, 3, 4, 5] * 2)
self.assertEqual(batch.policy_batches["p0"]["rewards"].tolist()[:10],
[100, 100, 100, 100, 0] * 2)
self.assertEqual(batch.policy_batches["p0"]["dones"].tolist()[:10],
[False, False, False, False, True] * 2)
self.assertEqual(batch.policy_batches["p0"]["t"].tolist()[:10],
[4, 9, 14, 19, 24, 5, 10, 15, 20, 25])
def testTrainMultiCartpoleSinglePolicy(self):
n = 10
@@ -289,11 +328,17 @@ class TestMultiAgentEnv(unittest.TestCase):
policy_mapping_fn=lambda agent_id: ["p1", "p2"][agent_id % 2],
batch_steps=50)
if optimizer_cls == AsyncGradientsOptimizer:
remote_evs = [PolicyEvaluator.as_remote().remote(
env_creator=lambda _: MultiCartpole(n),
policy_graph=policies,
policy_mapping_fn=lambda agent_id: ["p1", "p2"][agent_id % 2],
batch_steps=50)]
def policy_mapper(agent_id):
return ["p1", "p2"][agent_id % 2]
remote_evs = [
PolicyEvaluator.as_remote().remote(
env_creator=lambda _: MultiCartpole(n),
policy_graph=policies,
policy_mapping_fn=policy_mapper,
batch_steps=50)
]
else:
remote_evs = []
optimizer = optimizer_cls(ev, remote_evs, {})
@@ -330,8 +375,8 @@ class TestMultiAgentEnv(unittest.TestCase):
obs_space = env.observation_space
policies = {}
for i in range(20):
policies["pg_{}".format(i)] = (
PGPolicyGraph, obs_space, act_space, {})
policies["pg_{}".format(i)] = (PGPolicyGraph, obs_space, act_space,
{})
policy_ids = list(policies.keys())
ev = PolicyEvaluator(
env_creator=lambda _: MultiCartpole(n),
+2 -2
View File
@@ -21,8 +21,8 @@ class AsyncOptimizerTest(unittest.TestCase):
local = _MockEvaluator()
remotes = ray.remote(_MockEvaluator)
remote_evaluators = [remotes.remote() for i in range(5)]
test_optimizer = AsyncGradientsOptimizer(
local, remote_evaluators, {"grads_per_step": 10})
test_optimizer = AsyncGradientsOptimizer(local, remote_evaluators,
{"grads_per_step": 10})
test_optimizer.step()
self.assertTrue(all(local.get_weights() == 0))
+14 -9
View File
@@ -66,8 +66,7 @@ class MockEnv2(gym.Env):
class MockVectorEnv(VectorEnv):
def __init__(self, episode_length, num_envs):
self.envs = [
MockEnv(episode_length) for _ in range(num_envs)]
self.envs = [MockEnv(episode_length) for _ in range(num_envs)]
self.observation_space = gym.spaces.Discrete(1)
self.action_space = gym.spaces.Discrete(2)
self.num_envs = num_envs
@@ -102,7 +101,10 @@ class TestPolicyEvaluator(unittest.TestCase):
def testQueryEvaluators(self):
register_env("test", lambda _: gym.make("CartPole-v0"))
pg = PGAgent(
env="test", config={"num_workers": 2, "sample_batch_size": 5})
env="test", config={
"num_workers": 2,
"sample_batch_size": 5
})
results = pg.optimizer.foreach_evaluator(lambda ev: ev.batch_steps)
results2 = pg.optimizer.foreach_evaluator_with_index(
lambda ev, i: (i, ev.batch_steps))
@@ -112,10 +114,12 @@ class TestPolicyEvaluator(unittest.TestCase):
def testMetrics(self):
ev = PolicyEvaluator(
env_creator=lambda _: MockEnv(episode_length=10),
policy_graph=MockPolicyGraph, batch_mode="complete_episodes")
policy_graph=MockPolicyGraph,
batch_mode="complete_episodes")
remote_ev = PolicyEvaluator.as_remote().remote(
env_creator=lambda _: MockEnv(episode_length=10),
policy_graph=MockPolicyGraph, batch_mode="complete_episodes")
policy_graph=MockPolicyGraph,
batch_mode="complete_episodes")
ev.sample()
ray.get(remote_ev.sample.remote())
result = collect_metrics(ev, [remote_ev])
@@ -149,7 +153,8 @@ class TestPolicyEvaluator(unittest.TestCase):
env_creator=lambda _: MockEnv(episode_length=20),
policy_graph=MockPolicyGraph,
batch_mode="truncate_episodes",
batch_steps=16, num_envs=8)
batch_steps=16,
num_envs=8)
for _ in range(8):
batch = ev.sample()
self.assertEqual(batch.count, 16)
@@ -175,7 +180,8 @@ class TestPolicyEvaluator(unittest.TestCase):
env_creator=lambda _: MockEnv(episode_length=8),
policy_graph=MockPolicyGraph,
batch_mode="truncate_episodes",
batch_steps=16, num_envs=4)
batch_steps=16,
num_envs=4)
batch = ev.sample()
self.assertEqual(batch.count, 16)
result = collect_metrics(ev, [])
@@ -186,8 +192,7 @@ class TestPolicyEvaluator(unittest.TestCase):
def testVectorEnvSupport(self):
ev = PolicyEvaluator(
env_creator=lambda _: MockVectorEnv(
episode_length=20, num_envs=8),
env_creator=lambda _: MockVectorEnv(episode_length=20, num_envs=8),
policy_graph=MockPolicyGraph,
batch_mode="truncate_episodes",
batch_steps=10)
+6 -8
View File
@@ -83,8 +83,8 @@ class MultiServing(ServingEnv):
def __init__(self, env_creator):
self.env_creator = env_creator
self.env = env_creator()
ServingEnv.__init__(
self, self.env.action_space, self.env.observation_space)
ServingEnv.__init__(self, self.env.action_space,
self.env.observation_space)
def run(self):
envs = [self.env_creator() for _ in range(5)]
@@ -97,8 +97,7 @@ class MultiServing(ServingEnv):
eids[i] = uuid.uuid4().hex
self.start_episode(episode_id=eids[i])
cur_obs[i] = envs[i].reset()
actions = [
self.get_action(eids[i], cur_obs[i]) for i in active]
actions = [self.get_action(eids[i], cur_obs[i]) for i in active]
for i, action in zip(active, actions):
obs, reward, done, _ = envs[i].step(action)
cur_obs[i] = obs
@@ -164,8 +163,7 @@ class TestServingEnv(unittest.TestCase):
raise Exception("failed to improve reward")
def testTrainCartpole(self):
register_env(
"test", lambda _: SimpleServing(gym.make("CartPole-v0")))
register_env("test", lambda _: SimpleServing(gym.make("CartPole-v0")))
pg = PGAgent(env="test", config={"num_workers": 0})
for i in range(100):
result = pg.train()
@@ -176,8 +174,8 @@ class TestServingEnv(unittest.TestCase):
raise Exception("failed to improve reward")
def testTrainCartpoleMulti(self):
register_env(
"test2", lambda _: MultiServing(lambda: gym.make("CartPole-v0")))
register_env("test2",
lambda _: MultiServing(lambda: gym.make("CartPole-v0")))
pg = PGAgent(env="test2", config={"num_workers": 0})
for i in range(100):
result = pg.train()
+36 -31
View File
@@ -14,27 +14,29 @@ from ray.tune.registry import register_env
ACTION_SPACES_TO_TEST = {
"discrete": Discrete(5),
"vector": Box(0.0, 1.0, (5,), dtype=np.float32),
"vector": Box(0.0, 1.0, (5, ), dtype=np.float32),
"simple_tuple": Tuple([
Box(0.0, 1.0, (5,), dtype=np.float32),
Box(0.0, 1.0, (5,), dtype=np.float32)]),
Box(0.0, 1.0, (5, ), dtype=np.float32),
Box(0.0, 1.0, (5, ), dtype=np.float32)
]),
"implicit_tuple": [
Box(0.0, 1.0, (5,), dtype=np.float32),
Box(0.0, 1.0, (5,), dtype=np.float32)],
Box(0.0, 1.0, (5, ), dtype=np.float32),
Box(0.0, 1.0, (5, ), dtype=np.float32)
],
}
OBSERVATION_SPACES_TO_TEST = {
"discrete": Discrete(5),
"vector": Box(0.0, 1.0, (5,), dtype=np.float32),
"vector": Box(0.0, 1.0, (5, ), dtype=np.float32),
"image": Box(0.0, 1.0, (80, 80, 1), dtype=np.float32),
"atari": Box(0.0, 1.0, (210, 160, 3), dtype=np.float32),
"atari_ram": Box(0.0, 1.0, (128,), dtype=np.float32),
"atari_ram": Box(0.0, 1.0, (128, ), dtype=np.float32),
"simple_tuple": Tuple([
Box(0.0, 1.0, (5,), dtype=np.float32),
Box(0.0, 1.0, (5,), dtype=np.float32)]),
"mixed_tuple": Tuple([
Discrete(10),
Box(0.0, 1.0, (5,), dtype=np.float32)]),
Box(0.0, 1.0, (5, ), dtype=np.float32),
Box(0.0, 1.0, (5, ), dtype=np.float32)
]),
"mixed_tuple": Tuple(
[Discrete(10), Box(0.0, 1.0, (5, ), dtype=np.float32)]),
}
@@ -90,30 +92,33 @@ class ModelSupportedSpaces(unittest.TestCase):
stats = {}
check_support("DDPG", {"timesteps_per_iteration": 1}, stats)
check_support("DQN", {"timesteps_per_iteration": 1}, stats)
check_support("A3C", {
"num_workers": 1,
"optimizer": {
"grads_per_step": 1
}
}, stats)
check_support(
"A3C", {"num_workers": 1, "optimizer": {"grads_per_step": 1}},
stats)
"PPO", {
"num_workers": 1,
"num_sgd_iter": 1,
"timesteps_per_batch": 1,
"sgd_batchsize": 1
}, stats)
check_support(
"PPO",
{"num_workers": 1, "num_sgd_iter": 1, "timesteps_per_batch": 1,
"sgd_batchsize": 1},
stats)
check_support(
"ES",
{"num_workers": 1, "noise_size": 10000000,
"episodes_per_batch": 1, "timesteps_per_batch": 1},
stats)
check_support(
"PG",
{"num_workers": 1, "optimizer": {}},
stats)
"ES", {
"num_workers": 1,
"noise_size": 10000000,
"episodes_per_batch": 1,
"timesteps_per_batch": 1
}, stats)
check_support("PG", {"num_workers": 1, "optimizer": {}}, stats)
num_unexpected_errors = 0
for (alg, a_name, o_name), stat in sorted(stats.items()):
if stat not in ["ok", "unsupported"]:
num_unexpected_errors += 1
print(
alg, "action_space", a_name, "obs_space", o_name,
"result", stat)
print(alg, "action_space", a_name, "obs_space", o_name, "result",
stat)
self.assertEqual(num_unexpected_errors, 0)
@@ -123,7 +128,7 @@ if __name__ == "__main__":
"discrete": Discrete(5),
}
OBSERVATION_SPACES_TO_TEST = {
"vector": Box(0.0, 1.0, (5,), dtype=np.float32),
"vector": Box(0.0, 1.0, (5, ), dtype=np.float32),
"atari": Box(0.0, 1.0, (210, 160, 3), dtype=np.float32),
}
unittest.main(verbosity=2)
+24 -11
View File
@@ -11,7 +11,6 @@ import ray
from ray.tune.config_parser import make_parser, resources_to_json
from ray.tune.tune import _make_scheduler, run_experiments
EXAMPLE_USAGE = """
Training example via RLlib CLI:
rllib train --run DQN --env CartPole-v0
@@ -35,29 +34,41 @@ def create_parser(parser_creator=None):
# See also the base parser definition in ray/tune/config_parser.py
parser.add_argument(
"--redis-address", default=None, type=str,
"--redis-address",
default=None,
type=str,
help="The Redis address of the cluster.")
parser.add_argument(
"--ray-num-cpus", default=None, type=int,
"--ray-num-cpus",
default=None,
type=int,
help="--num-cpus to pass to Ray."
" This only has an affect in local mode.")
" This only has an affect in local mode.")
parser.add_argument(
"--ray-num-gpus", default=None, type=int,
"--ray-num-gpus",
default=None,
type=int,
help="--num-gpus to pass to Ray."
" This only has an affect in local mode.")
" This only has an affect in local mode.")
parser.add_argument(
"--experiment-name", default="default", type=str,
"--experiment-name",
default="default",
type=str,
help="Name of the subdirectory under `local_dir` to put results in.")
parser.add_argument(
"--env", default=None, type=str, help="The gym environment to use.")
parser.add_argument(
"--queue-trials", action='store_true',
"--queue-trials",
action='store_true',
help=(
"Whether to queue trials when the cluster does not currently have "
"enough resources to launch one. This should be set to True when "
"running on an autoscaling cluster to enable automatic scale-up."))
parser.add_argument(
"-f", "--config-file", default=None, type=str,
"-f",
"--config-file",
default=None,
type=str,
help="If specified, use config options from this file. Note that this "
"overrides any trial-specific options set via flags above.")
return parser
@@ -93,9 +104,11 @@ def run(args, parser):
ray.init(
redis_address=args.redis_address,
num_cpus=args.ray_num_cpus, num_gpus=args.ray_num_gpus)
num_cpus=args.ray_num_cpus,
num_gpus=args.ray_num_gpus)
run_experiments(
experiments, scheduler=_make_scheduler(args),
experiments,
scheduler=_make_scheduler(args),
queue_trials=args.queue_trials)
@@ -6,10 +6,8 @@ import re
import os
import os.path as osp
CONFIG_DIR = osp.join(osp.dirname(osp.abspath(__file__)), "regression_tests")
TEMPLATE = """
class Test{name}(Regression):
_file = "{filename}"
@@ -15,7 +15,6 @@ import yaml
import ray
from ray import tune
CONFIG_DIR = os.path.dirname(os.path.abspath(__file__))
@@ -8,7 +8,6 @@ import yaml
import ray
from ray.tune import run_experiments
if __name__ == '__main__':
experiments = {}
@@ -29,5 +28,4 @@ if __name__ == '__main__':
num_failures += 1
if num_failures:
raise Exception(
"{} trials did not converge".format(num_failures))
raise Exception("{} trials did not converge".format(num_failures))
+3 -4
View File
@@ -11,10 +11,9 @@ try:
import lz4.frame
LZ4_ENABLED = True
except ImportError:
print(
"WARNING: lz4 not available, disabling sample compression. "
"This will significantly impact RLlib performance. "
"To install lz4, run `pip install lz4`.")
print("WARNING: lz4 not available, disabling sample compression. "
"This will significantly impact RLlib performance. "
"To install lz4, run `pip install lz4`.")
LZ4_ENABLED = False
+6 -7
View File
@@ -59,7 +59,6 @@ class NoFilter(Filter):
# http://www.johndcook.com/blog/standard_deviation/
class RunningStat(object):
def __init__(self, shape=None):
self._n = 0
self._M = np.zeros(shape)
@@ -227,8 +226,8 @@ class MeanStdFilter(Filter):
def __repr__(self):
return 'MeanStdFilter({}, {}, {}, {}, {}, {})'.format(
self.shape, self.demean, self.destd,
self.clip, self.rs, self.buffer)
self.shape, self.demean, self.destd, self.clip, self.rs,
self.buffer)
class ConcurrentMeanStdFilter(MeanStdFilter):
@@ -242,6 +241,7 @@ class ConcurrentMeanStdFilter(MeanStdFilter):
def wrapper(*args, **kwargs):
with self._lock:
return func(*args, **kwargs)
return wrapper
self.__getattribute__ = lock_wrap(self.__getattribute__)
@@ -260,8 +260,8 @@ class ConcurrentMeanStdFilter(MeanStdFilter):
def __repr__(self):
return 'ConcurrentMeanStdFilter({}, {}, {}, {}, {}, {})'.format(
self.shape, self.demean, self.destd,
self.clip, self.rs, self.buffer)
self.shape, self.demean, self.destd, self.clip, self.rs,
self.buffer)
def get_filter(filter_config, shape):
@@ -273,5 +273,4 @@ def get_filter(filter_config, shape):
elif filter_config == "NoFilter":
return NoFilter()
else:
raise Exception("Unknown observation_filter: " +
str(filter_config))
raise Exception("Unknown observation_filter: " + str(filter_config))
+6 -6
View File
@@ -75,14 +75,14 @@ def _make_handler(serving_env):
response["action"] = serving_env.get_action(
args["episode_id"], args["observation"])
elif command == PolicyClient.LOG_ACTION:
serving_env.log_action(
args["episode_id"], args["observation"], args["action"])
serving_env.log_action(args["episode_id"], args["observation"],
args["action"])
elif command == PolicyClient.LOG_RETURNS:
serving_env.log_returns(
args["episode_id"], args["reward"], args["info"])
serving_env.log_returns(args["episode_id"], args["reward"],
args["info"])
elif command == PolicyClient.END_EPISODE:
serving_env.end_episode(
args["episode_id"], args["observation"])
serving_env.end_episode(args["episode_id"],
args["observation"])
else:
raise Exception("Unknown command: {}".format(command))
return response
+8 -7
View File
@@ -7,6 +7,7 @@ class Reshaper(object):
This class keeps track of where in the flattened observation space
we should be slicing and what the new shapes should be
"""
def __init__(self, env_space):
self.shapes = []
self.slice_positions = []
@@ -24,8 +25,8 @@ class Reshaper(object):
if len(self.slice_positions) == 0:
self.slice_positions.append(np.product(arr_shape))
else:
self.slice_positions.append(np.product(arr_shape) +
self.slice_positions[-1])
self.slice_positions.append(
np.product(arr_shape) + self.slice_positions[-1])
else:
self.shapes.append(np.asarray(env_space.shape))
self.slice_positions.append(np.product(env_space.shape))
@@ -38,11 +39,11 @@ class Reshaper(object):
def split_tensor(self, tensor, axis=-1):
# FIXME (ev) This won't work for mixed action distributions like
# one agent Gaussian one agent discrete
slice_rescale = int(tensor.shape.as_list()[axis] /
int(np.sum(self.get_slice_lengths())))
return tf.split(tensor, slice_rescale*self.get_slice_lengths(),
axis=axis)
slice_rescale = int(tensor.shape.as_list()[axis] / int(
np.sum(self.get_slice_lengths())))
return tf.split(
tensor, slice_rescale * self.get_slice_lengths(), axis=axis)
def split_number(self, number):
slice_rescale = int(number / int(np.sum(self.get_slice_lengths())))
return slice_rescale*self.get_slice_lengths()
return slice_rescale * self.get_slice_lengths()
+4 -4
View File
@@ -39,10 +39,10 @@ def linear_interpolation(l, r, alpha):
class PiecewiseSchedule(object):
def __init__(
self, endpoints, interpolation=linear_interpolation,
outside_value=None):
def __init__(self,
endpoints,
interpolation=linear_interpolation,
outside_value=None):
"""Piecewise schedule.
endpoints: [(int, int)]
+7 -6
View File
@@ -64,18 +64,19 @@ def run_timeline(sess, ops, debug_name, feed_dict={}, timeline_dir=None):
run_metadata = tf.RunMetadata()
start = time.time()
fetches = sess.run(
ops, options=run_options, run_metadata=run_metadata,
ops,
options=run_options,
run_metadata=run_metadata,
feed_dict=feed_dict)
trace = timeline.Timeline(step_stats=run_metadata.step_stats)
global _count
outf = os.path.join(
timeline_dir,
"timeline-{}-{}-{}.json".format(debug_name, os.getpid(), _count))
timeline_dir, "timeline-{}-{}-{}.json".format(
debug_name, os.getpid(), _count))
_count += 1
trace_file = open(outf, "w")
print(
"Wrote tf timeline ({} s) to {}".format(
time.time() - start, os.path.abspath(outf)))
print("Wrote tf timeline ({} s) to {}".format(time.time() - start,
os.path.abspath(outf)))
trace_file.write(trace.generate_chrome_trace_format())
else:
fetches = sess.run(ops, feed_dict=feed_dict)
+2 -2
View File
@@ -22,8 +22,8 @@ class WindowStat(object):
if not self.count:
quantiles = []
else:
quantiles = np.percentile(
self.items[:self.count], [0, 10, 50, 90, 100]).tolist()
quantiles = np.percentile(self.items[:self.count],
[0, 10, 50, 90, 100]).tolist()
return {
self.name + "_count": int(self.count),
self.name + "_mean": float(np.mean(self.items[:self.count])),