mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 20:06:31 +08:00
[rllib] Native support for Dict and Tuple spaces; fix Tuple action spaces; add prev a, r to LSTM (#3051)
This commit is contained in:
@@ -49,8 +49,13 @@ class A3CPolicyGraph(LearningRateSchedule, TFPolicyGraph):
|
||||
tf.float32, [None] + list(observation_space.shape))
|
||||
dist_class, logit_dim = ModelCatalog.get_action_dist(
|
||||
action_space, self.config["model"])
|
||||
self.model = ModelCatalog.get_model(self.observations, logit_dim,
|
||||
self.config["model"])
|
||||
prev_actions = ModelCatalog.get_action_placeholder(action_space)
|
||||
prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward")
|
||||
self.model = ModelCatalog.get_model({
|
||||
"obs": self.observations,
|
||||
"prev_actions": prev_actions,
|
||||
"prev_rewards": prev_rewards
|
||||
}, observation_space, logit_dim, self.config["model"])
|
||||
action_dist = dist_class(self.model.outputs)
|
||||
self.vf = tf.reshape(
|
||||
linear(self.model.last_layer, 1, "value", normc_initializer(1.0)),
|
||||
@@ -78,6 +83,8 @@ class A3CPolicyGraph(LearningRateSchedule, TFPolicyGraph):
|
||||
loss_in = [
|
||||
("obs", self.observations),
|
||||
("actions", actions),
|
||||
("prev_actions", prev_actions),
|
||||
("prev_rewards", prev_rewards),
|
||||
("advantages", advantages),
|
||||
("value_targets", self.v_target),
|
||||
]
|
||||
@@ -94,6 +101,8 @@ class A3CPolicyGraph(LearningRateSchedule, TFPolicyGraph):
|
||||
loss_inputs=loss_in,
|
||||
state_inputs=self.model.state_in,
|
||||
state_outputs=self.model.state_out,
|
||||
prev_action_input=prev_actions,
|
||||
prev_reward_input=prev_rewards,
|
||||
seq_lens=self.model.seq_lens,
|
||||
max_seq_len=self.config["model"]["max_seq_len"])
|
||||
|
||||
|
||||
@@ -51,7 +51,8 @@ COMMON_CONFIG = {
|
||||
"env_config": {},
|
||||
# Environment name can also be passed via config
|
||||
"env": None,
|
||||
# Arguments to pass to model
|
||||
# Arguments to pass to model. See models/catalog.py for a full list of the
|
||||
# available model options.
|
||||
"model": MODEL_DEFAULTS,
|
||||
# Arguments to pass to the policy optimizer. These vary by optimizer.
|
||||
"optimizer": {},
|
||||
@@ -196,6 +197,8 @@ class Agent(Trainable):
|
||||
|
||||
# Agents allow env ids to be passed directly to the constructor.
|
||||
self._env_id = env or config.get("env")
|
||||
if not self._env_id:
|
||||
raise ValueError("Must specify env (str) when creating agent")
|
||||
|
||||
# Create a default logger creator if no logger_creator is specified
|
||||
if logger_creator is None:
|
||||
|
||||
@@ -77,8 +77,8 @@ class Worker(object):
|
||||
|
||||
self.sess = utils.make_session(single_threaded=True)
|
||||
self.policy = policies.GenericPolicy(
|
||||
self.sess, self.env.action_space, self.preprocessor,
|
||||
config["observation_filter"], config["model"])
|
||||
self.sess, self.env.action_space, self.env.observation_space,
|
||||
self.preprocessor, config["observation_filter"], config["model"])
|
||||
|
||||
def rollout(self, timestep_limit, add_noise=False):
|
||||
rollout_rewards, rollout_length = policies.rollout(
|
||||
@@ -153,7 +153,7 @@ class ARSAgent(Agent):
|
||||
|
||||
self.sess = utils.make_session(single_threaded=False)
|
||||
self.policy = policies.GenericPolicy(
|
||||
self.sess, env.action_space, preprocessor,
|
||||
self.sess, env.action_space, env.observation_space, preprocessor,
|
||||
self.config["observation_filter"], self.config["model"])
|
||||
self.optimizer = optimizers.SGD(self.policy,
|
||||
self.config["sgd_stepsize"])
|
||||
|
||||
@@ -10,6 +10,7 @@ import numpy as np
|
||||
import tensorflow as tf
|
||||
|
||||
import ray
|
||||
from ray.rllib.evaluation.sampler import _unbatch_tuple_actions
|
||||
from ray.rllib.utils.filter import get_filter
|
||||
from ray.rllib.models import ModelCatalog
|
||||
|
||||
@@ -56,6 +57,7 @@ class GenericPolicy(object):
|
||||
def __init__(self,
|
||||
sess,
|
||||
action_space,
|
||||
obs_space,
|
||||
preprocessor,
|
||||
observation_filter,
|
||||
model_config,
|
||||
@@ -73,7 +75,9 @@ class GenericPolicy(object):
|
||||
dist_class, dist_dim = ModelCatalog.get_action_dist(
|
||||
action_space, model_config, dist_type="deterministic")
|
||||
|
||||
model = ModelCatalog.get_model(self.inputs, dist_dim, model_config)
|
||||
model = ModelCatalog.get_model({
|
||||
"obs": self.inputs
|
||||
}, obs_space, dist_dim, model_config)
|
||||
dist = dist_class(model.outputs)
|
||||
self.sampler = dist.sample()
|
||||
|
||||
@@ -90,6 +94,7 @@ class GenericPolicy(object):
|
||||
observation = self.observation_filter(observation[None], update=update)
|
||||
action = self.sess.run(
|
||||
self.sampler, feed_dict={self.inputs: observation})
|
||||
action = _unbatch_tuple_actions(action)
|
||||
if add_noise and isinstance(self.action_space, gym.spaces.Box):
|
||||
action += np.random.randn(*action.shape) * self.action_noise_std
|
||||
return action
|
||||
|
||||
@@ -149,7 +149,8 @@ class DDPGPolicyGraph(TFPolicyGraph):
|
||||
|
||||
# Actor: P (policy) network
|
||||
with tf.variable_scope(P_SCOPE) as scope:
|
||||
p_values = self._build_p_network(self.cur_observations)
|
||||
p_values = self._build_p_network(self.cur_observations,
|
||||
observation_space)
|
||||
self.p_func_vars = _scope_vars(scope.name)
|
||||
|
||||
# Action outputs
|
||||
@@ -178,11 +179,11 @@ class DDPGPolicyGraph(TFPolicyGraph):
|
||||
|
||||
# p network evaluation
|
||||
with tf.variable_scope(P_SCOPE, reuse=True) as scope:
|
||||
self.p_t = self._build_p_network(self.obs_t)
|
||||
self.p_t = self._build_p_network(self.obs_t, observation_space)
|
||||
|
||||
# target p network evaluation
|
||||
with tf.variable_scope(P_TARGET_SCOPE) as scope:
|
||||
p_tp1 = self._build_p_network(self.obs_tp1)
|
||||
p_tp1 = self._build_p_network(self.obs_tp1, observation_space)
|
||||
target_p_func_vars = _scope_vars(scope.name)
|
||||
|
||||
# Action outputs
|
||||
@@ -197,14 +198,16 @@ class DDPGPolicyGraph(TFPolicyGraph):
|
||||
|
||||
# q network evaluation
|
||||
with tf.variable_scope(Q_SCOPE) as scope:
|
||||
q_t = self._build_q_network(self.obs_t, self.act_t)
|
||||
q_t = self._build_q_network(self.obs_t, observation_space,
|
||||
self.act_t)
|
||||
self.q_func_vars = _scope_vars(scope.name)
|
||||
with tf.variable_scope(Q_SCOPE, reuse=True):
|
||||
q_tp0 = self._build_q_network(self.obs_t, output_actions)
|
||||
q_tp0 = self._build_q_network(self.obs_t, observation_space,
|
||||
output_actions)
|
||||
|
||||
# target q network evalution
|
||||
with tf.variable_scope(Q_TARGET_SCOPE) as scope:
|
||||
q_tp1 = self._build_q_network(self.obs_tp1,
|
||||
q_tp1 = self._build_q_network(self.obs_tp1, observation_space,
|
||||
output_actions_estimated)
|
||||
target_q_func_vars = _scope_vars(scope.name)
|
||||
|
||||
@@ -267,16 +270,20 @@ class DDPGPolicyGraph(TFPolicyGraph):
|
||||
# Hard initial update
|
||||
self.update_target(tau=1.0)
|
||||
|
||||
def _build_q_network(self, obs, actions):
|
||||
def _build_q_network(self, obs, obs_space, actions):
|
||||
return QNetwork(
|
||||
ModelCatalog.get_model(obs, 1, self.config["model"]), actions,
|
||||
ModelCatalog.get_model({
|
||||
"obs": obs
|
||||
}, obs_space, 1, self.config["model"]), actions,
|
||||
self.config["critic_hiddens"],
|
||||
self.config["critic_hidden_activation"]).value
|
||||
|
||||
def _build_p_network(self, obs):
|
||||
def _build_p_network(self, obs, obs_space):
|
||||
return PNetwork(
|
||||
ModelCatalog.get_model(obs, 1, self.config["model"]),
|
||||
self.dim_actions, self.config["actor_hiddens"],
|
||||
ModelCatalog.get_model({
|
||||
"obs": obs
|
||||
}, obs_space, 1, self.config["model"]), self.dim_actions,
|
||||
self.config["actor_hiddens"],
|
||||
self.config["actor_hidden_activation"]).action_scores
|
||||
|
||||
def _build_action_network(self, p_values, stochastic, eps):
|
||||
|
||||
@@ -275,7 +275,7 @@ class DQNPolicyGraph(TFPolicyGraph):
|
||||
# Action Q network
|
||||
with tf.variable_scope(Q_SCOPE) as scope:
|
||||
q_values, q_logits, q_dist = self._build_q_network(
|
||||
self.cur_observations)
|
||||
self.cur_observations, observation_space)
|
||||
self.q_func_vars = _scope_vars(scope.name)
|
||||
|
||||
# Action outputs
|
||||
@@ -294,12 +294,13 @@ class DQNPolicyGraph(TFPolicyGraph):
|
||||
|
||||
# q network evaluation
|
||||
with tf.variable_scope(Q_SCOPE, reuse=True):
|
||||
q_t, q_logits_t, q_dist_t = self._build_q_network(self.obs_t)
|
||||
q_t, q_logits_t, q_dist_t = self._build_q_network(
|
||||
self.obs_t, observation_space)
|
||||
|
||||
# target q network evalution
|
||||
with tf.variable_scope(Q_TARGET_SCOPE) as scope:
|
||||
q_tp1, q_logits_tp1, q_dist_tp1 = self._build_q_network(
|
||||
self.obs_tp1)
|
||||
self.obs_tp1, observation_space)
|
||||
self.target_q_func_vars = _scope_vars(scope.name)
|
||||
|
||||
# q scores for actions which we know were selected in the given state.
|
||||
@@ -313,7 +314,7 @@ class DQNPolicyGraph(TFPolicyGraph):
|
||||
with tf.variable_scope(Q_SCOPE, reuse=True):
|
||||
q_tp1_using_online_net, q_logits_tp1_using_online_net, \
|
||||
q_dist_tp1_using_online_net = self._build_q_network(
|
||||
self.obs_tp1)
|
||||
self.obs_tp1, observation_space)
|
||||
q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
|
||||
q_tp1_best_one_hot_selection = tf.one_hot(
|
||||
q_tp1_best_using_online_net, self.num_actions)
|
||||
@@ -362,10 +363,12 @@ class DQNPolicyGraph(TFPolicyGraph):
|
||||
loss_inputs=self.loss_inputs)
|
||||
self.sess.run(tf.global_variables_initializer())
|
||||
|
||||
def _build_q_network(self, obs):
|
||||
def _build_q_network(self, obs, space):
|
||||
qnet = QNetwork(
|
||||
ModelCatalog.get_model(obs, 1, self.config["model"]),
|
||||
self.num_actions, self.config["dueling"], self.config["hiddens"],
|
||||
ModelCatalog.get_model({
|
||||
"obs": obs
|
||||
}, space, 1, self.config["model"]), self.num_actions,
|
||||
self.config["dueling"], self.config["hiddens"],
|
||||
self.config["noisy"], self.config["num_atoms"],
|
||||
self.config["v_min"], self.config["v_max"], self.config["sigma0"])
|
||||
return qnet.value, qnet.logits, qnet.dist
|
||||
|
||||
@@ -81,8 +81,9 @@ class Worker(object):
|
||||
|
||||
self.sess = utils.make_session(single_threaded=True)
|
||||
self.policy = policies.GenericPolicy(
|
||||
self.sess, self.env.action_space, self.preprocessor,
|
||||
config["observation_filter"], config["model"], **policy_params)
|
||||
self.sess, self.env.action_space, self.env.observation_space,
|
||||
self.preprocessor, config["observation_filter"], config["model"],
|
||||
**policy_params)
|
||||
|
||||
def rollout(self, timestep_limit, add_noise=True):
|
||||
rollout_rewards, rollout_length = policies.rollout(
|
||||
@@ -161,7 +162,7 @@ class ESAgent(Agent):
|
||||
|
||||
self.sess = utils.make_session(single_threaded=False)
|
||||
self.policy = policies.GenericPolicy(
|
||||
self.sess, env.action_space, preprocessor,
|
||||
self.sess, env.action_space, env.observation_space, preprocessor,
|
||||
self.config["observation_filter"], self.config["model"],
|
||||
**policy_params)
|
||||
self.optimizer = optimizers.Adam(self.policy, self.config["stepsize"])
|
||||
|
||||
@@ -10,6 +10,7 @@ import numpy as np
|
||||
import tensorflow as tf
|
||||
|
||||
import ray
|
||||
from ray.rllib.evaluation.sampler import _unbatch_tuple_actions
|
||||
from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.utils.filter import get_filter
|
||||
|
||||
@@ -38,8 +39,8 @@ def rollout(policy, env, timestep_limit=None, add_noise=False):
|
||||
|
||||
|
||||
class GenericPolicy(object):
|
||||
def __init__(self, sess, action_space, preprocessor, observation_filter,
|
||||
model_options, action_noise_std):
|
||||
def __init__(self, sess, action_space, obs_space, preprocessor,
|
||||
observation_filter, model_options, action_noise_std):
|
||||
self.sess = sess
|
||||
self.action_space = action_space
|
||||
self.action_noise_std = action_noise_std
|
||||
@@ -52,7 +53,9 @@ class GenericPolicy(object):
|
||||
# Policy network.
|
||||
dist_class, dist_dim = ModelCatalog.get_action_dist(
|
||||
self.action_space, model_options, dist_type="deterministic")
|
||||
model = ModelCatalog.get_model(self.inputs, dist_dim, model_options)
|
||||
model = ModelCatalog.get_model({
|
||||
"obs": self.inputs
|
||||
}, obs_space, dist_dim, model_options)
|
||||
dist = dist_class(model.outputs)
|
||||
self.sampler = dist.sample()
|
||||
|
||||
@@ -69,6 +72,7 @@ class GenericPolicy(object):
|
||||
observation = self.observation_filter(observation[None], update=update)
|
||||
action = self.sess.run(
|
||||
self.sampler, feed_dict={self.inputs: observation})
|
||||
action = _unbatch_tuple_actions(action)
|
||||
if add_noise and isinstance(self.action_space, gym.spaces.Box):
|
||||
action += np.random.randn(*action.shape) * self.action_noise_std
|
||||
return action
|
||||
|
||||
@@ -102,9 +102,9 @@ class VTracePolicyGraph(LearningRateSchedule, TFPolicyGraph):
|
||||
|
||||
# Create input placeholders
|
||||
if existing_inputs:
|
||||
actions, dones, behaviour_logits, rewards, observations = \
|
||||
existing_inputs[:5]
|
||||
existing_state_in = existing_inputs[5:-1]
|
||||
actions, dones, behaviour_logits, rewards, observations, \
|
||||
prev_actions, prev_rewards = existing_inputs[:7]
|
||||
existing_state_in = existing_inputs[7:-1]
|
||||
existing_seq_lens = existing_inputs[-1]
|
||||
else:
|
||||
if isinstance(action_space, gym.spaces.Discrete):
|
||||
@@ -126,8 +126,15 @@ class VTracePolicyGraph(LearningRateSchedule, TFPolicyGraph):
|
||||
# Setup the policy
|
||||
dist_class, logit_dim = ModelCatalog.get_action_dist(
|
||||
action_space, self.config["model"])
|
||||
prev_actions = ModelCatalog.get_action_placeholder(action_space)
|
||||
prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward")
|
||||
self.model = ModelCatalog.get_model(
|
||||
observations,
|
||||
{
|
||||
"obs": observations,
|
||||
"prev_actions": prev_actions,
|
||||
"prev_rewards": prev_rewards,
|
||||
},
|
||||
observation_space,
|
||||
logit_dim,
|
||||
self.config["model"],
|
||||
state_in=existing_state_in,
|
||||
@@ -187,6 +194,8 @@ class VTracePolicyGraph(LearningRateSchedule, TFPolicyGraph):
|
||||
("behaviour_logits", behaviour_logits),
|
||||
("rewards", rewards),
|
||||
("obs", observations),
|
||||
("prev_actions", prev_actions),
|
||||
("prev_rewards", prev_rewards),
|
||||
]
|
||||
LearningRateSchedule.__init__(self, self.config["lr"],
|
||||
self.config["lr_schedule"])
|
||||
@@ -201,6 +210,8 @@ class VTracePolicyGraph(LearningRateSchedule, TFPolicyGraph):
|
||||
loss_inputs=loss_in,
|
||||
state_inputs=self.model.state_in,
|
||||
state_outputs=self.model.state_out,
|
||||
prev_action_input=prev_actions,
|
||||
prev_reward_input=prev_rewards,
|
||||
seq_lens=self.model.seq_lens,
|
||||
max_seq_len=self.config["model"]["max_seq_len"])
|
||||
|
||||
|
||||
@@ -24,8 +24,13 @@ class PGPolicyGraph(TFPolicyGraph):
|
||||
obs = tf.placeholder(tf.float32, shape=[None] + list(obs_space.shape))
|
||||
dist_class, self.logit_dim = ModelCatalog.get_action_dist(
|
||||
action_space, self.config["model"])
|
||||
self.model = ModelCatalog.get_model(obs, self.logit_dim,
|
||||
self.config["model"])
|
||||
prev_actions = ModelCatalog.get_action_placeholder(action_space)
|
||||
prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward")
|
||||
self.model = ModelCatalog.get_model({
|
||||
"obs": obs,
|
||||
"prev_actions": prev_actions,
|
||||
"prev_rewards": prev_rewards
|
||||
}, obs_space, self.logit_dim, self.config["model"])
|
||||
action_dist = dist_class(self.model.outputs) # logit for each action
|
||||
|
||||
# Setup policy loss
|
||||
@@ -35,9 +40,12 @@ class PGPolicyGraph(TFPolicyGraph):
|
||||
|
||||
# Initialize TFPolicyGraph
|
||||
sess = tf.get_default_session()
|
||||
# Mapping from sample batch keys to placeholders
|
||||
loss_in = [
|
||||
("obs", obs),
|
||||
("actions", actions),
|
||||
("prev_actions", prev_actions),
|
||||
("prev_rewards", prev_rewards),
|
||||
("advantages", advantages),
|
||||
]
|
||||
|
||||
@@ -52,6 +60,8 @@ class PGPolicyGraph(TFPolicyGraph):
|
||||
loss_inputs=loss_in,
|
||||
state_inputs=self.model.state_in,
|
||||
state_outputs=self.model.state_out,
|
||||
prev_action_input=prev_actions,
|
||||
prev_reward_input=prev_rewards,
|
||||
seq_lens=self.model.seq_lens,
|
||||
max_seq_len=config["model"]["max_seq_len"])
|
||||
sess.run(tf.global_variables_initializer())
|
||||
|
||||
@@ -88,7 +88,7 @@ class PPOAgent(Agent):
|
||||
self.optimizer = SyncSamplesOptimizer(
|
||||
self.local_evaluator, self.remote_evaluators, {
|
||||
"num_sgd_iter": self.config["num_sgd_iter"],
|
||||
"train_batch_size": self.config["train_batch_size"]
|
||||
"train_batch_size": self.config["train_batch_size"],
|
||||
})
|
||||
else:
|
||||
self.optimizer = LocalMultiGPUOptimizer(
|
||||
|
||||
@@ -120,8 +120,9 @@ class PPOPolicyGraph(LearningRateSchedule, TFPolicyGraph):
|
||||
|
||||
if existing_inputs:
|
||||
obs_ph, value_targets_ph, adv_ph, act_ph, \
|
||||
logits_ph, vf_preds_ph = existing_inputs[:6]
|
||||
existing_state_in = existing_inputs[6:-1]
|
||||
logits_ph, vf_preds_ph, prev_actions_ph, prev_rewards_ph = \
|
||||
existing_inputs[:8]
|
||||
existing_state_in = existing_inputs[8:-1]
|
||||
existing_seq_lens = existing_inputs[-1]
|
||||
else:
|
||||
obs_ph = tf.placeholder(
|
||||
@@ -137,6 +138,9 @@ class PPOPolicyGraph(LearningRateSchedule, TFPolicyGraph):
|
||||
tf.float32, name="vf_preds", shape=(None, ))
|
||||
value_targets_ph = tf.placeholder(
|
||||
tf.float32, name="value_targets", shape=(None, ))
|
||||
prev_actions_ph = ModelCatalog.get_action_placeholder(action_space)
|
||||
prev_rewards_ph = tf.placeholder(
|
||||
tf.float32, [None], name="prev_reward")
|
||||
existing_state_in = None
|
||||
existing_seq_lens = None
|
||||
self.observations = obs_ph
|
||||
@@ -148,9 +152,16 @@ class PPOPolicyGraph(LearningRateSchedule, TFPolicyGraph):
|
||||
("actions", act_ph),
|
||||
("logits", logits_ph),
|
||||
("vf_preds", vf_preds_ph),
|
||||
("prev_actions", prev_actions_ph),
|
||||
("prev_rewards", prev_rewards_ph),
|
||||
]
|
||||
self.model = ModelCatalog.get_model(
|
||||
obs_ph,
|
||||
{
|
||||
"obs": obs_ph,
|
||||
"prev_actions": prev_actions_ph,
|
||||
"prev_rewards": prev_rewards_ph
|
||||
},
|
||||
observation_space,
|
||||
logit_dim,
|
||||
self.config["model"],
|
||||
state_in=existing_state_in,
|
||||
@@ -180,8 +191,11 @@ class PPOPolicyGraph(LearningRateSchedule, TFPolicyGraph):
|
||||
vf_config["free_log_std"] = False
|
||||
vf_config["use_lstm"] = False
|
||||
with tf.variable_scope("value_function"):
|
||||
self.value_function = ModelCatalog.get_model(
|
||||
obs_ph, 1, vf_config).outputs
|
||||
self.value_function = ModelCatalog.get_model({
|
||||
"obs": obs_ph,
|
||||
"prev_actions": prev_actions_ph,
|
||||
"prev_rewards": prev_rewards_ph
|
||||
}, observation_space, 1, vf_config).outputs
|
||||
self.value_function = tf.reshape(self.value_function, [-1])
|
||||
else:
|
||||
self.value_function = tf.zeros(shape=tf.shape(obs_ph)[:1])
|
||||
@@ -223,6 +237,8 @@ class PPOPolicyGraph(LearningRateSchedule, TFPolicyGraph):
|
||||
loss_inputs=self.loss_in,
|
||||
state_inputs=self.model.state_in,
|
||||
state_outputs=self.model.state_out,
|
||||
prev_action_input=prev_actions_ph,
|
||||
prev_reward_input=prev_rewards_ph,
|
||||
seq_lens=self.model.seq_lens,
|
||||
max_seq_len=config["model"]["max_seq_len"])
|
||||
|
||||
|
||||
+19
-2
@@ -22,6 +22,12 @@ class AsyncVectorEnv(object):
|
||||
rllib.MultiAgentEnv => rllib.AsyncVectorEnv
|
||||
rllib.ServingEnv => rllib.AsyncVectorEnv
|
||||
|
||||
Attributes:
|
||||
action_space (gym.Space): Action space. This must be defined for
|
||||
single-agent envs. Multi-agent envs can set this to None.
|
||||
observation_space (gym.Space): Observation space. This must be defined
|
||||
for single-agent envs. Multi-agent envs can set this to None.
|
||||
|
||||
Examples:
|
||||
>>> env = MyAsyncVectorEnv()
|
||||
>>> obs, rewards, dones, infos, off_policy_actions = env.poll()
|
||||
@@ -142,8 +148,14 @@ def _with_dummy_agent_id(env_id_to_values, dummy_id=_DUMMY_AGENT_ID):
|
||||
class _ServingEnvToAsync(AsyncVectorEnv):
|
||||
"""Internal adapter of ServingEnv to AsyncVectorEnv."""
|
||||
|
||||
def __init__(self, serving_env):
|
||||
def __init__(self, serving_env, preprocessor=None):
|
||||
self.serving_env = serving_env
|
||||
self.prep = preprocessor
|
||||
self.action_space = serving_env.action_space
|
||||
if preprocessor:
|
||||
self.observation_space = preprocessor.observation_space
|
||||
else:
|
||||
self.observation_space = serving_env.observation_space
|
||||
serving_env.start()
|
||||
|
||||
def poll(self):
|
||||
@@ -168,7 +180,10 @@ class _ServingEnvToAsync(AsyncVectorEnv):
|
||||
if episode.cur_done:
|
||||
del self.serving_env._episodes[eid]
|
||||
if data:
|
||||
all_obs[eid] = data["obs"]
|
||||
if self.prep:
|
||||
all_obs[eid] = self.prep.transform(data["obs"])
|
||||
else:
|
||||
all_obs[eid] = data["obs"]
|
||||
all_rewards[eid] = data["reward"]
|
||||
all_dones[eid] = data["done"]
|
||||
all_infos[eid] = data["info"]
|
||||
@@ -196,6 +211,8 @@ class _VectorEnvToAsync(AsyncVectorEnv):
|
||||
|
||||
def __init__(self, vector_env):
|
||||
self.vector_env = vector_env
|
||||
self.action_space = vector_env.action_space
|
||||
self.observation_space = vector_env.observation_space
|
||||
self.num_envs = vector_env.num_envs
|
||||
self.new_obs = self.vector_env.vector_reset()
|
||||
self.cur_rewards = [None for _ in range(self.num_envs)]
|
||||
|
||||
Vendored
+10
-4
@@ -25,6 +25,10 @@ class ServingEnv(threading.Thread):
|
||||
|
||||
This env is thread-safe, but individual episodes must be executed serially.
|
||||
|
||||
Attributes:
|
||||
action_space (gym.Space): Action space.
|
||||
observation_space (gym.Space): Observation space.
|
||||
|
||||
Examples:
|
||||
>>> register_env("my_env", lambda config: YourServingEnv(config))
|
||||
>>> agent = DQNAgent(env="my_env")
|
||||
@@ -57,10 +61,12 @@ class ServingEnv(threading.Thread):
|
||||
"""Override this to implement the run loop.
|
||||
|
||||
Your loop should continuously:
|
||||
1. Call self.start_episode()
|
||||
2. Call self.get_action() or self.log_action()
|
||||
3. Call self.log_returns()
|
||||
4. Call self.end_episode()
|
||||
1. Call self.start_episode(episode_id)
|
||||
2. Call self.get_action(episode_id, obs)
|
||||
-or-
|
||||
self.log_action(episode_id, obs, action)
|
||||
3. Call self.log_returns(episode_id, reward)
|
||||
4. Call self.end_episode(episode_id, obs)
|
||||
5. Wait if nothing to do.
|
||||
|
||||
Multiple episodes may be started at the same time.
|
||||
|
||||
Vendored
+2
@@ -69,6 +69,8 @@ class _VectorizedGymEnv(VectorEnv):
|
||||
self.num_envs = num_envs
|
||||
while len(self.envs) < self.num_envs:
|
||||
self.envs.append(self.make_env(len(self.envs)))
|
||||
self.action_space = self.envs[0].action_space
|
||||
self.observation_space = self.envs[0].observation_space
|
||||
|
||||
def vector_reset(self):
|
||||
return [e.reset() for e in self.envs]
|
||||
|
||||
@@ -54,6 +54,8 @@ class MultiAgentEpisode(object):
|
||||
self._agent_to_last_obs = {}
|
||||
self._agent_to_last_action = {}
|
||||
self._agent_to_last_pi_info = {}
|
||||
self._agent_to_prev_action = {}
|
||||
self._agent_reward_history = defaultdict(list)
|
||||
|
||||
def policy_for(self, agent_id):
|
||||
"""Returns the policy graph for the specified agent.
|
||||
@@ -72,19 +74,33 @@ class MultiAgentEpisode(object):
|
||||
return self._agent_to_last_obs.get(agent_id)
|
||||
|
||||
def last_action_for(self, agent_id):
|
||||
"""Returns the last action for the specified agent."""
|
||||
"""Returns the last action for the specified agent, or zeros."""
|
||||
|
||||
action = self._agent_to_last_action[agent_id]
|
||||
# Concatenate tuple actions
|
||||
if isinstance(action, list):
|
||||
expanded = []
|
||||
for a in action:
|
||||
if len(a.shape) == 1:
|
||||
expanded.append(np.expand_dims(a, 1))
|
||||
else:
|
||||
expanded.append(a)
|
||||
action = np.concatenate(expanded, axis=1).flatten()
|
||||
return action
|
||||
if agent_id in self._agent_to_last_action:
|
||||
return _flatten_action(self._agent_to_last_action[agent_id])
|
||||
else:
|
||||
policy = self._policies[self.policy_for(agent_id)]
|
||||
flat = _flatten_action(policy.action_space.sample())
|
||||
return np.zeros_like(flat)
|
||||
|
||||
def prev_action_for(self, agent_id):
|
||||
"""Returns the previous action for the specified agent."""
|
||||
|
||||
if agent_id in self._agent_to_prev_action:
|
||||
return _flatten_action(self._agent_to_prev_action[agent_id])
|
||||
else:
|
||||
# We're at t=0, so return all zeros.
|
||||
return np.zeros_like(self.last_action_for(agent_id))
|
||||
|
||||
def prev_reward_for(self, agent_id):
|
||||
"""Returns the previous reward for the specified agent."""
|
||||
|
||||
history = self._agent_reward_history[agent_id]
|
||||
if len(history) >= 2:
|
||||
return history[-2]
|
||||
else:
|
||||
# We're at t=0, so there is no previous reward, just return zero.
|
||||
return 0.0
|
||||
|
||||
def rnn_state_for(self, agent_id):
|
||||
"""Returns the last RNN state for the specified agent."""
|
||||
@@ -105,6 +121,7 @@ class MultiAgentEpisode(object):
|
||||
self.agent_rewards[agent_id,
|
||||
self.policy_for(agent_id)] += reward
|
||||
self.total_reward += reward
|
||||
self._agent_reward_history[agent_id].append(reward)
|
||||
|
||||
def _set_rnn_state(self, agent_id, rnn_state):
|
||||
self._agent_to_rnn_state[agent_id] = rnn_state
|
||||
@@ -117,3 +134,16 @@ class MultiAgentEpisode(object):
|
||||
|
||||
def _set_last_pi_info(self, agent_id, pi_info):
|
||||
self._agent_to_last_pi_info[agent_id] = pi_info
|
||||
|
||||
|
||||
def _flatten_action(action):
|
||||
# Concatenate tuple actions
|
||||
if isinstance(action, list) or isinstance(action, tuple):
|
||||
expanded = []
|
||||
for a in action:
|
||||
if not hasattr(a, "shape") or len(a.shape) == 0:
|
||||
expanded.append(np.expand_dims(a, 1))
|
||||
else:
|
||||
expanded.append(a)
|
||||
action = np.concatenate(expanded, axis=0).flatten()
|
||||
return action
|
||||
|
||||
@@ -11,8 +11,6 @@ from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.env.async_vector_env import AsyncVectorEnv
|
||||
from ray.rllib.env.atari_wrappers import wrap_deepmind, is_atari
|
||||
from ray.rllib.env.env_context import EnvContext
|
||||
from ray.rllib.env.serving_env import ServingEnv
|
||||
from ray.rllib.env.vector_env import VectorEnv
|
||||
from ray.rllib.env.multi_agent_env import MultiAgentEnv
|
||||
from ray.rllib.evaluation.interface import EvaluatorInterface
|
||||
from ray.rllib.evaluation.sample_batch import MultiAgentBatch, \
|
||||
@@ -179,11 +177,15 @@ class PolicyEvaluator(EvaluatorInterface):
|
||||
self.compress_observations = compress_observations
|
||||
|
||||
self.env = env_creator(env_context)
|
||||
if isinstance(self.env, VectorEnv) or \
|
||||
isinstance(self.env, ServingEnv) or \
|
||||
isinstance(self.env, MultiAgentEnv) or \
|
||||
if isinstance(self.env, MultiAgentEnv) or \
|
||||
isinstance(self.env, AsyncVectorEnv):
|
||||
|
||||
if model_config.get("custom_preprocessor"):
|
||||
raise ValueError(
|
||||
"Custom preprocessors are not supported for env types "
|
||||
"MultiAgentEnv and AsyncVectorEnv. Please preprocess "
|
||||
"observations in your env directly.")
|
||||
|
||||
def wrap(env):
|
||||
return env # we can't auto-wrap these env types
|
||||
elif is_atari(self.env) and \
|
||||
@@ -294,6 +296,18 @@ class PolicyEvaluator(EvaluatorInterface):
|
||||
merged_conf = policy_config.copy()
|
||||
merged_conf.update(conf)
|
||||
with tf.variable_scope(name):
|
||||
if isinstance(obs_space, gym.spaces.Dict):
|
||||
raise ValueError(
|
||||
"Found raw Dict space as input to policy graph. "
|
||||
"Please preprocess your environment observations "
|
||||
"with DictFlatteningPreprocessor and set the "
|
||||
"obs space to `preprocessor.observation_space`.")
|
||||
elif isinstance(obs_space, gym.spaces.Tuple):
|
||||
raise ValueError(
|
||||
"Found raw Tuple space as input to policy graph. "
|
||||
"Please preprocess your environment observations "
|
||||
"with TupleFlatteningPreprocessor and set the "
|
||||
"obs space to `preprocessor.observation_space`.")
|
||||
policy_map[name] = cls(obs_space, act_space, merged_conf)
|
||||
return policy_map
|
||||
|
||||
|
||||
@@ -40,6 +40,8 @@ class PolicyGraph(object):
|
||||
def compute_actions(self,
|
||||
obs_batch,
|
||||
state_batches,
|
||||
prev_action_batch=None,
|
||||
prev_reward_batch=None,
|
||||
is_training=False,
|
||||
episodes=None):
|
||||
"""Compute actions for the current policy.
|
||||
@@ -47,6 +49,8 @@ class PolicyGraph(object):
|
||||
Arguments:
|
||||
obs_batch (np.ndarray): batch of observations
|
||||
state_batches (list): list of RNN state input batches, if any
|
||||
prev_action_batch (np.ndarray): batch of previous action values
|
||||
prev_reward_batch (np.ndarray): batch of previous rewards
|
||||
is_training (bool): whether we are training the policy
|
||||
episodes (list): MultiAgentEpisode for each obs in obs_batch.
|
||||
This provides access to all of the internal episode state,
|
||||
@@ -65,6 +69,8 @@ class PolicyGraph(object):
|
||||
def compute_single_action(self,
|
||||
obs,
|
||||
state,
|
||||
prev_action_batch=None,
|
||||
prev_reward_batch=None,
|
||||
is_training=False,
|
||||
episode=None):
|
||||
"""Unbatched version of compute_actions.
|
||||
@@ -72,6 +78,8 @@ class PolicyGraph(object):
|
||||
Arguments:
|
||||
obs (obj): single observation
|
||||
state_batches (list): list of RNN state inputs, if any
|
||||
prev_action_batch (np.ndarray): batch of previous action values
|
||||
prev_reward_batch (np.ndarray): batch of previous rewards
|
||||
is_training (bool): whether we are training the policy
|
||||
episode (MultiAgentEpisode): this provides access to all of the
|
||||
internal episode state, which may be useful for model-based or
|
||||
|
||||
@@ -3,22 +3,25 @@ from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from collections import defaultdict, namedtuple
|
||||
import numpy as np
|
||||
import six.moves.queue as queue
|
||||
import threading
|
||||
|
||||
from ray.rllib.evaluation.episode import MultiAgentEpisode
|
||||
from ray.rllib.evaluation.episode import MultiAgentEpisode, _flatten_action
|
||||
from ray.rllib.evaluation.sample_batch import MultiAgentSampleBatchBuilder, \
|
||||
MultiAgentBatch
|
||||
from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph
|
||||
from ray.rllib.env.async_vector_env import AsyncVectorEnv
|
||||
from ray.rllib.env.atari_wrappers import get_wrapper_by_cls, MonitorEnv
|
||||
from ray.rllib.models.action_dist import TupleActions
|
||||
from ray.rllib.utils.tf_run_builder import TFRunBuilder
|
||||
|
||||
RolloutMetrics = namedtuple(
|
||||
"RolloutMetrics", ["episode_length", "episode_reward", "agent_rewards"])
|
||||
|
||||
PolicyEvalData = namedtuple("PolicyEvalData",
|
||||
["env_id", "agent_id", "obs", "rnn_state"])
|
||||
PolicyEvalData = namedtuple(
|
||||
"PolicyEvalData",
|
||||
["env_id", "agent_id", "obs", "rnn_state", "prev_action", "prev_reward"])
|
||||
|
||||
|
||||
class SyncSampler(object):
|
||||
@@ -281,7 +284,9 @@ def _env_runner(async_vector_env,
|
||||
if not agent_done:
|
||||
to_eval[policy_id].append(
|
||||
PolicyEvalData(env_id, agent_id, filtered_obs,
|
||||
episode.rnn_state_for(agent_id)))
|
||||
episode.rnn_state_for(agent_id),
|
||||
episode.last_action_for(agent_id),
|
||||
rewards[env_id][agent_id] or 0.0))
|
||||
|
||||
last_observation = episode.last_observation_for(agent_id)
|
||||
episode._set_last_observation(agent_id, filtered_obs)
|
||||
@@ -297,6 +302,8 @@ def _env_runner(async_vector_env,
|
||||
obs=last_observation,
|
||||
actions=episode.last_action_for(agent_id),
|
||||
rewards=rewards[env_id][agent_id],
|
||||
prev_actions=episode.prev_action_for(agent_id),
|
||||
prev_rewards=episode.prev_reward_for(agent_id),
|
||||
dones=agent_done,
|
||||
infos=infos[env_id][agent_id],
|
||||
new_obs=filtered_obs,
|
||||
@@ -326,12 +333,17 @@ def _env_runner(async_vector_env,
|
||||
episode = active_episodes[env_id]
|
||||
for agent_id, raw_obs in resetted_obs.items():
|
||||
policy_id = episode.policy_for(agent_id)
|
||||
policy = _get_or_raise(policies, policy_id)
|
||||
filtered_obs = _get_or_raise(obs_filters,
|
||||
policy_id)(raw_obs)
|
||||
episode._set_last_observation(agent_id, filtered_obs)
|
||||
to_eval[policy_id].append(
|
||||
PolicyEvalData(env_id, agent_id, filtered_obs,
|
||||
episode.rnn_state_for(agent_id)))
|
||||
PolicyEvalData(
|
||||
env_id, agent_id, filtered_obs,
|
||||
episode.rnn_state_for(agent_id),
|
||||
np.zeros_like(
|
||||
_flatten_action(
|
||||
policy.action_space.sample())), 0.0))
|
||||
|
||||
# Batch eval policy actions if possible
|
||||
if tf_sess:
|
||||
@@ -350,11 +362,15 @@ def _env_runner(async_vector_env,
|
||||
pending_fetches[policy_id] = policy.build_compute_actions(
|
||||
builder, [t.obs for t in eval_data],
|
||||
rnn_in,
|
||||
prev_action_batch=[t.prev_action for t in eval_data],
|
||||
prev_reward_batch=[t.prev_reward for t in eval_data],
|
||||
is_training=True)
|
||||
else:
|
||||
eval_results[policy_id] = policy.compute_actions(
|
||||
[t.obs for t in eval_data],
|
||||
rnn_in,
|
||||
prev_action_batch=[t.prev_action for t in eval_data],
|
||||
prev_reward_batch=[t.prev_reward for t in eval_data],
|
||||
is_training=True,
|
||||
episodes=[active_episodes[t.env_id] for t in eval_data])
|
||||
if builder:
|
||||
@@ -374,6 +390,7 @@ def _env_runner(async_vector_env,
|
||||
for f_i, column in enumerate(rnn_out_cols):
|
||||
pi_info_cols["state_out_{}".format(f_i)] = column
|
||||
# Save output rows
|
||||
actions = _unbatch_tuple_actions(actions)
|
||||
for i, action in enumerate(actions):
|
||||
env_id = eval_data[i].env_id
|
||||
agent_id = eval_data[i].agent_id
|
||||
@@ -413,6 +430,19 @@ def _fetch_atari_metrics(async_vector_env):
|
||||
return atari_out
|
||||
|
||||
|
||||
def _unbatch_tuple_actions(action_batch):
|
||||
# convert list of batches -> batch of lists
|
||||
if isinstance(action_batch, TupleActions):
|
||||
out = []
|
||||
for j in range(len(action_batch.batches[0])):
|
||||
out.append([
|
||||
action_batch.batches[i][j]
|
||||
for i in range(len(action_batch.batches))
|
||||
])
|
||||
return out
|
||||
return action_batch
|
||||
|
||||
|
||||
def _to_column_format(rnn_state_rows):
|
||||
num_cols = len(rnn_state_rows[0])
|
||||
return [[row[i] for row in rnn_state_rows] for i in range(num_cols)]
|
||||
|
||||
@@ -46,6 +46,8 @@ class TFPolicyGraph(PolicyGraph):
|
||||
loss_inputs,
|
||||
state_inputs=None,
|
||||
state_outputs=None,
|
||||
prev_action_input=None,
|
||||
prev_reward_input=None,
|
||||
seq_lens=None,
|
||||
max_seq_len=20):
|
||||
"""Initialize the policy graph.
|
||||
@@ -65,6 +67,8 @@ class TFPolicyGraph(PolicyGraph):
|
||||
and has shape [BATCH_SIZE, data...].
|
||||
state_inputs (list): list of RNN state input Tensors.
|
||||
state_outputs (list): list of RNN state output Tensors.
|
||||
prev_action_input (Tensor): placeholder for previous actions
|
||||
prev_reward_input (Tensor): placeholder for previous rewards
|
||||
seq_lens (Tensor): placeholder for RNN sequence lengths, of shape
|
||||
[NUM_SEQUENCES]. Note that NUM_SEQUENCES << BATCH_SIZE. See
|
||||
models/lstm.py for more information.
|
||||
@@ -75,6 +79,8 @@ class TFPolicyGraph(PolicyGraph):
|
||||
self.action_space = action_space
|
||||
self._sess = sess
|
||||
self._obs_input = obs_input
|
||||
self._prev_action_input = prev_action_input
|
||||
self._prev_reward_input = prev_reward_input
|
||||
self._sampler = action_sampler
|
||||
self._loss = loss
|
||||
self._loss_inputs = loss_inputs
|
||||
@@ -112,6 +118,8 @@ class TFPolicyGraph(PolicyGraph):
|
||||
builder,
|
||||
obs_batch,
|
||||
state_batches=None,
|
||||
prev_action_batch=None,
|
||||
prev_reward_batch=None,
|
||||
is_training=False,
|
||||
episodes=None):
|
||||
state_batches = state_batches or []
|
||||
@@ -121,6 +129,10 @@ class TFPolicyGraph(PolicyGraph):
|
||||
builder.add_feed_dict({self._obs_input: obs_batch})
|
||||
if state_batches:
|
||||
builder.add_feed_dict({self._seq_lens: np.ones(len(obs_batch))})
|
||||
if self._prev_action_input is not None and prev_action_batch:
|
||||
builder.add_feed_dict({self._prev_action_input: prev_action_batch})
|
||||
if self._prev_reward_input is not None and prev_reward_batch:
|
||||
builder.add_feed_dict({self._prev_reward_input: prev_reward_batch})
|
||||
builder.add_feed_dict({self._is_training: is_training})
|
||||
builder.add_feed_dict(dict(zip(self._state_inputs, state_batches)))
|
||||
fetches = builder.add_fetches([self._sampler] + self._state_outputs +
|
||||
@@ -130,11 +142,14 @@ class TFPolicyGraph(PolicyGraph):
|
||||
def compute_actions(self,
|
||||
obs_batch,
|
||||
state_batches=None,
|
||||
prev_action_batch=None,
|
||||
prev_reward_batch=None,
|
||||
is_training=False,
|
||||
episodes=None):
|
||||
builder = TFRunBuilder(self._sess, "compute_actions")
|
||||
fetches = self.build_compute_actions(builder, obs_batch, state_batches,
|
||||
is_training)
|
||||
prev_action_batch,
|
||||
prev_reward_batch, is_training)
|
||||
return builder.get(fetches)
|
||||
|
||||
def _get_loss_inputs_dict(self, batch):
|
||||
|
||||
@@ -70,6 +70,8 @@ class TorchPolicyGraph(PolicyGraph):
|
||||
def compute_actions(self,
|
||||
obs_batch,
|
||||
state_batches=None,
|
||||
prev_action_batch=None,
|
||||
prev_reward_batch=None,
|
||||
is_training=False,
|
||||
episodes=None):
|
||||
if state_batches:
|
||||
|
||||
@@ -20,6 +20,7 @@ class CarlaModel(Model):
|
||||
further fully connected layers.
|
||||
"""
|
||||
|
||||
# TODO(ekl): use build_layers_v2 for native dict space support
|
||||
def _build_layers(self, inputs, num_outputs, options):
|
||||
# Parse options
|
||||
image_shape = options["custom_options"]["image_shape"]
|
||||
|
||||
@@ -14,6 +14,7 @@ import numpy as np
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--stop", type=int, default=200)
|
||||
parser.add_argument("--use-prev-action-reward", action="store_true")
|
||||
parser.add_argument("--run", type=str, default="PPO")
|
||||
|
||||
|
||||
@@ -183,10 +184,13 @@ if __name__ == "__main__":
|
||||
"stop": {
|
||||
"episode_reward_mean": args.stop
|
||||
},
|
||||
"config": dict(configs[args.run], **{
|
||||
"model": {
|
||||
"use_lstm": True,
|
||||
},
|
||||
}),
|
||||
"config": dict(
|
||||
configs[args.run], **{
|
||||
"model": {
|
||||
"use_lstm": True,
|
||||
"lstm_use_prev_action_reward": args.
|
||||
use_prev_action_reward,
|
||||
},
|
||||
}),
|
||||
}
|
||||
})
|
||||
|
||||
@@ -2,9 +2,11 @@ from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from collections import namedtuple
|
||||
import distutils.version
|
||||
|
||||
import tensorflow as tf
|
||||
import numpy as np
|
||||
import distutils.version
|
||||
|
||||
use_tf150_api = (distutils.version.LooseVersion(tf.VERSION) >=
|
||||
distutils.version.LooseVersion("1.5.0"))
|
||||
@@ -225,4 +227,8 @@ class MultiActionDistribution(ActionDistribution):
|
||||
|
||||
def sample(self):
|
||||
"""Draw a sample from the action distribution."""
|
||||
return [[s.sample() for s in self.child_distributions]]
|
||||
|
||||
return TupleActions([s.sample() for s in self.child_distributions])
|
||||
|
||||
|
||||
TupleActions = namedtuple("TupleActions", ["batches"])
|
||||
|
||||
@@ -10,6 +10,9 @@ from functools import partial
|
||||
from ray.tune.registry import RLLIB_MODEL, RLLIB_PREPROCESSOR, \
|
||||
_global_registry
|
||||
|
||||
from ray.rllib.env.async_vector_env import _ServingEnvToAsync
|
||||
from ray.rllib.env.serving_env import ServingEnv
|
||||
from ray.rllib.env.vector_env import VectorEnv
|
||||
from ray.rllib.models.action_dist import (
|
||||
Categorical, Deterministic, DiagGaussian, MultiActionDistribution,
|
||||
squash_to_range)
|
||||
@@ -41,6 +44,8 @@ MODEL_DEFAULTS = {
|
||||
"max_seq_len": 20,
|
||||
# Size of the LSTM cell
|
||||
"lstm_cell_size": 256,
|
||||
# Whether to feed a_{t-1}, r_{t-1} to LSTM
|
||||
"lstm_use_prev_action_reward": False,
|
||||
|
||||
# == Atari ==
|
||||
# Whether to enable framestack for Atari envs
|
||||
@@ -133,10 +138,6 @@ class ModelCatalog(object):
|
||||
action_placeholder (Tensor): A placeholder for the actions
|
||||
"""
|
||||
|
||||
# TODO(ekl) are list spaces valid?
|
||||
if isinstance(action_space, list):
|
||||
action_space = gym.spaces.Tuple(action_space)
|
||||
|
||||
if isinstance(action_space, gym.spaces.Box):
|
||||
return tf.placeholder(
|
||||
tf.float32, shape=(None, action_space.shape[0]), name="action")
|
||||
@@ -160,11 +161,18 @@ class ModelCatalog(object):
|
||||
" not supported".format(action_space))
|
||||
|
||||
@staticmethod
|
||||
def get_model(inputs, num_outputs, options, state_in=None, seq_lens=None):
|
||||
def get_model(input_dict,
|
||||
obs_space,
|
||||
num_outputs,
|
||||
options,
|
||||
state_in=None,
|
||||
seq_lens=None):
|
||||
"""Returns a suitable model conforming to given input and output specs.
|
||||
|
||||
Args:
|
||||
inputs (Tensor): The input tensor to the model.
|
||||
input_dict (dict): Dict of input tensors to the model, including
|
||||
the observation under the "obs" key.
|
||||
obs_space (Space): Observation space of the target gym env.
|
||||
num_outputs (int): The size of the output vector of the model.
|
||||
options (dict): Optional args to pass to the model constructor.
|
||||
state_in (list): Optional RNN state in tensors.
|
||||
@@ -174,34 +182,40 @@ class ModelCatalog(object):
|
||||
model (Model): Neural network model.
|
||||
"""
|
||||
|
||||
assert isinstance(input_dict, dict)
|
||||
options = options or MODEL_DEFAULTS
|
||||
model = ModelCatalog._get_model(inputs, num_outputs, options, state_in,
|
||||
seq_lens)
|
||||
model = ModelCatalog._get_model(input_dict, obs_space, num_outputs,
|
||||
options, state_in, seq_lens)
|
||||
|
||||
if options.get("use_lstm"):
|
||||
model = LSTM(model.last_layer, num_outputs, options, state_in,
|
||||
copy = dict(input_dict)
|
||||
copy["obs"] = model.last_layer
|
||||
model = LSTM(copy, obs_space, num_outputs, options, state_in,
|
||||
seq_lens)
|
||||
|
||||
return model
|
||||
|
||||
@staticmethod
|
||||
def _get_model(inputs, num_outputs, options, state_in, seq_lens):
|
||||
def _get_model(input_dict, obs_space, num_outputs, options, state_in,
|
||||
seq_lens):
|
||||
if options.get("custom_model"):
|
||||
model = options["custom_model"]
|
||||
print("Using custom model {}".format(model))
|
||||
return _global_registry.get(RLLIB_MODEL, model)(
|
||||
inputs,
|
||||
input_dict,
|
||||
obs_space,
|
||||
num_outputs,
|
||||
options,
|
||||
state_in=state_in,
|
||||
seq_lens=seq_lens)
|
||||
|
||||
obs_rank = len(inputs.shape) - 1
|
||||
obs_rank = len(input_dict["obs"].shape) - 1
|
||||
|
||||
if obs_rank > 1:
|
||||
return VisionNetwork(inputs, num_outputs, options)
|
||||
return VisionNetwork(input_dict, obs_space, num_outputs, options)
|
||||
|
||||
return FullyConnectedNetwork(inputs, num_outputs, options)
|
||||
return FullyConnectedNetwork(input_dict, obs_space, num_outputs,
|
||||
options)
|
||||
|
||||
@staticmethod
|
||||
def get_torch_model(input_shape, num_outputs, options=None):
|
||||
@@ -243,7 +257,7 @@ class ModelCatalog(object):
|
||||
"""Returns a suitable processor for the given environment.
|
||||
|
||||
Args:
|
||||
env (gym.Env): The gym environment to preprocess.
|
||||
env (gym.Env|VectorEnv|ServingEnv): The environment to wrap.
|
||||
options (dict): Options to pass to the preprocessor.
|
||||
|
||||
Returns:
|
||||
@@ -269,16 +283,23 @@ class ModelCatalog(object):
|
||||
"""Returns a preprocessor as a gym observation wrapper.
|
||||
|
||||
Args:
|
||||
env (gym.Env): The gym environment to wrap.
|
||||
env (gym.Env|VectorEnv|ServingEnv): The environment to wrap.
|
||||
options (dict): Options to pass to the preprocessor.
|
||||
|
||||
Returns:
|
||||
wrapper (gym.ObservationWrapper): Preprocessor in wrapper form.
|
||||
env (RLlib env): Wrapped environment
|
||||
"""
|
||||
|
||||
options = options or MODEL_DEFAULTS
|
||||
preprocessor = ModelCatalog.get_preprocessor(env, options)
|
||||
return _RLlibPreprocessorWrapper(env, preprocessor)
|
||||
if isinstance(env, gym.Env):
|
||||
return _RLlibPreprocessorWrapper(env, preprocessor)
|
||||
elif isinstance(env, VectorEnv):
|
||||
return _RLlibVectorPreprocessorWrapper(env, preprocessor)
|
||||
elif isinstance(env, ServingEnv):
|
||||
return _ServingEnvToAsync(env, preprocessor)
|
||||
else:
|
||||
raise ValueError("Don't know how to wrap {}".format(env))
|
||||
|
||||
@staticmethod
|
||||
def register_custom_preprocessor(preprocessor_name, preprocessor_class):
|
||||
@@ -314,10 +335,32 @@ class _RLlibPreprocessorWrapper(gym.ObservationWrapper):
|
||||
def __init__(self, env, preprocessor):
|
||||
super(_RLlibPreprocessorWrapper, self).__init__(env)
|
||||
self.preprocessor = preprocessor
|
||||
|
||||
from gym.spaces.box import Box
|
||||
self.observation_space = Box(
|
||||
-1.0, 1.0, preprocessor.shape, dtype=np.float32)
|
||||
self.observation_space = preprocessor.observation_space
|
||||
|
||||
def observation(self, observation):
|
||||
return self.preprocessor.transform(observation)
|
||||
|
||||
|
||||
class _RLlibVectorPreprocessorWrapper(VectorEnv):
|
||||
"""Preprocessing wrapper for vector envs."""
|
||||
|
||||
def __init__(self, env, preprocessor):
|
||||
self.env = env
|
||||
self.prep = preprocessor
|
||||
self.action_space = env.action_space
|
||||
self.observation_space = preprocessor.observation_space
|
||||
self.num_envs = env.num_envs
|
||||
|
||||
def vector_reset(self):
|
||||
return [self.prep.transform(obs) for obs in self.env.vector_reset()]
|
||||
|
||||
def reset_at(self, index):
|
||||
return self.prep.transform(self.env.reset_at(index))
|
||||
|
||||
def vector_step(self, actions):
|
||||
obs, rewards, dones, infos = self.env.vector_step(actions)
|
||||
obs = [self.prep.transform(o) for o in obs]
|
||||
return obs, rewards, dones, infos
|
||||
|
||||
def get_unwrapped(self):
|
||||
return self.env.get_unwrapped()
|
||||
|
||||
@@ -13,6 +13,12 @@ class FullyConnectedNetwork(Model):
|
||||
"""Generic fully connected network."""
|
||||
|
||||
def _build_layers(self, inputs, num_outputs, options):
|
||||
"""Process the flattened inputs.
|
||||
|
||||
Note that dict inputs will be flattened into a vector. To define a
|
||||
model that processes the components separately, use _build_layers_v2().
|
||||
"""
|
||||
|
||||
hiddens = options.get("fcnet_hiddens")
|
||||
activation = get_activation_fn(options.get("fcnet_activation"))
|
||||
|
||||
|
||||
@@ -9,6 +9,10 @@ the LSTM cell, we reshape the input to add the expected time dimension. During
|
||||
postprocessing, we dynamically pad the experience batches so that this
|
||||
reshaping is possible.
|
||||
|
||||
Note that this padding strategy only works out if we assume zero inputs don't
|
||||
meaningfully affect the loss function. This happens to be true for all the
|
||||
current algorithms: https://github.com/ray-project/ray/issues/2992
|
||||
|
||||
See the add_time_dimension() and chop_into_sequences() functions below for
|
||||
more info.
|
||||
"""
|
||||
@@ -134,9 +138,24 @@ class LSTM(Model):
|
||||
self.seq_lens. See add_time_dimension() for more information.
|
||||
"""
|
||||
|
||||
def _build_layers(self, inputs, num_outputs, options):
|
||||
def _build_layers_v2(self, input_dict, num_outputs, options):
|
||||
cell_size = options.get("lstm_cell_size")
|
||||
last_layer = add_time_dimension(inputs, self.seq_lens)
|
||||
if options.get("lstm_use_prev_action_reward"):
|
||||
action_dim = int(
|
||||
np.product(
|
||||
input_dict["prev_actions"].get_shape().as_list()[1:]))
|
||||
features = tf.concat(
|
||||
[
|
||||
input_dict["obs"],
|
||||
tf.reshape(
|
||||
tf.cast(input_dict["prev_actions"], tf.float32),
|
||||
[-1, action_dim]),
|
||||
tf.reshape(input_dict["prev_rewards"], [-1, 1]),
|
||||
],
|
||||
axis=1)
|
||||
else:
|
||||
features = input_dict["obs"]
|
||||
last_layer = add_time_dimension(features, self.seq_lens)
|
||||
|
||||
# Setup the LSTM cell
|
||||
lstm = rnn.BasicLSTMCell(cell_size, state_is_tuple=True)
|
||||
|
||||
@@ -2,8 +2,13 @@ from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
from collections import OrderedDict
|
||||
|
||||
import gym
|
||||
import tensorflow as tf
|
||||
|
||||
from ray.rllib.models.preprocessors import get_preprocessor
|
||||
|
||||
|
||||
class Model(object):
|
||||
"""Defines an abstract network model for use with RLlib.
|
||||
@@ -16,12 +21,12 @@ class Model(object):
|
||||
needs to further post-processing (e.g. Actor and Critic networks in A3C).
|
||||
|
||||
Attributes:
|
||||
inputs (Tensor): The input placeholder for this model, of shape
|
||||
[BATCH_SIZE, ...].
|
||||
input_dict (dict): Dictionary of input tensors, including "obs",
|
||||
"prev_action", "prev_reward".
|
||||
outputs (Tensor): The output vector of this model, of shape
|
||||
[BATCH_SIZE, num_outputs].
|
||||
last_layer (Tensor): The network layer right before the model output,
|
||||
of shape [BATCH_SIZE, N].
|
||||
last_layer (Tensor): The feature layer right before the model output,
|
||||
of shape [BATCH_SIZE, f].
|
||||
state_init (list): List of initial recurrent state tensors (if any).
|
||||
state_in (list): List of input recurrent state tensors (if any).
|
||||
state_out (list): List of output recurrent state tensors (if any).
|
||||
@@ -38,12 +43,13 @@ class Model(object):
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
inputs,
|
||||
input_dict,
|
||||
obs_space,
|
||||
num_outputs,
|
||||
options,
|
||||
state_in=None,
|
||||
seq_lens=None):
|
||||
self.inputs = inputs
|
||||
assert isinstance(input_dict, dict), input_dict
|
||||
|
||||
# Default attribute values for the non-RNN case
|
||||
self.state_init = []
|
||||
@@ -58,8 +64,26 @@ class Model(object):
|
||||
if options.get("free_log_std"):
|
||||
assert num_outputs % 2 == 0
|
||||
num_outputs = num_outputs // 2
|
||||
self.outputs, self.last_layer = self._build_layers(
|
||||
inputs, num_outputs, options)
|
||||
try:
|
||||
self.outputs, self.last_layer = self._build_layers_v2(
|
||||
_restore_original_dimensions(input_dict, obs_space),
|
||||
num_outputs, options)
|
||||
except NotImplementedError:
|
||||
self.outputs, self.last_layer = self._build_layers(
|
||||
input_dict["obs"], num_outputs, options)
|
||||
|
||||
# Validate the output shape
|
||||
try:
|
||||
out = tf.convert_to_tensor(self.outputs)
|
||||
shape = out.shape.as_list()
|
||||
except Exception:
|
||||
raise ValueError("Output is not a tensor: {}".format(self.outputs))
|
||||
else:
|
||||
if len(shape) != 2 or shape[1] != num_outputs:
|
||||
raise ValueError(
|
||||
"Expected output shape of [None, {}], got {}".format(
|
||||
num_outputs, shape))
|
||||
|
||||
if options.get("free_log_std", False):
|
||||
log_std = tf.get_variable(
|
||||
name="log_std",
|
||||
@@ -68,6 +92,80 @@ class Model(object):
|
||||
self.outputs = tf.concat(
|
||||
[self.outputs, 0.0 * self.outputs + log_std], 1)
|
||||
|
||||
def _build_layers(self):
|
||||
"""Builds and returns the output and last layer of the network."""
|
||||
def _build_layers(self, inputs, num_outputs, options):
|
||||
"""Builds and returns the output and last layer of the network.
|
||||
|
||||
Deprecated: use _build_layers_v2 instead, which has better support
|
||||
for dict and tuple spaces.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def _build_layers_v2(self, input_dict, num_outputs, options):
|
||||
"""Define the layers of a custom model.
|
||||
|
||||
Arguments:
|
||||
input_dict (dict): Dictionary of input tensors, including "obs",
|
||||
"prev_action", "prev_reward".
|
||||
num_outputs (int): Output tensor must be of size
|
||||
[BATCH_SIZE, num_outputs].
|
||||
options (dict): Model options.
|
||||
|
||||
Returns:
|
||||
(outputs, feature_layer): Tensors of size [BATCH_SIZE, num_outputs]
|
||||
and [BATCH_SIZE, desired_feature_size].
|
||||
|
||||
When using dict or tuple observation spaces, you can access
|
||||
the nested sub-observation batches here as well:
|
||||
|
||||
Examples:
|
||||
>>> print(input_dict)
|
||||
{'prev_actions': <tf.Tensor shape=(?,) dtype=int64>,
|
||||
'prev_rewards': <tf.Tensor shape=(?,) dtype=float32>,
|
||||
'obs': OrderedDict([
|
||||
('sensors', OrderedDict([
|
||||
('front_cam', [
|
||||
<tf.Tensor shape=(?, 10, 10, 3) dtype=float32>,
|
||||
<tf.Tensor shape=(?, 10, 10, 3) dtype=float32>]),
|
||||
('position', <tf.Tensor shape=(?, 3) dtype=float32>),
|
||||
('velocity', <tf.Tensor shape=(?, 3) dtype=float32>)]))])}
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
def _restore_original_dimensions(input_dict, obs_space):
|
||||
if hasattr(obs_space, "original_space"):
|
||||
return dict(
|
||||
input_dict,
|
||||
obs=_unpack_obs(input_dict["obs"], obs_space.original_space))
|
||||
return input_dict
|
||||
|
||||
|
||||
def _unpack_obs(obs, space):
|
||||
if (isinstance(space, gym.spaces.Dict)
|
||||
or isinstance(space, gym.spaces.Tuple)):
|
||||
prep = get_preprocessor(space)(space)
|
||||
if len(obs.shape) != 2 or obs.shape[1] != prep.shape[0]:
|
||||
raise ValueError(
|
||||
"Expected flattened obs shape of [None, {}], got {}".format(
|
||||
prep.shape[0], obs.shape))
|
||||
assert len(prep.preprocessors) == len(space.spaces), \
|
||||
(len(prep.preprocessors) == len(space.spaces))
|
||||
offset = 0
|
||||
if isinstance(space, gym.spaces.Tuple):
|
||||
u = []
|
||||
for p, v in zip(prep.preprocessors, space.spaces):
|
||||
obs_slice = obs[:, offset:offset + p.size]
|
||||
offset += p.size
|
||||
u.append(
|
||||
_unpack_obs(
|
||||
tf.reshape(obs_slice, [-1] + list(p.shape)), v))
|
||||
else:
|
||||
u = OrderedDict()
|
||||
for p, (k, v) in zip(prep.preprocessors, space.spaces.items()):
|
||||
obs_slice = obs[:, offset:offset + p.size]
|
||||
offset += p.size
|
||||
u[k] = _unpack_obs(
|
||||
tf.reshape(obs_slice, [-1] + list(p.shape)), v)
|
||||
return u
|
||||
else:
|
||||
return obs
|
||||
|
||||
@@ -16,19 +16,34 @@ class Preprocessor(object):
|
||||
shape (obj): Shape of the preprocessed output.
|
||||
"""
|
||||
|
||||
def __init__(self, obs_space, options):
|
||||
def __init__(self, obs_space, options=None):
|
||||
legacy_patch_shapes(obs_space)
|
||||
self._obs_space = obs_space
|
||||
self._options = options
|
||||
self._init()
|
||||
self._options = options or {}
|
||||
self.shape = self._init_shape(obs_space, options)
|
||||
|
||||
def _init(self):
|
||||
pass
|
||||
def _init_shape(self, obs_space, options):
|
||||
"""Returns the shape after preprocessing."""
|
||||
raise NotImplementedError
|
||||
|
||||
def transform(self, observation):
|
||||
"""Returns the preprocessed observation."""
|
||||
raise NotImplementedError
|
||||
|
||||
@property
|
||||
def size(self):
|
||||
return int(np.product(self.shape))
|
||||
|
||||
@property
|
||||
def observation_space(self):
|
||||
obs_space = gym.spaces.Box(-1.0, 1.0, self.shape, dtype=np.float32)
|
||||
# Stash the unwrapped space so that we can unwrap dict and tuple spaces
|
||||
# automatically in model.py
|
||||
if (isinstance(self, TupleFlatteningPreprocessor)
|
||||
or isinstance(self, DictFlatteningPreprocessor)):
|
||||
obs_space.original_space = self._obs_space
|
||||
return obs_space
|
||||
|
||||
|
||||
class GenericPixelPreprocessor(Preprocessor):
|
||||
"""Generic image preprocessor.
|
||||
@@ -37,19 +52,20 @@ class GenericPixelPreprocessor(Preprocessor):
|
||||
instead for deepmind-style Atari preprocessing.
|
||||
"""
|
||||
|
||||
def _init(self):
|
||||
self._grayscale = self._options.get("grayscale")
|
||||
self._zero_mean = self._options.get("zero_mean")
|
||||
self._dim = self._options.get("dim")
|
||||
self._channel_major = self._options.get("channel_major")
|
||||
def _init_shape(self, obs_space, options):
|
||||
self._grayscale = options.get("grayscale")
|
||||
self._zero_mean = options.get("zero_mean")
|
||||
self._dim = options.get("dim")
|
||||
self._channel_major = options.get("channel_major")
|
||||
if self._grayscale:
|
||||
self.shape = (self._dim, self._dim, 1)
|
||||
shape = (self._dim, self._dim, 1)
|
||||
else:
|
||||
self.shape = (self._dim, self._dim, 3)
|
||||
shape = (self._dim, self._dim, 3)
|
||||
|
||||
# channel_major requires (# in-channels, row dim, col dim)
|
||||
if self._channel_major:
|
||||
self.shape = self.shape[-1:] + self.shape[:-1]
|
||||
shape = shape[-1:] + shape[:-1]
|
||||
return shape
|
||||
|
||||
def transform(self, observation):
|
||||
"""Downsamples images from (210, 160, 3) by the configured factor."""
|
||||
@@ -75,16 +91,16 @@ class GenericPixelPreprocessor(Preprocessor):
|
||||
|
||||
|
||||
class AtariRamPreprocessor(Preprocessor):
|
||||
def _init(self):
|
||||
self.shape = (128, )
|
||||
def _init_shape(self, obs_space, options):
|
||||
return (128, )
|
||||
|
||||
def transform(self, observation):
|
||||
return (observation - 128) / 128
|
||||
|
||||
|
||||
class OneHotPreprocessor(Preprocessor):
|
||||
def _init(self):
|
||||
self.shape = (self._obs_space.n, )
|
||||
def _init_shape(self, obs_space, options):
|
||||
return (self._obs_space.n, )
|
||||
|
||||
def transform(self, observation):
|
||||
arr = np.zeros(self._obs_space.n)
|
||||
@@ -93,8 +109,8 @@ class OneHotPreprocessor(Preprocessor):
|
||||
|
||||
|
||||
class NoPreprocessor(Preprocessor):
|
||||
def _init(self):
|
||||
self.shape = self._obs_space.shape
|
||||
def _init_shape(self, obs_space, options):
|
||||
return self._obs_space.shape
|
||||
|
||||
def transform(self, observation):
|
||||
return observation
|
||||
@@ -103,11 +119,10 @@ class NoPreprocessor(Preprocessor):
|
||||
class TupleFlatteningPreprocessor(Preprocessor):
|
||||
"""Preprocesses each tuple element, then flattens it all into a vector.
|
||||
|
||||
If desired, the vector output can be unpacked via tf.reshape() within a
|
||||
custom model to handle each component separately.
|
||||
RLlib models will unpack the flattened output before _build_layers_v2().
|
||||
"""
|
||||
|
||||
def _init(self):
|
||||
def _init_shape(self, obs_space, options):
|
||||
assert isinstance(self._obs_space, gym.spaces.Tuple)
|
||||
size = 0
|
||||
self.preprocessors = []
|
||||
@@ -116,17 +131,43 @@ class TupleFlatteningPreprocessor(Preprocessor):
|
||||
print("Creating sub-preprocessor for", space)
|
||||
preprocessor = get_preprocessor(space)(space, self._options)
|
||||
self.preprocessors.append(preprocessor)
|
||||
size += np.product(preprocessor.shape)
|
||||
self.shape = (size, )
|
||||
size += preprocessor.size
|
||||
return (size, )
|
||||
|
||||
def transform(self, observation):
|
||||
assert len(observation) == len(self.preprocessors), observation
|
||||
return np.concatenate([
|
||||
np.reshape(p.transform(o), [np.product(p.shape)])
|
||||
np.reshape(p.transform(o), [p.size])
|
||||
for (o, p) in zip(observation, self.preprocessors)
|
||||
])
|
||||
|
||||
|
||||
class DictFlatteningPreprocessor(Preprocessor):
|
||||
"""Preprocesses each dict value, then flattens it all into a vector.
|
||||
|
||||
RLlib models will unpack the flattened output before _build_layers_v2().
|
||||
"""
|
||||
|
||||
def _init_shape(self, obs_space, options):
|
||||
assert isinstance(self._obs_space, gym.spaces.Dict)
|
||||
size = 0
|
||||
self.preprocessors = []
|
||||
for space in self._obs_space.spaces.values():
|
||||
print("Creating sub-preprocessor for", space)
|
||||
preprocessor = get_preprocessor(space)(space, self._options)
|
||||
self.preprocessors.append(preprocessor)
|
||||
size += preprocessor.size
|
||||
return (size, )
|
||||
|
||||
def transform(self, observation):
|
||||
assert len(observation) == len(self.preprocessors), \
|
||||
(len(observation), len(self.preprocessors))
|
||||
return np.concatenate([
|
||||
np.reshape(p.transform(o), [p.size])
|
||||
for (o, p) in zip(observation.values(), self.preprocessors)
|
||||
])
|
||||
|
||||
|
||||
def get_preprocessor(space):
|
||||
"""Returns an appropriate preprocessor class for the given space."""
|
||||
|
||||
@@ -141,6 +182,8 @@ def get_preprocessor(space):
|
||||
preprocessor = AtariRamPreprocessor
|
||||
elif isinstance(space, gym.spaces.Tuple):
|
||||
preprocessor = TupleFlatteningPreprocessor
|
||||
elif isinstance(space, gym.spaces.Dict):
|
||||
preprocessor = DictFlatteningPreprocessor
|
||||
else:
|
||||
preprocessor = NoPreprocessor
|
||||
|
||||
|
||||
@@ -12,7 +12,8 @@ from ray.rllib.models.misc import get_activation_fn, flatten
|
||||
class VisionNetwork(Model):
|
||||
"""Generic vision network."""
|
||||
|
||||
def _build_layers(self, inputs, num_outputs, options):
|
||||
def _build_layers_v2(self, input_dict, num_outputs, options):
|
||||
inputs = input_dict["obs"]
|
||||
filters = options.get("conv_filters")
|
||||
if not filters:
|
||||
filters = get_filter_config(options)
|
||||
|
||||
@@ -15,16 +15,18 @@ from ray.rllib.models.visionnet import VisionNetwork
|
||||
|
||||
|
||||
class CustomPreprocessor(Preprocessor):
|
||||
pass
|
||||
def _init_shape(self, obs_space, options):
|
||||
return None
|
||||
|
||||
|
||||
class CustomPreprocessor2(Preprocessor):
|
||||
pass
|
||||
def _init_shape(self, obs_space, options):
|
||||
return None
|
||||
|
||||
|
||||
class CustomModel(Model):
|
||||
def _build_layers(self, *args):
|
||||
return None, None
|
||||
return tf.constant([[0] * 5]), None
|
||||
|
||||
|
||||
class ModelCatalogTest(unittest.TestCase):
|
||||
@@ -69,20 +71,24 @@ class ModelCatalogTest(unittest.TestCase):
|
||||
ray.init()
|
||||
|
||||
with tf.variable_scope("test1"):
|
||||
p1 = ModelCatalog.get_model(
|
||||
np.zeros((10, 3), dtype=np.float32), 5, {})
|
||||
p1 = ModelCatalog.get_model({
|
||||
"obs": np.zeros((10, 3), dtype=np.float32)
|
||||
}, Box(0, 1, shape=(3, ), dtype=np.float32), 5, {})
|
||||
self.assertEqual(type(p1), FullyConnectedNetwork)
|
||||
|
||||
with tf.variable_scope("test2"):
|
||||
p2 = ModelCatalog.get_model(
|
||||
np.zeros((10, 84, 84, 3), dtype=np.float32), 5, {})
|
||||
p2 = ModelCatalog.get_model({
|
||||
"obs": np.zeros((10, 84, 84, 3), dtype=np.float32)
|
||||
}, Box(0, 1, shape=(84, 84, 3), dtype=np.float32), 5, {})
|
||||
self.assertEqual(type(p2), VisionNetwork)
|
||||
|
||||
def testCustomModel(self):
|
||||
ray.init()
|
||||
ModelCatalog.register_custom_model("foo", CustomModel)
|
||||
p1 = ModelCatalog.get_model(
|
||||
tf.constant([1, 2, 3]), 5, {"custom_model": "foo"})
|
||||
p1 = ModelCatalog.get_model({
|
||||
"obs": tf.constant([1, 2, 3])
|
||||
}, Box(0, 1, shape=(3, ), dtype=np.float32), 5,
|
||||
{"custom_model": "foo"})
|
||||
self.assertEqual(str(type(p1)), str(CustomModel))
|
||||
|
||||
|
||||
|
||||
@@ -314,6 +314,8 @@ class TestMultiAgentEnv(unittest.TestCase):
|
||||
def compute_actions(self,
|
||||
obs_batch,
|
||||
state_batches,
|
||||
prev_action_batch=None,
|
||||
prev_reward_batch=None,
|
||||
is_training=False,
|
||||
episodes=None):
|
||||
return [0] * len(obs_batch), [[h] * len(obs_batch)], {}
|
||||
@@ -337,6 +339,8 @@ class TestMultiAgentEnv(unittest.TestCase):
|
||||
def compute_actions(self,
|
||||
obs_batch,
|
||||
state_batches,
|
||||
prev_action_batch=None,
|
||||
prev_reward_batch=None,
|
||||
is_training=False,
|
||||
episodes=None):
|
||||
# Pretend we did a model-based rollout and want to return
|
||||
|
||||
@@ -0,0 +1,249 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import pickle
|
||||
|
||||
from gym import spaces
|
||||
from gym.envs.registration import EnvSpec
|
||||
import gym
|
||||
import tensorflow.contrib.slim as slim
|
||||
import tensorflow as tf
|
||||
import unittest
|
||||
|
||||
import ray
|
||||
from ray.rllib.agents.pg import PGAgent
|
||||
from ray.rllib.env.async_vector_env import AsyncVectorEnv
|
||||
from ray.rllib.env.vector_env import VectorEnv
|
||||
from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.models.model import Model
|
||||
from ray.rllib.test.test_serving_env import SimpleServing
|
||||
from ray.tune.registry import register_env
|
||||
|
||||
DICT_SPACE = spaces.Dict({
|
||||
"sensors": spaces.Dict({
|
||||
"position": spaces.Box(low=-100, high=100, shape=(3, )),
|
||||
"velocity": spaces.Box(low=-1, high=1, shape=(3, )),
|
||||
"front_cam": spaces.Tuple(
|
||||
(spaces.Box(low=0, high=1, shape=(10, 10, 3)),
|
||||
spaces.Box(low=0, high=1, shape=(10, 10, 3)))),
|
||||
"rear_cam": spaces.Box(low=0, high=1, shape=(10, 10, 3)),
|
||||
}),
|
||||
"inner_state": spaces.Dict({
|
||||
"charge": spaces.Discrete(100),
|
||||
"job_status": spaces.Dict({
|
||||
"task": spaces.Discrete(5),
|
||||
"progress": spaces.Box(low=0, high=100, shape=()),
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
DICT_SAMPLES = [DICT_SPACE.sample() for _ in range(10)]
|
||||
|
||||
TUPLE_SPACE = spaces.Tuple([
|
||||
spaces.Box(low=-100, high=100, shape=(3, )),
|
||||
spaces.Tuple((spaces.Box(low=0, high=1, shape=(10, 10, 3)),
|
||||
spaces.Box(low=0, high=1, shape=(10, 10, 3)))),
|
||||
spaces.Discrete(5),
|
||||
])
|
||||
|
||||
TUPLE_SAMPLES = [TUPLE_SPACE.sample() for _ in range(10)]
|
||||
|
||||
|
||||
def one_hot(i, n):
|
||||
out = [0.0] * n
|
||||
out[i] = 1.0
|
||||
return out
|
||||
|
||||
|
||||
class NestedDictEnv(gym.Env):
|
||||
def __init__(self):
|
||||
self.action_space = spaces.Discrete(2)
|
||||
self.observation_space = DICT_SPACE
|
||||
self._spec = EnvSpec("NestedDictEnv-v0")
|
||||
self.steps = 0
|
||||
|
||||
def reset(self):
|
||||
self.steps = 0
|
||||
return DICT_SAMPLES[0]
|
||||
|
||||
def step(self, action):
|
||||
self.steps += 1
|
||||
return DICT_SAMPLES[self.steps], 1, self.steps >= 5, {}
|
||||
|
||||
|
||||
class NestedTupleEnv(gym.Env):
|
||||
def __init__(self):
|
||||
self.action_space = spaces.Discrete(2)
|
||||
self.observation_space = TUPLE_SPACE
|
||||
self._spec = EnvSpec("NestedTupleEnv-v0")
|
||||
self.steps = 0
|
||||
|
||||
def reset(self):
|
||||
self.steps = 0
|
||||
return TUPLE_SAMPLES[0]
|
||||
|
||||
def step(self, action):
|
||||
self.steps += 1
|
||||
return TUPLE_SAMPLES[self.steps], 1, self.steps >= 5, {}
|
||||
|
||||
|
||||
class InvalidModel(Model):
|
||||
def _build_layers_v2(self, input_dict, num_outputs, options):
|
||||
return "not", "valid"
|
||||
|
||||
|
||||
class DictSpyModel(Model):
|
||||
capture_index = 0
|
||||
|
||||
def _build_layers_v2(self, input_dict, num_outputs, options):
|
||||
def spy(pos, front_cam, task):
|
||||
# TF runs this function in an isolated context, so we have to use
|
||||
# redis to communicate back to our suite
|
||||
ray.experimental.internal_kv._internal_kv_put(
|
||||
"d_spy_in_{}".format(DictSpyModel.capture_index),
|
||||
pickle.dumps((pos, front_cam, task)))
|
||||
DictSpyModel.capture_index += 1
|
||||
return 0
|
||||
|
||||
spy_fn = tf.py_func(
|
||||
spy, [
|
||||
input_dict["obs"]["sensors"]["position"],
|
||||
input_dict["obs"]["sensors"]["front_cam"][0],
|
||||
input_dict["obs"]["inner_state"]["job_status"]["task"]
|
||||
],
|
||||
tf.int64,
|
||||
stateful=True)
|
||||
|
||||
with tf.control_dependencies([spy_fn]):
|
||||
output = slim.fully_connected(
|
||||
input_dict["obs"]["sensors"]["position"], num_outputs)
|
||||
return output, output
|
||||
|
||||
|
||||
class TupleSpyModel(Model):
|
||||
capture_index = 0
|
||||
|
||||
def _build_layers_v2(self, input_dict, num_outputs, options):
|
||||
def spy(pos, cam, task):
|
||||
# TF runs this function in an isolated context, so we have to use
|
||||
# redis to communicate back to our suite
|
||||
ray.experimental.internal_kv._internal_kv_put(
|
||||
"t_spy_in_{}".format(TupleSpyModel.capture_index),
|
||||
pickle.dumps((pos, cam, task)))
|
||||
TupleSpyModel.capture_index += 1
|
||||
return 0
|
||||
|
||||
spy_fn = tf.py_func(
|
||||
spy, [
|
||||
input_dict["obs"][0],
|
||||
input_dict["obs"][1][0],
|
||||
input_dict["obs"][2],
|
||||
],
|
||||
tf.int64,
|
||||
stateful=True)
|
||||
|
||||
with tf.control_dependencies([spy_fn]):
|
||||
output = slim.fully_connected(input_dict["obs"][0], num_outputs)
|
||||
return output, output
|
||||
|
||||
|
||||
class NestedSpacesTest(unittest.TestCase):
|
||||
def testInvalidModel(self):
|
||||
ModelCatalog.register_custom_model("invalid", InvalidModel)
|
||||
self.assertRaises(ValueError, lambda: PGAgent(
|
||||
env="CartPole-v0", config={
|
||||
"model": {
|
||||
"custom_model": "invalid",
|
||||
},
|
||||
}))
|
||||
|
||||
def doTestNestedDict(self, make_env):
|
||||
ModelCatalog.register_custom_model("composite", DictSpyModel)
|
||||
register_env("nested", make_env)
|
||||
pg = PGAgent(
|
||||
env="nested",
|
||||
config={
|
||||
"num_workers": 0,
|
||||
"sample_batch_size": 5,
|
||||
"model": {
|
||||
"custom_model": "composite",
|
||||
},
|
||||
})
|
||||
pg.train()
|
||||
|
||||
# Check that the model sees the correct reconstructed observations
|
||||
for i in range(4):
|
||||
seen = pickle.loads(
|
||||
ray.experimental.internal_kv._internal_kv_get(
|
||||
"d_spy_in_{}".format(i)))
|
||||
pos_i = DICT_SAMPLES[i]["sensors"]["position"].tolist()
|
||||
cam_i = DICT_SAMPLES[i]["sensors"]["front_cam"][0].tolist()
|
||||
task_i = one_hot(
|
||||
DICT_SAMPLES[i]["inner_state"]["job_status"]["task"], 5)
|
||||
self.assertEqual(seen[0][0].tolist(), pos_i)
|
||||
self.assertEqual(seen[1][0].tolist(), cam_i)
|
||||
self.assertEqual(seen[2][0].tolist(), task_i)
|
||||
|
||||
def doTestNestedTuple(self, make_env):
|
||||
ModelCatalog.register_custom_model("composite2", TupleSpyModel)
|
||||
register_env("nested2", make_env)
|
||||
pg = PGAgent(
|
||||
env="nested2",
|
||||
config={
|
||||
"num_workers": 0,
|
||||
"sample_batch_size": 5,
|
||||
"model": {
|
||||
"custom_model": "composite2",
|
||||
},
|
||||
})
|
||||
pg.train()
|
||||
|
||||
# Check that the model sees the correct reconstructed observations
|
||||
for i in range(4):
|
||||
seen = pickle.loads(
|
||||
ray.experimental.internal_kv._internal_kv_get(
|
||||
"t_spy_in_{}".format(i)))
|
||||
pos_i = TUPLE_SAMPLES[i][0].tolist()
|
||||
cam_i = TUPLE_SAMPLES[i][1][0].tolist()
|
||||
task_i = one_hot(TUPLE_SAMPLES[i][2], 5)
|
||||
self.assertEqual(seen[0][0].tolist(), pos_i)
|
||||
self.assertEqual(seen[1][0].tolist(), cam_i)
|
||||
self.assertEqual(seen[2][0].tolist(), task_i)
|
||||
|
||||
def testNestedDictGym(self):
|
||||
self.doTestNestedDict(lambda _: NestedDictEnv())
|
||||
|
||||
def testNestedDictVector(self):
|
||||
self.doTestNestedDict(
|
||||
lambda _: VectorEnv.wrap(lambda i: NestedDictEnv()))
|
||||
|
||||
def testNestedDictServing(self):
|
||||
self.doTestNestedDict(lambda _: SimpleServing(NestedDictEnv()))
|
||||
|
||||
def testNestedDictAsync(self):
|
||||
self.assertRaisesRegexp(
|
||||
ValueError, "Found raw Dict space.*",
|
||||
lambda: self.doTestNestedDict(
|
||||
lambda _: AsyncVectorEnv.wrap_async(NestedDictEnv())))
|
||||
|
||||
def testNestedTupleGym(self):
|
||||
self.doTestNestedTuple(lambda _: NestedTupleEnv())
|
||||
|
||||
def testNestedTupleVector(self):
|
||||
self.doTestNestedTuple(
|
||||
lambda _: VectorEnv.wrap(lambda i: NestedTupleEnv()))
|
||||
|
||||
def testNestedTupleServing(self):
|
||||
self.doTestNestedTuple(lambda _: SimpleServing(NestedTupleEnv()))
|
||||
|
||||
def testNestedTupleAsync(self):
|
||||
self.assertRaisesRegexp(
|
||||
ValueError, "Found raw Tuple space.*",
|
||||
lambda: self.doTestNestedTuple(
|
||||
lambda _: AsyncVectorEnv.wrap_async(NestedTupleEnv())))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
ray.init(num_cpus=5)
|
||||
unittest.main(verbosity=2)
|
||||
@@ -3,6 +3,7 @@ from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import gym
|
||||
import numpy as np
|
||||
import time
|
||||
import unittest
|
||||
|
||||
@@ -21,6 +22,8 @@ class MockPolicyGraph(PolicyGraph):
|
||||
def compute_actions(self,
|
||||
obs_batch,
|
||||
state_batches,
|
||||
prev_action_batch=None,
|
||||
prev_reward_batch=None,
|
||||
is_training=False,
|
||||
episodes=None):
|
||||
return [0] * len(obs_batch), [], {}
|
||||
@@ -33,6 +36,8 @@ class BadPolicyGraph(PolicyGraph):
|
||||
def compute_actions(self,
|
||||
obs_batch,
|
||||
state_batches,
|
||||
prev_action_batch=None,
|
||||
prev_reward_batch=None,
|
||||
is_training=False,
|
||||
episodes=None):
|
||||
raise Exception("intentional error")
|
||||
@@ -107,8 +112,23 @@ class TestPolicyEvaluator(unittest.TestCase):
|
||||
env_creator=lambda _: gym.make("CartPole-v0"),
|
||||
policy_graph=MockPolicyGraph)
|
||||
batch = ev.sample()
|
||||
for key in ["obs", "actions", "rewards", "dones", "advantages"]:
|
||||
for key in [
|
||||
"obs", "actions", "rewards", "dones", "advantages",
|
||||
"prev_rewards", "prev_actions"
|
||||
]:
|
||||
self.assertIn(key, batch)
|
||||
|
||||
def to_prev(vec):
|
||||
out = np.zeros_like(vec)
|
||||
for i, v in enumerate(vec):
|
||||
if i + 1 < len(out) and not batch["dones"][i]:
|
||||
out[i + 1] = v
|
||||
return out.tolist()
|
||||
|
||||
self.assertEqual(batch["prev_rewards"].tolist(),
|
||||
to_prev(batch["rewards"]))
|
||||
self.assertEqual(batch["prev_actions"].tolist(),
|
||||
to_prev(batch["actions"]))
|
||||
self.assertGreater(batch["advantages"][0], 1)
|
||||
|
||||
def testGlobalVarsUpdate(self):
|
||||
|
||||
@@ -2,7 +2,7 @@ import unittest
|
||||
import traceback
|
||||
|
||||
import gym
|
||||
from gym.spaces import Box, Discrete, Tuple
|
||||
from gym.spaces import Box, Discrete, Tuple, Dict
|
||||
from gym.envs.registration import EnvSpec
|
||||
import numpy as np
|
||||
import sys
|
||||
@@ -14,33 +14,28 @@ from ray.tune.registry import register_env
|
||||
|
||||
ACTION_SPACES_TO_TEST = {
|
||||
"discrete": Discrete(5),
|
||||
"vector": Box(0.0, 1.0, (5, ), dtype=np.float32),
|
||||
"simple_tuple": Tuple([
|
||||
Box(0.0, 1.0, (5, ), dtype=np.float32),
|
||||
Box(0.0, 1.0, (5, ), dtype=np.float32)
|
||||
]),
|
||||
"mixed_tuple": Tuple(
|
||||
"vector": Box(-1.0, 1.0, (5, ), dtype=np.float32),
|
||||
"tuple": Tuple(
|
||||
[Discrete(2),
|
||||
Discrete(3),
|
||||
Box(0.0, 1.0, (5, ), dtype=np.float32)]),
|
||||
Box(-1.0, 1.0, (5, ), dtype=np.float32)]),
|
||||
}
|
||||
|
||||
OBSERVATION_SPACES_TO_TEST = {
|
||||
"discrete": Discrete(5),
|
||||
"vector": Box(0.0, 1.0, (5, ), dtype=np.float32),
|
||||
"image": Box(0.0, 1.0, (84, 84, 1), dtype=np.float32),
|
||||
"atari": Box(0.0, 1.0, (210, 160, 3), dtype=np.float32),
|
||||
"atari_ram": Box(0.0, 1.0, (128, ), dtype=np.float32),
|
||||
"simple_tuple": Tuple([
|
||||
Box(0.0, 1.0, (5, ), dtype=np.float32),
|
||||
Box(0.0, 1.0, (5, ), dtype=np.float32)
|
||||
]),
|
||||
"mixed_tuple": Tuple(
|
||||
[Discrete(10), Box(0.0, 1.0, (5, ), dtype=np.float32)]),
|
||||
"vector": Box(-1.0, 1.0, (5, ), dtype=np.float32),
|
||||
"image": Box(-1.0, 1.0, (84, 84, 1), dtype=np.float32),
|
||||
"atari": Box(-1.0, 1.0, (210, 160, 3), dtype=np.float32),
|
||||
"tuple": Tuple([Discrete(10),
|
||||
Box(-1.0, 1.0, (5, ), dtype=np.float32)]),
|
||||
"dict": Dict({
|
||||
"task": Discrete(10),
|
||||
"position": Box(-1.0, 1.0, (5, ), dtype=np.float32),
|
||||
}),
|
||||
}
|
||||
|
||||
|
||||
def make_stub_env(action_space, obs_space):
|
||||
def make_stub_env(action_space, obs_space, check_action_bounds):
|
||||
class StubEnv(gym.Env):
|
||||
def __init__(self):
|
||||
self.action_space = action_space
|
||||
@@ -52,16 +47,23 @@ def make_stub_env(action_space, obs_space):
|
||||
return sample
|
||||
|
||||
def step(self, action):
|
||||
if check_action_bounds and not self.action_space.contains(action):
|
||||
raise ValueError("Illegal action for {}: {}".format(
|
||||
self.action_space, action))
|
||||
if (isinstance(self.action_space, Tuple)
|
||||
and len(action) != len(self.action_space.spaces)):
|
||||
raise ValueError("Illegal action for {}: {}".format(
|
||||
self.action_space, action))
|
||||
return self.observation_space.sample(), 1, True, {}
|
||||
|
||||
return StubEnv
|
||||
|
||||
|
||||
def check_support(alg, config, stats):
|
||||
def check_support(alg, config, stats, check_bounds=False):
|
||||
for a_name, action_space in ACTION_SPACES_TO_TEST.items():
|
||||
for o_name, obs_space in OBSERVATION_SPACES_TO_TEST.items():
|
||||
print("=== Testing", alg, action_space, obs_space, "===")
|
||||
stub_env = make_stub_env(action_space, obs_space)
|
||||
stub_env = make_stub_env(action_space, obs_space, check_bounds)
|
||||
register_env("stub_env", lambda c: stub_env())
|
||||
stat = "ok"
|
||||
a = None
|
||||
@@ -105,8 +107,13 @@ class ModelSupportedSpaces(unittest.TestCase):
|
||||
"num_sgd_iter": 1,
|
||||
"train_batch_size": 10,
|
||||
"sample_batch_size": 10,
|
||||
"sgd_minibatch_size": 1
|
||||
}, stats)
|
||||
"sgd_minibatch_size": 1,
|
||||
"model": {
|
||||
"squash_to_range": True
|
||||
},
|
||||
},
|
||||
stats,
|
||||
check_bounds=True)
|
||||
check_support(
|
||||
"ES", {
|
||||
"num_workers": 1,
|
||||
|
||||
@@ -26,7 +26,8 @@ class TFRunBuilder(object):
|
||||
def add_feed_dict(self, feed_dict):
|
||||
assert not self._executed
|
||||
for k in feed_dict:
|
||||
assert k not in self.feed_dict
|
||||
if k in self.feed_dict:
|
||||
raise ValueError("Key added twice: {}".format(k))
|
||||
self.feed_dict.update(feed_dict)
|
||||
|
||||
def add_fetches(self, fetches):
|
||||
|
||||
Reference in New Issue
Block a user