mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 19:00:36 +08:00
[RLlib] Tf2x preparation; part 2 (upgrading try_import_tf()). (#9136)
* WIP. * Fixes. * LINT. * WIP. * WIP. * Fixes. * Fixes. * Fixes. * Fixes. * WIP. * Fixes. * Test * Fix. * Fixes and LINT. * Fixes and LINT. * LINT.
This commit is contained in:
@@ -4,7 +4,7 @@ import numpy as np
|
||||
from ray.rllib.utils import force_list
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
|
||||
def unflatten(vector, shapes):
|
||||
@@ -79,24 +79,29 @@ class TensorFlowVariables:
|
||||
variable_names.append(tf_obj.node_def.name)
|
||||
self.variables = OrderedDict()
|
||||
variable_list = [
|
||||
v for v in tf.global_variables()
|
||||
v for v in tf1.global_variables()
|
||||
if v.op.node_def.name in variable_names
|
||||
]
|
||||
if input_variables is not None:
|
||||
variable_list += input_variables
|
||||
for v in variable_list:
|
||||
self.variables[v.op.node_def.name] = v
|
||||
|
||||
self.placeholders = {}
|
||||
self.assignment_nodes = {}
|
||||
if not tf1.executing_eagerly():
|
||||
for v in variable_list:
|
||||
self.variables[v.op.node_def.name] = v
|
||||
|
||||
# Create new placeholders to put in custom weights.
|
||||
for k, var in self.variables.items():
|
||||
self.placeholders[k] = tf.placeholder(
|
||||
var.value().dtype,
|
||||
var.get_shape().as_list(),
|
||||
name="Placeholder_" + k)
|
||||
self.assignment_nodes[k] = var.assign(self.placeholders[k])
|
||||
self.placeholders = {}
|
||||
self.assignment_nodes = {}
|
||||
|
||||
# Create new placeholders to put in custom weights.
|
||||
for k, var in self.variables.items():
|
||||
self.placeholders[k] = tf1.placeholder(
|
||||
var.value().dtype,
|
||||
var.get_shape().as_list(),
|
||||
name="Placeholder_" + k)
|
||||
self.assignment_nodes[k] = var.assign(self.placeholders[k])
|
||||
else:
|
||||
for v in variable_list:
|
||||
self.variables[v.name] = v
|
||||
|
||||
def set_session(self, sess):
|
||||
"""Sets the current session used by the class.
|
||||
@@ -117,10 +122,12 @@ class TensorFlowVariables:
|
||||
|
||||
def _check_sess(self):
|
||||
"""Checks if the session is set, and if not throw an error message."""
|
||||
assert self.sess is not None, ("The session is not set. Set the "
|
||||
"session either by passing it into the "
|
||||
"TensorFlowVariables constructor or by "
|
||||
"calling set_session(sess).")
|
||||
if tf1.executing_eagerly():
|
||||
return
|
||||
assert self.sess is not None, \
|
||||
"The session is not set. Set the session either by passing it " \
|
||||
"into the TensorFlowVariables constructor or by calling " \
|
||||
"set_session(sess)."
|
||||
|
||||
def get_flat(self):
|
||||
"""Gets the weights and returns them as a flat array.
|
||||
@@ -129,6 +136,11 @@ class TensorFlowVariables:
|
||||
1D Array containing the flattened weights.
|
||||
"""
|
||||
self._check_sess()
|
||||
# Eager mode.
|
||||
if not self.sess:
|
||||
return np.concatenate(
|
||||
[v.numpy().flatten() for v in self.variables.values()])
|
||||
# Graph mode.
|
||||
return np.concatenate([
|
||||
v.eval(session=self.sess).flatten()
|
||||
for v in self.variables.values()
|
||||
@@ -147,12 +159,16 @@ class TensorFlowVariables:
|
||||
self._check_sess()
|
||||
shapes = [v.get_shape().as_list() for v in self.variables.values()]
|
||||
arrays = unflatten(new_weights, shapes)
|
||||
placeholders = [
|
||||
self.placeholders[k] for k, v in self.variables.items()
|
||||
]
|
||||
self.sess.run(
|
||||
list(self.assignment_nodes.values()),
|
||||
feed_dict=dict(zip(placeholders, arrays)))
|
||||
if not self.sess:
|
||||
for v, a in zip(self.variables.values(), arrays):
|
||||
v.assign(a)
|
||||
else:
|
||||
placeholders = [
|
||||
self.placeholders[k] for k, v in self.variables.items()
|
||||
]
|
||||
self.sess.run(
|
||||
list(self.assignment_nodes.values()),
|
||||
feed_dict=dict(zip(placeholders, arrays)))
|
||||
|
||||
def get_weights(self):
|
||||
"""Returns a dictionary containing the weights of the network.
|
||||
@@ -161,6 +177,10 @@ class TensorFlowVariables:
|
||||
Dictionary mapping variable names to their weights.
|
||||
"""
|
||||
self._check_sess()
|
||||
# Eager mode.
|
||||
if not self.sess:
|
||||
return self.variables
|
||||
# Graph mode.
|
||||
return self.sess.run(self.variables)
|
||||
|
||||
def set_weights(self, new_weights):
|
||||
|
||||
@@ -344,6 +344,7 @@ py_test(
|
||||
args = ["--yaml-dir=tuned_examples/sac", "--torch"]
|
||||
)
|
||||
|
||||
|
||||
# TD3
|
||||
py_test(
|
||||
name = "run_regression_tests_pendulum_td3_tf",
|
||||
@@ -1013,6 +1014,13 @@ py_test(
|
||||
srcs = ["models/tests/test_distributions.py"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "test_attention_nets",
|
||||
tags = ["models"],
|
||||
size = "small",
|
||||
srcs = ["models/tests/test_attention_nets.py"]
|
||||
)
|
||||
|
||||
# --------------------------------------------------------------------
|
||||
# Optimizers and Memories
|
||||
# rllib/execution/
|
||||
|
||||
@@ -9,7 +9,7 @@ from ray.rllib.policy.tf_policy import LearningRateSchedule
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
from ray.rllib.utils.tf_ops import explained_variance, make_tf_callable
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
|
||||
class A3CLoss:
|
||||
|
||||
@@ -13,7 +13,7 @@ from ray.rllib.utils.filter import get_filter
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
from ray.rllib.utils.spaces.space_utils import unbatch
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
|
||||
class ARSTFPolicy:
|
||||
@@ -29,8 +29,8 @@ class ARSTFPolicy:
|
||||
self.single_threaded = config.get("single_threaded", False)
|
||||
self.sess = make_session(single_threaded=self.single_threaded)
|
||||
|
||||
self.inputs = tf.placeholder(tf.float32,
|
||||
[None] + list(self.preprocessor.shape))
|
||||
self.inputs = tf1.placeholder(tf.float32,
|
||||
[None] + list(self.preprocessor.shape))
|
||||
|
||||
# Policy network.
|
||||
dist_class, dist_dim = ModelCatalog.get_action_dist(
|
||||
@@ -52,7 +52,7 @@ class ARSTFPolicy:
|
||||
self.num_params = sum(
|
||||
np.prod(variable.shape.as_list())
|
||||
for _, variable in self.variables.variables.items())
|
||||
self.sess.run(tf.global_variables_initializer())
|
||||
self.sess.run(tf1.global_variables_initializer())
|
||||
|
||||
def compute_actions(self,
|
||||
observation,
|
||||
|
||||
@@ -3,7 +3,7 @@ import numpy as np
|
||||
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
|
||||
class DDPGTFModel(TFModelV2):
|
||||
|
||||
@@ -22,7 +22,7 @@ from ray.rllib.utils.framework import try_import_tf
|
||||
from ray.rllib.utils.tf_ops import huber_loss, minimize_and_clip, \
|
||||
make_tf_callable
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -126,18 +126,18 @@ def ddpg_actor_critic_loss(policy, model, _, train_batch):
|
||||
target_model_out_tp1, _ = policy.target_model(input_dict_next, [], None)
|
||||
|
||||
# Policy network evaluation.
|
||||
with tf.variable_scope(POLICY_SCOPE, reuse=True):
|
||||
# prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
|
||||
with tf1.variable_scope(POLICY_SCOPE, reuse=True):
|
||||
# prev_update_ops = set(tf1.get_collection(tf.GraphKeys.UPDATE_OPS))
|
||||
policy_t = model.get_policy_output(model_out_t)
|
||||
# policy_batchnorm_update_ops = list(
|
||||
# set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops)
|
||||
# set(tf1.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops)
|
||||
|
||||
with tf.variable_scope(POLICY_TARGET_SCOPE):
|
||||
with tf1.variable_scope(POLICY_TARGET_SCOPE):
|
||||
policy_tp1 = \
|
||||
policy.target_model.get_policy_output(target_model_out_tp1)
|
||||
|
||||
# Action outputs.
|
||||
with tf.variable_scope(ACTION_SCOPE, reuse=True):
|
||||
with tf1.variable_scope(ACTION_SCOPE, reuse=True):
|
||||
if policy.config["smooth_target_policy"]:
|
||||
target_noise_clip = policy.config["target_noise_clip"]
|
||||
clipped_normal_sample = tf.clip_by_value(
|
||||
@@ -154,29 +154,29 @@ def ddpg_actor_critic_loss(policy, model, _, train_batch):
|
||||
policy_tp1_smoothed = policy_tp1
|
||||
|
||||
# Q-net(s) evaluation.
|
||||
# prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
|
||||
with tf.variable_scope(Q_SCOPE):
|
||||
# prev_update_ops = set(tf1.get_collection(tf.GraphKeys.UPDATE_OPS))
|
||||
with tf1.variable_scope(Q_SCOPE):
|
||||
# Q-values for given actions & observations in given current
|
||||
q_t = model.get_q_values(model_out_t, train_batch[SampleBatch.ACTIONS])
|
||||
|
||||
with tf.variable_scope(Q_SCOPE, reuse=True):
|
||||
with tf1.variable_scope(Q_SCOPE, reuse=True):
|
||||
# Q-values for current policy (no noise) in given current state
|
||||
q_t_det_policy = model.get_q_values(model_out_t, policy_t)
|
||||
|
||||
if twin_q:
|
||||
with tf.variable_scope(TWIN_Q_SCOPE):
|
||||
with tf1.variable_scope(TWIN_Q_SCOPE):
|
||||
twin_q_t = model.get_twin_q_values(
|
||||
model_out_t, train_batch[SampleBatch.ACTIONS])
|
||||
# q_batchnorm_update_ops = list(
|
||||
# set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops)
|
||||
# set(tf1.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops)
|
||||
|
||||
# Target q-net(s) evaluation.
|
||||
with tf.variable_scope(Q_TARGET_SCOPE):
|
||||
with tf1.variable_scope(Q_TARGET_SCOPE):
|
||||
q_tp1 = policy.target_model.get_q_values(target_model_out_tp1,
|
||||
policy_tp1_smoothed)
|
||||
|
||||
if twin_q:
|
||||
with tf.variable_scope(TWIN_Q_TARGET_SCOPE):
|
||||
with tf1.variable_scope(TWIN_Q_TARGET_SCOPE):
|
||||
twin_q_tp1 = policy.target_model.get_twin_q_values(
|
||||
target_model_out_tp1, policy_tp1_smoothed)
|
||||
|
||||
@@ -220,10 +220,10 @@ def ddpg_actor_critic_loss(policy, model, _, train_batch):
|
||||
if l2_reg is not None:
|
||||
for var in policy.model.policy_variables():
|
||||
if "bias" not in var.name:
|
||||
actor_loss += (l2_reg * tf.nn.l2_loss(var))
|
||||
actor_loss += (l2_reg * tf1.nn.l2_loss(var))
|
||||
for var in policy.model.q_variables():
|
||||
if "bias" not in var.name:
|
||||
critic_loss += (l2_reg * tf.nn.l2_loss(var))
|
||||
critic_loss += (l2_reg * tf1.nn.l2_loss(var))
|
||||
|
||||
# Model self-supervised losses.
|
||||
if policy.config["use_state_preprocessor"]:
|
||||
@@ -259,9 +259,9 @@ def ddpg_actor_critic_loss(policy, model, _, train_batch):
|
||||
|
||||
def make_ddpg_optimizers(policy, config):
|
||||
# Create separate optimizers for actor & critic losses.
|
||||
policy._actor_optimizer = tf.train.AdamOptimizer(
|
||||
policy._actor_optimizer = tf1.train.AdamOptimizer(
|
||||
learning_rate=config["actor_lr"])
|
||||
policy._critic_optimizer = tf.train.AdamOptimizer(
|
||||
policy._critic_optimizer = tf1.train.AdamOptimizer(
|
||||
learning_rate=config["critic_lr"])
|
||||
return None
|
||||
|
||||
@@ -286,7 +286,7 @@ def build_apply_op(policy, optimizer, grads_and_vars):
|
||||
# For policy gradient, update policy net one time v.s.
|
||||
# update critic net `policy_delay` time(s).
|
||||
should_apply_actor_opt = tf.equal(
|
||||
tf.mod(policy.global_step, policy.config["policy_delay"]), 0)
|
||||
tf.math.floormod(policy.global_step, policy.config["policy_delay"]), 0)
|
||||
|
||||
def make_apply_op():
|
||||
return policy._actor_optimizer.apply_gradients(
|
||||
@@ -299,7 +299,7 @@ def build_apply_op(policy, optimizer, grads_and_vars):
|
||||
critic_op = policy._critic_optimizer.apply_gradients(
|
||||
policy._critic_grads_and_vars)
|
||||
# Increment global step & apply ops.
|
||||
with tf.control_dependencies([tf.assign_add(policy.global_step, 1)]):
|
||||
with tf1.control_dependencies([tf1.assign_add(policy.global_step, 1)]):
|
||||
return tf.group(actor_op, critic_op)
|
||||
|
||||
|
||||
@@ -341,7 +341,7 @@ def build_ddpg_stats(policy, batch):
|
||||
|
||||
def before_init_fn(policy, obs_space, action_space, config):
|
||||
# Create global step for counting the number of update operations.
|
||||
policy.global_step = tf.train.get_or_create_global_step()
|
||||
policy.global_step = tf1.train.get_or_create_global_step()
|
||||
|
||||
|
||||
class ComputeTDErrorMixin:
|
||||
|
||||
@@ -49,10 +49,10 @@ def ddpg_actor_critic_loss(policy, model, _, train_batch):
|
||||
target_model_out_tp1, _ = policy.target_model(input_dict_next, [], None)
|
||||
|
||||
# Policy network evaluation.
|
||||
# prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
|
||||
# prev_update_ops = set(tf1.get_collection(tf.GraphKeys.UPDATE_OPS))
|
||||
policy_t = model.get_policy_output(model_out_t)
|
||||
# policy_batchnorm_update_ops = list(
|
||||
# set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops)
|
||||
# set(tf1.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops)
|
||||
|
||||
policy_tp1 = \
|
||||
policy.target_model.get_policy_output(target_model_out_tp1)
|
||||
@@ -73,7 +73,7 @@ def ddpg_actor_critic_loss(policy, model, _, train_batch):
|
||||
policy_tp1_smoothed = policy_tp1
|
||||
|
||||
# Q-net(s) evaluation.
|
||||
# prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
|
||||
# prev_update_ops = set(tf1.get_collection(tf.GraphKeys.UPDATE_OPS))
|
||||
# Q-values for given actions & observations in given current
|
||||
q_t = model.get_q_values(model_out_t, train_batch[SampleBatch.ACTIONS])
|
||||
|
||||
@@ -86,7 +86,7 @@ def ddpg_actor_critic_loss(policy, model, _, train_batch):
|
||||
twin_q_t = model.get_twin_q_values(model_out_t,
|
||||
train_batch[SampleBatch.ACTIONS])
|
||||
# q_batchnorm_update_ops = list(
|
||||
# set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops)
|
||||
# set(tf1.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops)
|
||||
|
||||
# Target q-net(s) evaluation.
|
||||
q_tp1 = policy.target_model.get_q_values(target_model_out_tp1,
|
||||
|
||||
@@ -4,7 +4,7 @@ from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
|
||||
from ray.rllib.utils.annotations import override
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
_, tf, _ = try_import_tf()
|
||||
|
||||
|
||||
class NoopModel(TFModelV2):
|
||||
|
||||
@@ -6,7 +6,7 @@ from ray.rllib.utils.framework import try_import_tf
|
||||
from ray.rllib.utils.test_utils import check, check_compute_single_action, \
|
||||
framework_iterator
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
|
||||
class TestTD3(unittest.TestCase):
|
||||
@@ -32,8 +32,9 @@ class TestTD3(unittest.TestCase):
|
||||
|
||||
# Test against all frameworks.
|
||||
for _ in framework_iterator(config, frameworks="tf"):
|
||||
lcl_config = config.copy()
|
||||
# Default GaussianNoise setup.
|
||||
trainer = td3.TD3Trainer(config=config, env="Pendulum-v0")
|
||||
trainer = td3.TD3Trainer(config=lcl_config, env="Pendulum-v0")
|
||||
# Setting explore=False should always return the same action.
|
||||
a_ = trainer.compute_action(obs, explore=False)
|
||||
for _ in range(50):
|
||||
@@ -44,9 +45,10 @@ class TestTD3(unittest.TestCase):
|
||||
for _ in range(50):
|
||||
actions.append(trainer.compute_action(obs))
|
||||
check(np.std(actions), 0.0, false=True)
|
||||
trainer.stop()
|
||||
|
||||
# Check randomness at beginning.
|
||||
config["exploration_config"] = {
|
||||
lcl_config["exploration_config"] = {
|
||||
# Act randomly at beginning ...
|
||||
"random_timesteps": 30,
|
||||
# Then act very closely to deterministic actions thereafter.
|
||||
@@ -54,7 +56,7 @@ class TestTD3(unittest.TestCase):
|
||||
"initial_scale": 0.001,
|
||||
"final_scale": 0.001,
|
||||
}
|
||||
trainer = td3.TD3Trainer(config=config, env="Pendulum-v0")
|
||||
trainer = td3.TD3Trainer(config=lcl_config, env="Pendulum-v0")
|
||||
# ts=1 (get a deterministic action as per explore=False).
|
||||
deterministic_action = trainer.compute_action(obs, explore=False)
|
||||
# ts=2-5 (in random window).
|
||||
@@ -73,6 +75,7 @@ class TestTD3(unittest.TestCase):
|
||||
for _ in range(50):
|
||||
a = trainer.compute_action(obs, explore=False)
|
||||
check(a, deterministic_action)
|
||||
trainer.stop()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -3,7 +3,7 @@ import numpy as np
|
||||
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
|
||||
class DistributionalQTFModel(TFModelV2):
|
||||
@@ -155,7 +155,7 @@ class DistributionalQTFModel(TFModelV2):
|
||||
units=num_atoms, activation=None)(state_out)
|
||||
return state_score
|
||||
|
||||
if tf.executing_eagerly():
|
||||
if tf1.executing_eagerly():
|
||||
from tensorflow.python.ops import variable_scope
|
||||
# Have to use a variable store to reuse variables in eager mode
|
||||
store = variable_scope.EagerVariableStore()
|
||||
@@ -163,30 +163,32 @@ class DistributionalQTFModel(TFModelV2):
|
||||
# Save the scope objects, since in eager we will execute this
|
||||
# path repeatedly and there is no guarantee it will always be run
|
||||
# in the same original scope.
|
||||
with tf.variable_scope(name + "/action_value") as action_scope:
|
||||
with tf1.variable_scope(name + "/action_value") as action_scope:
|
||||
pass
|
||||
with tf.variable_scope(name + "/state_value") as state_scope:
|
||||
with tf1.variable_scope(name + "/state_value") as state_scope:
|
||||
pass
|
||||
|
||||
def build_action_value_in_scope(model_out):
|
||||
with store.as_default():
|
||||
with tf.variable_scope(action_scope, reuse=tf.AUTO_REUSE):
|
||||
with tf1.variable_scope(
|
||||
action_scope, reuse=tf1.AUTO_REUSE):
|
||||
return build_action_value(model_out)
|
||||
|
||||
def build_state_score_in_scope(model_out):
|
||||
with store.as_default():
|
||||
with tf.variable_scope(state_scope, reuse=tf.AUTO_REUSE):
|
||||
with tf1.variable_scope(
|
||||
state_scope, reuse=tf1.AUTO_REUSE):
|
||||
return build_state_score(model_out)
|
||||
else:
|
||||
|
||||
def build_action_value_in_scope(model_out):
|
||||
with tf.variable_scope(
|
||||
name + "/action_value", reuse=tf.AUTO_REUSE):
|
||||
with tf1.variable_scope(
|
||||
name + "/action_value", reuse=tf1.AUTO_REUSE):
|
||||
return build_action_value(model_out)
|
||||
|
||||
def build_state_score_in_scope(model_out):
|
||||
with tf.variable_scope(
|
||||
name + "/state_value", reuse=tf.AUTO_REUSE):
|
||||
with tf1.variable_scope(
|
||||
name + "/state_value", reuse=tf1.AUTO_REUSE):
|
||||
return build_state_score(model_out)
|
||||
|
||||
q_out = build_action_value_in_scope(self.model_out)
|
||||
@@ -241,33 +243,33 @@ class DistributionalQTFModel(TFModelV2):
|
||||
epsilon_w = tf.matmul(
|
||||
a=tf.expand_dims(epsilon_in, -1), b=tf.expand_dims(epsilon_out, 0))
|
||||
epsilon_b = epsilon_out
|
||||
sigma_w = tf.get_variable(
|
||||
sigma_w = tf1.get_variable(
|
||||
name=prefix + "_sigma_w",
|
||||
shape=[in_size, out_size],
|
||||
dtype=tf.float32,
|
||||
initializer=tf.random_uniform_initializer(
|
||||
initializer=tf1.random_uniform_initializer(
|
||||
minval=-1.0 / np.sqrt(float(in_size)),
|
||||
maxval=1.0 / np.sqrt(float(in_size))))
|
||||
# TF noise generation can be unreliable on GPU
|
||||
# If generating the noise on the CPU,
|
||||
# lowering sigma0 to 0.1 may be helpful
|
||||
sigma_b = tf.get_variable(
|
||||
sigma_b = tf1.get_variable(
|
||||
name=prefix + "_sigma_b",
|
||||
shape=[out_size],
|
||||
dtype=tf.float32, # 0.5~GPU, 0.1~CPU
|
||||
initializer=tf.constant_initializer(
|
||||
initializer=tf1.constant_initializer(
|
||||
sigma0 / np.sqrt(float(in_size))))
|
||||
|
||||
w = tf.get_variable(
|
||||
w = tf1.get_variable(
|
||||
name=prefix + "_fc_w",
|
||||
shape=[in_size, out_size],
|
||||
dtype=tf.float32,
|
||||
initializer=tf.initializers.glorot_uniform())
|
||||
b = tf.get_variable(
|
||||
initializer=tf.initializers.GlorotUniform())
|
||||
b = tf1.get_variable(
|
||||
name=prefix + "_fc_b",
|
||||
shape=[out_size],
|
||||
dtype=tf.float32,
|
||||
initializer=tf.zeros_initializer())
|
||||
initializer=tf.initializers.Zeros())
|
||||
|
||||
action_activation = \
|
||||
tf.keras.layers.Lambda(lambda x: tf.matmul(
|
||||
|
||||
@@ -17,7 +17,7 @@ from ray.rllib.utils.tf_ops import huber_loss, reduce_mean_ignore_inf, \
|
||||
minimize_and_clip
|
||||
from ray.rllib.utils.tf_ops import make_tf_callable
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
Q_SCOPE = "q_func"
|
||||
Q_TARGET_SCOPE = "target_q_func"
|
||||
@@ -253,7 +253,7 @@ def build_q_losses(policy, model, _, train_batch):
|
||||
|
||||
|
||||
def adam_optimizer(policy, config):
|
||||
return tf.train.AdamOptimizer(
|
||||
return tf1.train.AdamOptimizer(
|
||||
learning_rate=policy.cur_lr, epsilon=config["adam_epsilon"])
|
||||
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
|
||||
class SimpleQModel(TFModelV2):
|
||||
|
||||
@@ -15,7 +15,7 @@ from ray.rllib.policy.tf_policy_template import build_tf_policy
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
from ray.rllib.utils.tf_ops import huber_loss, make_tf_callable
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
Q_SCOPE = "q_func"
|
||||
|
||||
@@ -7,7 +7,7 @@ from ray.rllib.utils.framework import try_import_tf
|
||||
from ray.rllib.utils.test_utils import check, check_compute_single_action, \
|
||||
framework_iterator
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
|
||||
class TestDQN(unittest.TestCase):
|
||||
|
||||
@@ -11,7 +11,7 @@ from ray.rllib.utils.numpy import fc, one_hot, huber_loss
|
||||
from ray.rllib.utils.test_utils import check, check_compute_single_action, \
|
||||
framework_iterator
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
|
||||
class TestSimpleQ(unittest.TestCase):
|
||||
|
||||
@@ -14,7 +14,7 @@ from ray.rllib.utils.framework import try_import_tf
|
||||
from ray.rllib.utils.spaces.space_utils import get_base_struct_from_space, \
|
||||
unbatch
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
tree = try_import_tree()
|
||||
|
||||
|
||||
@@ -60,9 +60,9 @@ def rollout(policy, env, timestep_limit=None, add_noise=False, offset=0.0):
|
||||
|
||||
def make_session(single_threaded):
|
||||
if not single_threaded:
|
||||
return tf.Session()
|
||||
return tf.Session(
|
||||
config=tf.ConfigProto(
|
||||
return tf1.Session()
|
||||
return tf1.Session(
|
||||
config=tf1.ConfigProto(
|
||||
inter_op_parallelism_threads=1, intra_op_parallelism_threads=1))
|
||||
|
||||
|
||||
@@ -77,8 +77,8 @@ class ESTFPolicy:
|
||||
self.preprocessor.shape)
|
||||
self.single_threaded = config.get("single_threaded", False)
|
||||
self.sess = make_session(single_threaded=self.single_threaded)
|
||||
self.inputs = tf.placeholder(tf.float32,
|
||||
[None] + list(self.preprocessor.shape))
|
||||
self.inputs = tf1.placeholder(tf.float32,
|
||||
[None] + list(self.preprocessor.shape))
|
||||
|
||||
# Policy network.
|
||||
dist_class, dist_dim = ModelCatalog.get_action_dist(
|
||||
@@ -98,7 +98,7 @@ class ESTFPolicy:
|
||||
self.num_params = sum(
|
||||
np.prod(variable.shape.as_list())
|
||||
for _, variable in self.variables.variables.items())
|
||||
self.sess.run(tf.global_variables_initializer())
|
||||
self.sess.run(tf1.global_variables_initializer())
|
||||
|
||||
def compute_actions(self,
|
||||
observation,
|
||||
|
||||
@@ -6,7 +6,7 @@ from ray.rllib.utils.framework import try_import_tf
|
||||
from ray.rllib.utils.test_utils import check_compute_single_action, \
|
||||
framework_iterator
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
|
||||
class TestIMPALA(unittest.TestCase):
|
||||
|
||||
@@ -30,7 +30,7 @@ from ray.rllib.utils.framework import try_import_tf, try_import_torch
|
||||
from ray.rllib.utils.numpy import softmax
|
||||
from ray.rllib.utils.test_utils import check, framework_iterator
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
torch, nn = try_import_torch()
|
||||
|
||||
|
||||
@@ -185,20 +185,20 @@ class VtraceTest(unittest.TestCase):
|
||||
# can deal with that.
|
||||
inputs_ = {
|
||||
# T, B, NUM_ACTIONS
|
||||
"behaviour_policy_logits": tf.placeholder(
|
||||
"behaviour_policy_logits": tf1.placeholder(
|
||||
dtype=tf.float32, shape=[None, None, None]),
|
||||
# T, B, NUM_ACTIONS
|
||||
"target_policy_logits": tf.placeholder(
|
||||
"target_policy_logits": tf1.placeholder(
|
||||
dtype=tf.float32, shape=[None, None, None]),
|
||||
"actions": tf.placeholder(
|
||||
"actions": tf1.placeholder(
|
||||
dtype=tf.int32, shape=[None, None]),
|
||||
"discounts": tf.placeholder(
|
||||
"discounts": tf1.placeholder(
|
||||
dtype=tf.float32, shape=[None, None]),
|
||||
"rewards": tf.placeholder(
|
||||
"rewards": tf1.placeholder(
|
||||
dtype=tf.float32, shape=[None, None]),
|
||||
"values": tf.placeholder(
|
||||
"values": tf1.placeholder(
|
||||
dtype=tf.float32, shape=[None, None]),
|
||||
"bootstrap_value": tf.placeholder(
|
||||
"bootstrap_value": tf1.placeholder(
|
||||
dtype=tf.float32, shape=[None]),
|
||||
}
|
||||
else:
|
||||
@@ -282,15 +282,15 @@ class VtraceTest(unittest.TestCase):
|
||||
vtrace = vtrace_tf if fw != "torch" else vtrace_torch
|
||||
if fw == "tf":
|
||||
inputs_ = {
|
||||
"log_rhos": tf.placeholder(
|
||||
"log_rhos": tf1.placeholder(
|
||||
dtype=tf.float32, shape=[None, None, 1]),
|
||||
"discounts": tf.placeholder(
|
||||
"discounts": tf1.placeholder(
|
||||
dtype=tf.float32, shape=[None, None, 1]),
|
||||
"rewards": tf.placeholder(
|
||||
"rewards": tf1.placeholder(
|
||||
dtype=tf.float32, shape=[None, None, 42]),
|
||||
"values": tf.placeholder(
|
||||
"values": tf1.placeholder(
|
||||
dtype=tf.float32, shape=[None, None, 42]),
|
||||
"bootstrap_value": tf.placeholder(
|
||||
"bootstrap_value": tf1.placeholder(
|
||||
dtype=tf.float32, shape=[None, 42])
|
||||
}
|
||||
else:
|
||||
@@ -310,16 +310,16 @@ class VtraceTest(unittest.TestCase):
|
||||
vtrace = vtrace_tf if fw != "torch" else vtrace_torch
|
||||
if fw == "tf":
|
||||
inputs_ = {
|
||||
"log_rhos": tf.placeholder(
|
||||
"log_rhos": tf1.placeholder(
|
||||
dtype=tf.float32, shape=[None, None, 1]),
|
||||
"discounts": tf.placeholder(
|
||||
"discounts": tf1.placeholder(
|
||||
dtype=tf.float32, shape=[None, None, 1]),
|
||||
"rewards": tf.placeholder(
|
||||
"rewards": tf1.placeholder(
|
||||
dtype=tf.float32, shape=[None, None, 42]),
|
||||
"values": tf.placeholder(
|
||||
"values": tf1.placeholder(
|
||||
dtype=tf.float32, shape=[None, None, 42]),
|
||||
# Should be [None, 42].
|
||||
"bootstrap_value": tf.placeholder(
|
||||
"bootstrap_value": tf1.placeholder(
|
||||
dtype=tf.float32, shape=[None])
|
||||
}
|
||||
else:
|
||||
|
||||
@@ -33,7 +33,7 @@ import collections
|
||||
from ray.rllib.models.tf.tf_action_dist import Categorical
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
VTraceFromLogitsReturns = collections.namedtuple("VTraceFromLogitsReturns", [
|
||||
"vs", "pg_advantages", "log_rhos", "behaviour_action_log_probs",
|
||||
@@ -222,7 +222,7 @@ def multi_from_logits(behaviour_policy_logits,
|
||||
behaviour_policy_logits[i].shape.assert_has_rank(3)
|
||||
target_policy_logits[i].shape.assert_has_rank(3)
|
||||
|
||||
with tf.name_scope(
|
||||
with tf1.name_scope(
|
||||
name,
|
||||
values=[
|
||||
behaviour_policy_logits, target_policy_logits, actions,
|
||||
@@ -332,21 +332,22 @@ def from_importance_weights(log_rhos,
|
||||
if clip_pg_rho_threshold is not None:
|
||||
clip_pg_rho_threshold.shape.assert_has_rank(0)
|
||||
|
||||
with tf.name_scope(
|
||||
with tf1.name_scope(
|
||||
name,
|
||||
values=[log_rhos, discounts, rewards, values, bootstrap_value]):
|
||||
rhos = tf.exp(log_rhos)
|
||||
rhos = tf.math.exp(log_rhos)
|
||||
if clip_rho_threshold is not None:
|
||||
clipped_rhos = tf.minimum(
|
||||
clip_rho_threshold, rhos, name="clipped_rhos")
|
||||
|
||||
tf.summary.histogram("clipped_rhos_1000", tf.minimum(1000.0, rhos))
|
||||
tf.summary.scalar(
|
||||
tf1.summary.histogram(
|
||||
"clipped_rhos_1000", tf.minimum(1000.0, rhos))
|
||||
tf1.summary.scalar(
|
||||
"num_of_clipped_rhos",
|
||||
tf.reduce_sum(
|
||||
tf.cast(
|
||||
tf.equal(clipped_rhos, clip_rho_threshold), tf.int32)))
|
||||
tf.summary.scalar("size_of_clipped_rhos", tf.size(clipped_rhos))
|
||||
tf1.summary.scalar("size_of_clipped_rhos", tf.size(clipped_rhos))
|
||||
else:
|
||||
clipped_rhos = rhos
|
||||
|
||||
|
||||
@@ -16,7 +16,7 @@ from ray.rllib.policy.tf_policy import LearningRateSchedule, \
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
from ray.rllib.utils.tf_ops import explained_variance
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -253,10 +253,11 @@ def postprocess_trajectory(policy,
|
||||
|
||||
def choose_optimizer(policy, config):
|
||||
if policy.config["opt_type"] == "adam":
|
||||
return tf.train.AdamOptimizer(policy.cur_lr)
|
||||
return tf1.train.AdamOptimizer(policy.cur_lr)
|
||||
else:
|
||||
return tf.train.RMSPropOptimizer(policy.cur_lr, config["decay"],
|
||||
config["momentum"], config["epsilon"])
|
||||
return tf1.train.RMSPropOptimizer(
|
||||
policy.cur_lr,
|
||||
config["decay"], config["momentum"], config["epsilon"])
|
||||
|
||||
|
||||
def clip_gradients(policy, optimizer, loss):
|
||||
|
||||
@@ -9,7 +9,7 @@ from ray.rllib.agents.ppo.ppo_tf_policy import postprocess_ppo_gae, \
|
||||
vf_preds_fetches, clip_gradients, setup_config, ValueNetworkMixin
|
||||
from ray.rllib.utils.framework import get_activation_fn
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -33,7 +33,7 @@ def PPOLoss(dist_class,
|
||||
pi_new_logp = curr_dist.logp(actions)
|
||||
pi_old_logp = prev_dist.logp(actions)
|
||||
|
||||
logp_ratio = tf.exp(pi_new_logp - pi_old_logp)
|
||||
logp_ratio = tf.math.exp(pi_new_logp - pi_old_logp)
|
||||
if clip_loss:
|
||||
return tf.minimum(
|
||||
advantages * logp_ratio,
|
||||
@@ -49,10 +49,10 @@ def PPOLoss(dist_class,
|
||||
|
||||
def vf_loss(value_fn, value_targets, vf_preds, vf_clip_param=0.1):
|
||||
# GAE Value Function Loss
|
||||
vf_loss1 = tf.square(value_fn - value_targets)
|
||||
vf_loss1 = tf.math.square(value_fn - value_targets)
|
||||
vf_clipped = vf_preds + tf.clip_by_value(value_fn - vf_preds,
|
||||
-vf_clip_param, vf_clip_param)
|
||||
vf_loss2 = tf.square(vf_clipped - value_targets)
|
||||
vf_loss2 = tf.math.square(vf_clipped - value_targets)
|
||||
vf_loss = tf.maximum(vf_loss1, vf_loss2)
|
||||
return vf_loss
|
||||
|
||||
@@ -104,7 +104,7 @@ class WorkerLoss(object):
|
||||
vf_clip_param=vf_clip_param,
|
||||
vf_loss_coeff=vf_loss_coeff,
|
||||
clip_loss=clip_loss)
|
||||
self.loss = tf.Print(self.loss, ["Worker Adapt Loss", self.loss])
|
||||
self.loss = tf1.Print(self.loss, ["Worker Adapt Loss", self.loss])
|
||||
|
||||
|
||||
# This is the Meta-Update computation graph for main (meta-update step)
|
||||
@@ -230,7 +230,7 @@ class MAMLLoss(object):
|
||||
tf.multiply(self.cur_kl_coeff, mean_inner_kl))
|
||||
self.loss = tf.reduce_mean(tf.stack(ppo_obj,
|
||||
axis=0)) + self.inner_kl_loss
|
||||
self.loss = tf.Print(
|
||||
self.loss = tf1.Print(
|
||||
self.loss,
|
||||
["Meta-Loss", self.loss, "Inner KL", self.mean_inner_kl])
|
||||
|
||||
@@ -309,7 +309,7 @@ class MAMLLoss(object):
|
||||
def maml_loss(policy, model, dist_class, train_batch):
|
||||
logits, state = model.from_batch(train_batch)
|
||||
|
||||
policy._loss_input_dict["split"] = tf.placeholder(
|
||||
policy._loss_input_dict["split"] = tf1.placeholder(
|
||||
tf.int32,
|
||||
name="Meta-Update-Splitting",
|
||||
shape=(policy.config["inner_adaptation_steps"] + 1,
|
||||
@@ -333,8 +333,8 @@ def maml_loss(policy, model, dist_class, train_batch):
|
||||
vf_loss_coeff=policy.config["vf_loss_coeff"],
|
||||
clip_loss=False)
|
||||
else:
|
||||
policy.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
|
||||
tf.get_variable_scope().name)
|
||||
policy.var_list = tf1.get_collection(tf1.GraphKeys.TRAINABLE_VARIABLES,
|
||||
tf1.get_variable_scope().name)
|
||||
policy.loss_obj = MAMLLoss(
|
||||
model=model,
|
||||
dist_class=dist_class,
|
||||
@@ -380,8 +380,8 @@ class KLCoeffMixin:
|
||||
self.kl_coeff_val = [config["kl_coeff"]
|
||||
] * config["inner_adaptation_steps"]
|
||||
self.kl_target = self.config["kl_target"]
|
||||
self.kl_coeff = tf.get_variable(
|
||||
initializer=tf.constant_initializer(self.kl_coeff_val),
|
||||
self.kl_coeff = tf1.get_variable(
|
||||
initializer=tf.keras.initializers.Constant(self.kl_coeff_val),
|
||||
name="kl_coeff",
|
||||
shape=(config["inner_adaptation_steps"]),
|
||||
trainable=False,
|
||||
@@ -404,8 +404,8 @@ def maml_optimizer_fn(policy, config):
|
||||
Meta-Policy uses Adam optimizer for meta-update
|
||||
"""
|
||||
if not config["worker_index"]:
|
||||
return tf.train.AdamOptimizer(learning_rate=config["lr"])
|
||||
return tf.train.GradientDescentOptimizer(learning_rate=config["inner_lr"])
|
||||
return tf1.train.AdamOptimizer(learning_rate=config["lr"])
|
||||
return tf1.train.GradientDescentOptimizer(learning_rate=config["inner_lr"])
|
||||
|
||||
|
||||
def setup_mixins(policy, obs_space, action_space, config):
|
||||
|
||||
@@ -6,7 +6,7 @@ from ray.rllib.policy.tf_policy_template import build_tf_policy
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
from ray.rllib.utils.tf_ops import explained_variance, make_tf_callable
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
|
||||
class ValueNetworkMixin:
|
||||
@@ -37,13 +37,13 @@ class ReweightedImitationLoss:
|
||||
# advantage estimation
|
||||
adv = cumulative_rewards - state_values
|
||||
# update averaged advantage norm
|
||||
update_adv_norm = tf.assign_add(
|
||||
update_adv_norm = tf1.assign_add(
|
||||
ref=policy._ma_adv_norm,
|
||||
value=1e-6 * (
|
||||
tf.reduce_mean(tf.math.square(adv)) - policy._ma_adv_norm))
|
||||
|
||||
# exponentially weighted advantages
|
||||
with tf.control_dependencies([update_adv_norm]):
|
||||
with tf1.control_dependencies([update_adv_norm]):
|
||||
exp_advs = tf.math.exp(beta * tf.math.divide(
|
||||
adv, 1e-8 + tf.math.sqrt(policy._ma_adv_norm)))
|
||||
|
||||
@@ -125,7 +125,7 @@ def setup_mixins(policy, obs_space, action_space, config):
|
||||
ValueNetworkMixin.__init__(policy)
|
||||
# Set up a tf-var for the moving avg (do this here to make it work with
|
||||
# eager mode).
|
||||
policy._ma_adv_norm = tf.get_variable(
|
||||
policy._ma_adv_norm = tf1.get_variable(
|
||||
name="moving_average_of_advantage_norm",
|
||||
dtype=tf.float32,
|
||||
initializer=100.0,
|
||||
|
||||
@@ -6,7 +6,7 @@ from ray.rllib.utils.framework import try_import_tf
|
||||
from ray.rllib.utils.test_utils import check_compute_single_action, \
|
||||
framework_iterator
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
|
||||
class TestMARWIL(unittest.TestCase):
|
||||
|
||||
@@ -5,7 +5,7 @@ from ray.rllib.policy.tf_policy_template import build_tf_policy
|
||||
from ray.rllib.policy.sample_batch import SampleBatch
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
|
||||
def post_process_advantages(policy,
|
||||
|
||||
@@ -21,7 +21,7 @@ from ray.rllib.utils.annotations import override
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
from ray.rllib.utils.tf_ops import explained_variance, make_tf_callable
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
POLICY_SCOPE = "func"
|
||||
TARGET_POLICY_SCOPE = "target_func"
|
||||
@@ -65,7 +65,7 @@ class PPOSurrogateLoss:
|
||||
def reduce_mean_valid(t):
|
||||
return tf.reduce_mean(tf.boolean_mask(t, valid_mask))
|
||||
|
||||
logp_ratio = tf.exp(actions_logp - prev_actions_logp)
|
||||
logp_ratio = tf.math.exp(actions_logp - prev_actions_logp)
|
||||
|
||||
surrogate_loss = tf.minimum(
|
||||
advantages * logp_ratio,
|
||||
@@ -170,7 +170,7 @@ class VTraceSurrogateLoss:
|
||||
tf.float32))
|
||||
|
||||
self.is_ratio = tf.clip_by_value(
|
||||
tf.exp(prev_actions_logp - old_policy_actions_logp), 0.0, 2.0)
|
||||
tf.math.exp(prev_actions_logp - old_policy_actions_logp), 0.0, 2.0)
|
||||
logp_ratio = self.is_ratio * tf.exp(actions_logp - prev_actions_logp)
|
||||
|
||||
advantages = self.vtrace_returns.pg_advantages
|
||||
|
||||
@@ -7,9 +7,6 @@ from ray.rllib.execution.rollout_ops import ParallelRollouts, ConcatBatches, \
|
||||
StandardizeFields, SelectExperiences
|
||||
from ray.rllib.execution.train_ops import TrainOneStep, TrainTFMultiGPU
|
||||
from ray.rllib.execution.metric_ops import StandardMetricsReporting
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -10,7 +10,7 @@ from ray.rllib.policy.tf_policy_template import build_tf_policy
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
from ray.rllib.utils.tf_ops import explained_variance, make_tf_callable
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -174,7 +174,7 @@ def postprocess_ppo_gae(policy,
|
||||
else:
|
||||
next_state = []
|
||||
for i in range(policy.num_state_tensors()):
|
||||
next_state.append([sample_batch["state_out_{}".format(i)][-1]])
|
||||
next_state.append(sample_batch["state_out_{}".format(i)][-1])
|
||||
last_r = policy._value(sample_batch[SampleBatch.NEXT_OBS][-1],
|
||||
sample_batch[SampleBatch.ACTIONS][-1],
|
||||
sample_batch[SampleBatch.REWARDS][-1],
|
||||
@@ -206,7 +206,7 @@ class KLCoeffMixin:
|
||||
# KL Coefficient
|
||||
self.kl_coeff_val = config["kl_coeff"]
|
||||
self.kl_target = config["kl_target"]
|
||||
self.kl_coeff = tf.get_variable(
|
||||
self.kl_coeff = tf1.get_variable(
|
||||
initializer=tf.constant_initializer(self.kl_coeff_val),
|
||||
name="kl_coeff",
|
||||
shape=(),
|
||||
|
||||
@@ -194,7 +194,7 @@ class ValueNetworkMixin:
|
||||
SampleBatch.PREV_REWARDS: convert_to_torch_tensor(
|
||||
np.asarray([prev_reward])),
|
||||
"is_training": False,
|
||||
}, [convert_to_torch_tensor(np.asarray(s)) for s in state],
|
||||
}, [convert_to_torch_tensor(np.asarray([s])) for s in state],
|
||||
convert_to_torch_tensor(np.asarray([1])))
|
||||
return self.model.value_function()[0]
|
||||
|
||||
|
||||
@@ -2,12 +2,9 @@ import unittest
|
||||
|
||||
import ray
|
||||
import ray.rllib.agents.ppo as ppo
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
from ray.rllib.utils.test_utils import check_compute_single_action, \
|
||||
framework_iterator
|
||||
|
||||
tf = try_import_tf()
|
||||
|
||||
|
||||
class TestAPPO(unittest.TestCase):
|
||||
@classmethod
|
||||
|
||||
@@ -2,12 +2,9 @@ import unittest
|
||||
|
||||
import ray
|
||||
import ray.rllib.agents.ppo as ppo
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
from ray.rllib.utils.test_utils import check_compute_single_action, \
|
||||
framework_iterator
|
||||
|
||||
tf = try_import_tf()
|
||||
|
||||
|
||||
class TestDDPPO(unittest.TestCase):
|
||||
@classmethod
|
||||
|
||||
@@ -13,12 +13,10 @@ from ray.rllib.models.tf.tf_action_dist import Categorical
|
||||
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
|
||||
from ray.rllib.models.torch.torch_action_dist import TorchCategorical
|
||||
from ray.rllib.policy.sample_batch import SampleBatch
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
from ray.rllib.utils.numpy import fc
|
||||
from ray.rllib.utils.test_utils import check, framework_iterator, \
|
||||
check_compute_single_action
|
||||
|
||||
tf = try_import_tf()
|
||||
|
||||
# Fake CartPole episode of n time steps.
|
||||
FAKE_BATCH = {
|
||||
@@ -40,7 +38,7 @@ FAKE_BATCH = {
|
||||
class TestPPO(unittest.TestCase):
|
||||
@classmethod
|
||||
def setUpClass(cls):
|
||||
ray.init()
|
||||
ray.init(local_mode=True)
|
||||
|
||||
@classmethod
|
||||
def tearDownClass(cls):
|
||||
|
||||
@@ -4,7 +4,7 @@ import numpy as np
|
||||
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
|
||||
class SACTFModel(TFModelV2):
|
||||
|
||||
@@ -17,7 +17,7 @@ from ray.rllib.utils.error import UnsupportedSpaceException
|
||||
from ray.rllib.utils.framework import try_import_tf, try_import_tfp
|
||||
from ray.rllib.utils.tf_ops import minimize_and_clip
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
tfp = try_import_tfp()
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -138,10 +138,10 @@ def sac_actor_critic_loss(policy, model, _, train_batch):
|
||||
if model.discrete:
|
||||
# Get all action probs directly from pi and form their logp.
|
||||
log_pis_t = tf.nn.log_softmax(model.get_policy_output(model_out_t), -1)
|
||||
policy_t = tf.exp(log_pis_t)
|
||||
policy_t = tf.math.exp(log_pis_t)
|
||||
log_pis_tp1 = tf.nn.log_softmax(
|
||||
model.get_policy_output(model_out_tp1), -1)
|
||||
policy_tp1 = tf.exp(log_pis_tp1)
|
||||
policy_tp1 = tf.math.exp(log_pis_tp1)
|
||||
# Q-values.
|
||||
q_t = model.get_q_values(model_out_t)
|
||||
# Target Q-values.
|
||||
@@ -219,20 +219,20 @@ def sac_actor_critic_loss(policy, model, _, train_batch):
|
||||
policy.config["gamma"]**policy.config["n_step"] * q_tp1_best_masked)
|
||||
|
||||
# Compute the TD-error (potentially clipped).
|
||||
base_td_error = tf.abs(q_t_selected - q_t_selected_target)
|
||||
base_td_error = tf.math.abs(q_t_selected - q_t_selected_target)
|
||||
if policy.config["twin_q"]:
|
||||
twin_td_error = tf.abs(twin_q_t_selected - q_t_selected_target)
|
||||
twin_td_error = tf.math.abs(twin_q_t_selected - q_t_selected_target)
|
||||
td_error = 0.5 * (base_td_error + twin_td_error)
|
||||
else:
|
||||
td_error = base_td_error
|
||||
|
||||
critic_loss = [
|
||||
tf.losses.mean_squared_error(
|
||||
tf1.losses.mean_squared_error(
|
||||
labels=q_t_selected_target, predictions=q_t_selected, weights=0.5)
|
||||
]
|
||||
if policy.config["twin_q"]:
|
||||
critic_loss.append(
|
||||
tf.losses.mean_squared_error(
|
||||
tf1.losses.mean_squared_error(
|
||||
labels=q_t_selected_target,
|
||||
predictions=twin_q_t_selected,
|
||||
weights=0.5))
|
||||
@@ -274,7 +274,7 @@ def sac_actor_critic_loss(policy, model, _, train_batch):
|
||||
|
||||
# in a custom apply op we handle the losses separately, but return them
|
||||
# combined in one loss for now
|
||||
return actor_loss + tf.add_n(critic_loss) + alpha_loss
|
||||
return actor_loss + tf.math.add_n(critic_loss) + alpha_loss
|
||||
|
||||
|
||||
def gradients(policy, optimizer, loss):
|
||||
@@ -358,7 +358,7 @@ def apply_gradients(policy, optimizer, grads_and_vars):
|
||||
|
||||
alpha_apply_ops = policy._alpha_optimizer.apply_gradients(
|
||||
policy._alpha_grads_and_vars,
|
||||
global_step=tf.train.get_or_create_global_step())
|
||||
global_step=tf1.train.get_or_create_global_step())
|
||||
return tf.group([actor_apply_ops, alpha_apply_ops] + critic_apply_ops)
|
||||
|
||||
|
||||
@@ -381,20 +381,20 @@ def stats(policy, train_batch):
|
||||
class ActorCriticOptimizerMixin:
|
||||
def __init__(self, config):
|
||||
# create global step for counting the number of update operations
|
||||
self.global_step = tf.train.get_or_create_global_step()
|
||||
self.global_step = tf1.train.get_or_create_global_step()
|
||||
|
||||
# use separate optimizers for actor & critic
|
||||
self._actor_optimizer = tf.train.AdamOptimizer(
|
||||
self._actor_optimizer = tf1.train.AdamOptimizer(
|
||||
learning_rate=config["optimization"]["actor_learning_rate"])
|
||||
self._critic_optimizer = [
|
||||
tf.train.AdamOptimizer(
|
||||
tf1.train.AdamOptimizer(
|
||||
learning_rate=config["optimization"]["critic_learning_rate"])
|
||||
]
|
||||
if config["twin_q"]:
|
||||
self._critic_optimizer.append(
|
||||
tf.train.AdamOptimizer(learning_rate=config["optimization"][
|
||||
tf1.train.AdamOptimizer(learning_rate=config["optimization"][
|
||||
"critic_learning_rate"]))
|
||||
self._alpha_optimizer = tf.train.AdamOptimizer(
|
||||
self._alpha_optimizer = tf1.train.AdamOptimizer(
|
||||
learning_rate=config["optimization"]["entropy_learning_rate"])
|
||||
|
||||
|
||||
|
||||
@@ -11,13 +11,12 @@ from ray.rllib.models.tf.tf_action_dist import SquashedGaussian
|
||||
from ray.rllib.models.torch.torch_action_dist import TorchSquashedGaussian
|
||||
from ray.rllib.execution.replay_buffer import LocalReplayBuffer
|
||||
from ray.rllib.policy.sample_batch import SampleBatch
|
||||
from ray.rllib.utils.framework import try_import_tf, try_import_torch
|
||||
from ray.rllib.utils.framework import try_import_torch
|
||||
from ray.rllib.utils.numpy import fc, relu
|
||||
from ray.rllib.utils.test_utils import check, check_compute_single_action, \
|
||||
framework_iterator
|
||||
from ray.rllib.utils.torch_ops import convert_to_torch_tensor
|
||||
|
||||
tf = try_import_tf()
|
||||
torch, _ = try_import_torch()
|
||||
|
||||
|
||||
|
||||
@@ -35,7 +35,7 @@ from ray.tune.resources import Resources
|
||||
from ray.tune.logger import Logger, UnifiedLogger
|
||||
from ray.tune.result import DEFAULT_RESULTS_DIR
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -595,12 +595,12 @@ class Trainer(Trainable):
|
||||
self.config.pop("eager")
|
||||
|
||||
# Enable eager/tracing support.
|
||||
if tf and self.config["framework"] == "tfe":
|
||||
if not tf.executing_eagerly():
|
||||
tf.enable_eager_execution()
|
||||
if tf1 and self.config["framework"] == "tfe":
|
||||
if not tf1.executing_eagerly():
|
||||
tf1.enable_eager_execution()
|
||||
logger.info("Executing eagerly, with eager_tracing={}".format(
|
||||
self.config["eager_tracing"]))
|
||||
if tf and not tf.executing_eagerly() and \
|
||||
if tf1 and not tf1.executing_eagerly() and \
|
||||
self.config["framework"] != "torch":
|
||||
logger.info("Tip: set framework=tfe or the --eager flag to enable "
|
||||
"TensorFlow eager execution")
|
||||
@@ -634,8 +634,8 @@ class Trainer(Trainable):
|
||||
logging.getLogger("ray.rllib").setLevel(self.config["log_level"])
|
||||
|
||||
def get_scope():
|
||||
if tf and not tf.executing_eagerly():
|
||||
return tf.Graph().as_default()
|
||||
if tf1 and not tf1.executing_eagerly():
|
||||
return tf1.Graph().as_default()
|
||||
else:
|
||||
return open(os.devnull) # fake a no-op scope
|
||||
|
||||
|
||||
@@ -12,14 +12,13 @@ from ray.rllib.execution.metric_ops import StandardMetricsReporting
|
||||
from ray.rllib.models.catalog import ModelCatalog
|
||||
from ray.rllib.models.model import restore_original_dimensions
|
||||
from ray.rllib.models.torch.torch_action_dist import TorchCategorical
|
||||
from ray.rllib.utils.framework import try_import_tf, try_import_torch
|
||||
from ray.rllib.utils.framework import try_import_torch
|
||||
from ray.tune.registry import ENV_CREATOR, _global_registry
|
||||
|
||||
from ray.rllib.contrib.alpha_zero.core.alpha_zero_policy import AlphaZeroPolicy
|
||||
from ray.rllib.contrib.alpha_zero.core.mcts import MCTS
|
||||
from ray.rllib.contrib.alpha_zero.core.ranked_rewards import get_r2_env_wrapper
|
||||
|
||||
tf = try_import_tf()
|
||||
torch, nn = try_import_torch()
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -15,7 +15,7 @@ import numpy as np
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
tfp = try_import_tfp()
|
||||
|
||||
|
||||
@@ -49,7 +49,7 @@ class MADDPGTFPolicy(MADDPGPostprocessing, TFPolicy):
|
||||
# _____ Initial Configuration
|
||||
config = dict(ray.rllib.contrib.maddpg.DEFAULT_CONFIG, **config)
|
||||
self.config = config
|
||||
self.global_step = tf.train.get_or_create_global_step()
|
||||
self.global_step = tf1.train.get_or_create_global_step()
|
||||
|
||||
# FIXME: Get done from info is required since agentwise done is not
|
||||
# supported now.
|
||||
@@ -88,7 +88,7 @@ class MADDPGTFPolicy(MADDPGPostprocessing, TFPolicy):
|
||||
# Placeholders for policy evaluation and updates
|
||||
def _make_ph_n(space_n, name=""):
|
||||
return [
|
||||
tf.placeholder(
|
||||
tf1.placeholder(
|
||||
tf.float32,
|
||||
shape=(None, ) + space.shape,
|
||||
name=name + "_%d" % i) for i, space in enumerate(space_n)
|
||||
@@ -98,9 +98,9 @@ class MADDPGTFPolicy(MADDPGPostprocessing, TFPolicy):
|
||||
act_ph_n = _make_ph_n(act_space_n, "actions")
|
||||
new_obs_ph_n = _make_ph_n(obs_space_n, "new_obs")
|
||||
new_act_ph_n = _make_ph_n(act_space_n, "new_actions")
|
||||
rew_ph = tf.placeholder(
|
||||
rew_ph = tf1.placeholder(
|
||||
tf.float32, shape=None, name="rewards_{}".format(agent_id))
|
||||
done_ph = tf.placeholder(
|
||||
done_ph = tf1.placeholder(
|
||||
tf.float32, shape=None, name="dones_{}".format(agent_id))
|
||||
|
||||
if config["use_local_critic"]:
|
||||
@@ -190,12 +190,12 @@ class MADDPGTFPolicy(MADDPGPostprocessing, TFPolicy):
|
||||
|
||||
# _____ Optimizers
|
||||
self.optimizers = {
|
||||
"critic": tf.train.AdamOptimizer(config["critic_lr"]),
|
||||
"actor": tf.train.AdamOptimizer(config["actor_lr"])
|
||||
"critic": tf1.train.AdamOptimizer(config["critic_lr"]),
|
||||
"actor": tf1.train.AdamOptimizer(config["actor_lr"])
|
||||
}
|
||||
|
||||
# _____ Build variable update ops.
|
||||
self.tau = tf.placeholder_with_default(
|
||||
self.tau = tf1.placeholder_with_default(
|
||||
config["tau"], shape=(), name="tau")
|
||||
|
||||
def _make_target_update_op(vs, target_vs, tau):
|
||||
@@ -213,7 +213,7 @@ class MADDPGTFPolicy(MADDPGPostprocessing, TFPolicy):
|
||||
for v in variables.values():
|
||||
vs += v
|
||||
phs = [
|
||||
tf.placeholder(
|
||||
tf1.placeholder(
|
||||
tf.float32,
|
||||
shape=v.get_shape(),
|
||||
name=v.name.split(":")[0] + "_ph") for v in vs
|
||||
@@ -230,7 +230,7 @@ class MADDPGTFPolicy(MADDPGPostprocessing, TFPolicy):
|
||||
|
||||
# _____ TensorFlow Initialization
|
||||
|
||||
self.sess = tf.get_default_session()
|
||||
self.sess = tf1.get_default_session()
|
||||
|
||||
def _make_loss_inputs(placeholders):
|
||||
return [(ph.name.split("/")[-1].split(":")[0], ph)
|
||||
@@ -251,7 +251,7 @@ class MADDPGTFPolicy(MADDPGPostprocessing, TFPolicy):
|
||||
loss_inputs=loss_inputs,
|
||||
dist_inputs=actor_feature)
|
||||
|
||||
self.sess.run(tf.global_variables_initializer())
|
||||
self.sess.run(tf1.global_variables_initializer())
|
||||
|
||||
# Hard initial update
|
||||
self.update_target(1.0)
|
||||
@@ -280,8 +280,8 @@ class MADDPGTFPolicy(MADDPGPostprocessing, TFPolicy):
|
||||
critic_apply_op = self.optimizers["critic"].apply_gradients(
|
||||
self.gvs["critic"])
|
||||
|
||||
with tf.control_dependencies([tf.assign_add(self.global_step, 1)]):
|
||||
with tf.control_dependencies([critic_apply_op]):
|
||||
with tf1.control_dependencies([tf1.assign_add(self.global_step, 1)]):
|
||||
with tf1.control_dependencies([critic_apply_op]):
|
||||
actor_apply_op = self.optimizers["actor"].apply_gradients(
|
||||
self.gvs["actor"])
|
||||
|
||||
@@ -324,7 +324,7 @@ class MADDPGTFPolicy(MADDPGPostprocessing, TFPolicy):
|
||||
hiddens,
|
||||
activation=None,
|
||||
scope=None):
|
||||
with tf.variable_scope(scope, reuse=tf.AUTO_REUSE) as scope:
|
||||
with tf1.variable_scope(scope, reuse=tf1.AUTO_REUSE) as scope:
|
||||
if use_state_preprocessor:
|
||||
model_n = [
|
||||
ModelCatalog.get_model({
|
||||
@@ -341,11 +341,12 @@ class MADDPGTFPolicy(MADDPGPostprocessing, TFPolicy):
|
||||
out = tf.concat(obs_n + act_n, axis=1)
|
||||
|
||||
for hidden in hiddens:
|
||||
out = tf.layers.dense(out, units=hidden, activation=activation)
|
||||
out = tf1.layers.dense(
|
||||
out, units=hidden, activation=activation)
|
||||
feature = out
|
||||
out = tf.layers.dense(feature, units=1, activation=None)
|
||||
out = tf1.layers.dense(feature, units=1, activation=None)
|
||||
|
||||
return out, feature, model_n, tf.global_variables(scope.name)
|
||||
return out, feature, model_n, tf1.global_variables(scope.name)
|
||||
|
||||
def _build_actor_network(self,
|
||||
obs,
|
||||
@@ -355,7 +356,7 @@ class MADDPGTFPolicy(MADDPGPostprocessing, TFPolicy):
|
||||
hiddens,
|
||||
activation=None,
|
||||
scope=None):
|
||||
with tf.variable_scope(scope, reuse=tf.AUTO_REUSE) as scope:
|
||||
with tf1.variable_scope(scope, reuse=tf1.AUTO_REUSE) as scope:
|
||||
if use_state_preprocessor:
|
||||
model = ModelCatalog.get_model({
|
||||
"obs": obs,
|
||||
@@ -367,13 +368,14 @@ class MADDPGTFPolicy(MADDPGPostprocessing, TFPolicy):
|
||||
out = obs
|
||||
|
||||
for hidden in hiddens:
|
||||
out = tf.layers.dense(out, units=hidden, activation=activation)
|
||||
feature = tf.layers.dense(
|
||||
out = tf1.layers.dense(
|
||||
out, units=hidden, activation=activation)
|
||||
feature = tf1.layers.dense(
|
||||
out, units=act_space.shape[0], activation=None)
|
||||
sampler = tfp.distributions.RelaxedOneHotCategorical(
|
||||
temperature=1.0, logits=feature).sample()
|
||||
|
||||
return sampler, feature, model, tf.global_variables(scope.name)
|
||||
return sampler, feature, model, tf1.global_variables(scope.name)
|
||||
|
||||
def update_target(self, tau=None):
|
||||
if tau is not None:
|
||||
|
||||
@@ -50,7 +50,7 @@ if TYPE_CHECKING:
|
||||
# Generic type var for foreach_* methods.
|
||||
T = TypeVar("T")
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
torch, _ = try_import_torch()
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -283,12 +283,12 @@ class RolloutWorker(ParallelIteratorWorker):
|
||||
ParallelIteratorWorker.__init__(self, gen_rollouts, False)
|
||||
|
||||
policy_config: TrainerConfigDict = policy_config or {}
|
||||
if (tf and policy_config.get("framework") == "tfe"
|
||||
if (tf1 and policy_config.get("framework") == "tfe"
|
||||
and not policy_config.get("no_eager_on_workers")
|
||||
# This eager check is necessary for certain all-framework tests
|
||||
# that use tf's eager_mode() context generator.
|
||||
and not tf.executing_eagerly()):
|
||||
tf.enable_eager_execution()
|
||||
and not tf1.executing_eagerly()):
|
||||
tf1.enable_eager_execution()
|
||||
|
||||
if log_level:
|
||||
logging.getLogger("ray.rllib").setLevel(log_level)
|
||||
@@ -382,21 +382,21 @@ class RolloutWorker(ParallelIteratorWorker):
|
||||
torch.manual_seed(seed)
|
||||
except AssertionError:
|
||||
logger.info("Could not seed torch")
|
||||
if _has_tensorflow_graph(policy_dict) and not (tf and
|
||||
tf.executing_eagerly()):
|
||||
if not tf:
|
||||
if _has_tensorflow_graph(policy_dict) and not (
|
||||
tf1 and tf1.executing_eagerly()):
|
||||
if not tf1:
|
||||
raise ImportError("Could not import tensorflow")
|
||||
with tf.Graph().as_default():
|
||||
with tf1.Graph().as_default():
|
||||
if tf_session_creator:
|
||||
self.tf_sess = tf_session_creator()
|
||||
else:
|
||||
self.tf_sess = tf.Session(
|
||||
config=tf.ConfigProto(
|
||||
gpu_options=tf.GPUOptions(allow_growth=True)))
|
||||
self.tf_sess = tf1.Session(
|
||||
config=tf1.ConfigProto(
|
||||
gpu_options=tf1.GPUOptions(allow_growth=True)))
|
||||
with self.tf_sess.as_default():
|
||||
# set graph-level seed
|
||||
if seed is not None:
|
||||
tf.set_random_seed(seed)
|
||||
tf1.set_random_seed(seed)
|
||||
self.policy_map, self.preprocessors = \
|
||||
self._build_policy_map(policy_dict, policy_config)
|
||||
if (ray.is_initialized()
|
||||
@@ -406,7 +406,7 @@ class RolloutWorker(ParallelIteratorWorker):
|
||||
"Creating policy evaluation worker {}".format(
|
||||
worker_index) +
|
||||
" on CPU (please ignore any CUDA init errors)")
|
||||
elif not tf.test.is_gpu_available():
|
||||
elif not tf1.test.is_gpu_available():
|
||||
raise RuntimeError(
|
||||
"GPUs were assigned to this worker by Ray, but "
|
||||
"TensorFlow reports GPU acceleration is disabled. "
|
||||
@@ -956,7 +956,7 @@ class RolloutWorker(ParallelIteratorWorker):
|
||||
"Found raw Tuple|Dict space as input to policy. "
|
||||
"Please preprocess these observations with a "
|
||||
"Tuple|DictFlatteningPreprocessor.")
|
||||
if tf and tf.executing_eagerly():
|
||||
if tf1 and tf1.executing_eagerly():
|
||||
if hasattr(cls, "as_eager"):
|
||||
cls = cls.as_eager()
|
||||
if policy_config["eager_tracing"]:
|
||||
@@ -966,8 +966,8 @@ class RolloutWorker(ParallelIteratorWorker):
|
||||
else:
|
||||
raise ValueError("This policy does not support eager "
|
||||
"execution: {}".format(cls))
|
||||
if tf:
|
||||
with tf.variable_scope(name):
|
||||
if tf1:
|
||||
with tf1.variable_scope(name):
|
||||
policy_map[name] = cls(obs_space, act_space, merged_conf)
|
||||
else:
|
||||
policy_map[name] = cls(obs_space, act_space, merged_conf)
|
||||
|
||||
@@ -14,7 +14,7 @@ from ray.rllib.utils import merge_dicts
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
from ray.rllib.utils.types import PolicyID, TrainerConfigDict, EnvType
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -202,8 +202,8 @@ class WorkerSet:
|
||||
def session_creator():
|
||||
logger.debug("Creating TF session {}".format(
|
||||
config["tf_session_args"]))
|
||||
return tf.Session(
|
||||
config=tf.ConfigProto(**config["tf_session_args"]))
|
||||
return tf1.Session(
|
||||
config=tf1.ConfigProto(**config["tf_session_args"]))
|
||||
|
||||
if isinstance(config["input"], FunctionType):
|
||||
input_creator = config["input"]
|
||||
|
||||
@@ -11,7 +11,7 @@ from ray.rllib.utils.framework import try_import_tf
|
||||
from ray.rllib.utils.test_utils import check_learning_achieved
|
||||
from ray.tune import registry
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--run", type=str, default="PPO")
|
||||
|
||||
@@ -4,7 +4,7 @@ import numpy as np
|
||||
from rllib.models.tf.attention_net import TrXLNet
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
|
||||
def bit_shift_generator(seq_length, shift, batch_size):
|
||||
|
||||
@@ -10,7 +10,7 @@ from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
from ray.rllib.utils.test_utils import check_learning_achieved
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--run", type=str, default="PPO")
|
||||
|
||||
@@ -39,7 +39,7 @@ from ray.rllib.utils.test_utils import check_learning_achieved
|
||||
from ray.rllib.utils.tf_ops import explained_variance, make_tf_callable
|
||||
from ray.rllib.utils.torch_ops import convert_to_torch_tensor
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
torch, nn = try_import_torch()
|
||||
|
||||
OPPONENT_OBS = "opponent_obs"
|
||||
|
||||
@@ -23,7 +23,7 @@ from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC
|
||||
from ray.rllib.utils.framework import try_import_tf, try_import_torch
|
||||
from ray.rllib.utils.test_utils import check_learning_achieved
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
torch, nn = try_import_torch()
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
@@ -12,7 +12,7 @@ from ray.rllib.models.tf.tf_modelv2 import TFModelV2
|
||||
from ray.rllib.models.tf.visionnet import VisionNetwork as MyVisionNetwork
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--run", type=str, default="DQN") # Try PG, PPO, DQN
|
||||
|
||||
@@ -21,7 +21,7 @@ from ray.rllib.examples.models.custom_loss_model import CustomLossModel, \
|
||||
from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--torch", action="store_true")
|
||||
|
||||
@@ -7,7 +7,7 @@ from ray.rllib.evaluation.postprocessing import discount
|
||||
from ray.rllib.policy.tf_policy_template import build_tf_policy
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--stop-iters", type=int, default=200)
|
||||
|
||||
@@ -11,7 +11,7 @@ from ray.rllib.policy.tf_policy_template import build_tf_policy
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
from ray.rllib.utils.test_utils import check_learning_achieved
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--stop-iters", type=int, default=200)
|
||||
|
||||
@@ -6,7 +6,7 @@ import ray
|
||||
from ray.rllib.agents.registry import get_agent_class
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
ray.init(num_cpus=10)
|
||||
|
||||
@@ -25,14 +25,14 @@ def train_and_export(algo_name, num_steps, model_dir, ckpt_dir, prefix):
|
||||
|
||||
def restore_saved_model(export_dir):
|
||||
signature_key = \
|
||||
tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
|
||||
g = tf.Graph()
|
||||
tf1.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
|
||||
g = tf1.Graph()
|
||||
with g.as_default():
|
||||
with tf.Session(graph=g) as sess:
|
||||
with tf1.Session(graph=g) as sess:
|
||||
meta_graph_def = \
|
||||
tf.saved_model.load(sess,
|
||||
[tf.saved_model.tag_constants.SERVING],
|
||||
export_dir)
|
||||
tf1.saved_model.load(sess,
|
||||
[tf1.saved_model.tag_constants.SERVING],
|
||||
export_dir)
|
||||
print("Model restored!")
|
||||
print("Signature Def Information:")
|
||||
print(meta_graph_def.signature_def[signature_key])
|
||||
@@ -41,13 +41,13 @@ def restore_saved_model(export_dir):
|
||||
|
||||
|
||||
def restore_checkpoint(export_dir, prefix):
|
||||
sess = tf.Session()
|
||||
sess = tf1.Session()
|
||||
meta_file = "%s.meta" % prefix
|
||||
saver = tf.train.import_meta_graph(os.path.join(export_dir, meta_file))
|
||||
saver = tf1.train.import_meta_graph(os.path.join(export_dir, meta_file))
|
||||
saver.restore(sess, os.path.join(export_dir, prefix))
|
||||
print("Checkpoint restored!")
|
||||
print("Variables Information:")
|
||||
for v in tf.trainable_variables():
|
||||
for v in tf1.trainable_variables():
|
||||
value = sess.run(v)
|
||||
print(v.name, value)
|
||||
|
||||
|
||||
@@ -13,7 +13,7 @@ from ray.rllib.examples.models.mobilenet_v2_with_lstm_models import \
|
||||
from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
cnn_shape = (4, 4, 3)
|
||||
# The torch version of MobileNetV2 does channels first.
|
||||
|
||||
@@ -3,7 +3,7 @@ from ray.rllib.models.torch.torch_action_dist import TorchCategorical, \
|
||||
TorchDistributionWrapper
|
||||
from ray.rllib.utils.framework import try_import_tf, try_import_torch
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
torch, nn = try_import_torch()
|
||||
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@ from ray.rllib.models.torch.misc import SlimFC
|
||||
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
|
||||
from ray.rllib.utils.framework import try_import_tf, try_import_torch
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
torch, nn = try_import_torch()
|
||||
|
||||
|
||||
|
||||
@@ -9,7 +9,7 @@ from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
|
||||
from ray.rllib.utils.annotations import override
|
||||
from ray.rllib.utils.framework import try_import_tf, try_import_torch
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
torch, nn = try_import_torch()
|
||||
|
||||
|
||||
@@ -39,27 +39,27 @@ class BatchNormModel(TFModelV2):
|
||||
def forward(self, input_dict, state, seq_lens):
|
||||
last_layer = input_dict["obs"]
|
||||
hiddens = [256, 256]
|
||||
with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
|
||||
with tf1.variable_scope("model", reuse=tf1.AUTO_REUSE):
|
||||
for i, size in enumerate(hiddens):
|
||||
last_layer = tf.layers.dense(
|
||||
last_layer = tf1.layers.dense(
|
||||
last_layer,
|
||||
size,
|
||||
kernel_initializer=normc_initializer(1.0),
|
||||
activation=tf.nn.tanh,
|
||||
name="fc{}".format(i))
|
||||
# Add a batch norm layer
|
||||
last_layer = tf.layers.batch_normalization(
|
||||
last_layer = tf1.layers.batch_normalization(
|
||||
last_layer,
|
||||
training=input_dict["is_training"],
|
||||
name="bn_{}".format(i))
|
||||
|
||||
output = tf.layers.dense(
|
||||
output = tf1.layers.dense(
|
||||
last_layer,
|
||||
self.num_outputs,
|
||||
kernel_initializer=normc_initializer(0.01),
|
||||
activation=None,
|
||||
name="out")
|
||||
self._value_out = tf.layers.dense(
|
||||
self._value_out = tf1.layers.dense(
|
||||
last_layer,
|
||||
1,
|
||||
kernel_initializer=normc_initializer(1.0),
|
||||
@@ -67,8 +67,8 @@ class BatchNormModel(TFModelV2):
|
||||
name="vf")
|
||||
if not self._registered:
|
||||
self.register_variables(
|
||||
tf.get_collection(
|
||||
tf.GraphKeys.TRAINABLE_VARIABLES, scope=".+/model/.+"))
|
||||
tf1.get_collection(
|
||||
tf1.GraphKeys.TRAINABLE_VARIABLES, scope=".+/model/.+"))
|
||||
self._registered = True
|
||||
|
||||
return output, []
|
||||
|
||||
@@ -9,7 +9,7 @@ from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC
|
||||
from ray.rllib.utils.annotations import override
|
||||
from ray.rllib.utils.framework import try_import_tf, try_import_torch
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
torch, nn = try_import_torch()
|
||||
|
||||
|
||||
|
||||
@@ -10,7 +10,7 @@ from ray.rllib.utils.annotations import override
|
||||
from ray.rllib.utils.framework import try_import_tf, try_import_torch
|
||||
from ray.rllib.offline import JsonReader
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
torch, nn = try_import_torch()
|
||||
|
||||
|
||||
@@ -73,7 +73,7 @@ class DeprecatedCustomLossModelV1(Model):
|
||||
|
||||
def _build_layers_v2(self, input_dict, num_outputs, options):
|
||||
self.obs_in = input_dict["obs"]
|
||||
with tf.variable_scope("shared", reuse=tf.AUTO_REUSE):
|
||||
with tf1.variable_scope("shared", reuse=tf1.AUTO_REUSE):
|
||||
self.fcnet = FullyConnectedNetwork(input_dict, self.obs_space,
|
||||
self.action_space, num_outputs,
|
||||
options)
|
||||
|
||||
@@ -6,7 +6,7 @@ from ray.rllib.models.tf.tf_modelv2 import TFModelV2
|
||||
from ray.rllib.utils.annotations import override
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
|
||||
class EagerModel(TFModelV2):
|
||||
@@ -34,7 +34,7 @@ class EagerModel(TFModelV2):
|
||||
|
||||
def lambda_(x):
|
||||
eager_out = tf.py_function(self.forward_eager, [x], tf.float32)
|
||||
with tf.control_dependencies([eager_out]):
|
||||
with tf1.control_dependencies([eager_out]):
|
||||
eager_out.set_shape(x.shape)
|
||||
return eager_out
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@ from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
|
||||
from ray.rllib.utils.annotations import override
|
||||
from ray.rllib.utils.framework import try_import_tf, try_import_torch
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
torch, nn = try_import_torch()
|
||||
|
||||
|
||||
@@ -25,11 +25,11 @@ class FastModel(TFModelV2):
|
||||
|
||||
@override(ModelV2)
|
||||
def forward(self, input_dict, state, seq_lens):
|
||||
with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
|
||||
bias = tf.get_variable(
|
||||
with tf1.variable_scope("model", reuse=tf1.AUTO_REUSE):
|
||||
bias = tf1.get_variable(
|
||||
dtype=tf.float32,
|
||||
name="bias",
|
||||
initializer=tf.zeros_initializer,
|
||||
initializer=tf.keras.initializers.Zeros(),
|
||||
shape=())
|
||||
output = bias + \
|
||||
tf.zeros([tf.shape(input_dict["obs"])[0], self.num_outputs])
|
||||
@@ -37,8 +37,8 @@ class FastModel(TFModelV2):
|
||||
|
||||
if not self._registered:
|
||||
self.register_variables(
|
||||
tf.get_collection(
|
||||
tf.GraphKeys.TRAINABLE_VARIABLES, scope=".+/model/.+"))
|
||||
tf1.get_collection(
|
||||
tf1.GraphKeys.TRAINABLE_VARIABLES, scope=".+/model/.+"))
|
||||
self._registered = True
|
||||
|
||||
return output, []
|
||||
|
||||
@@ -7,7 +7,7 @@ from ray.rllib.models.torch.recurrent_net import RecurrentNetwork as TorchRNN
|
||||
from ray.rllib.utils.annotations import override
|
||||
from ray.rllib.utils.framework import try_import_tf, try_import_torch
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
torch, nn = try_import_torch()
|
||||
|
||||
|
||||
|
||||
@@ -9,7 +9,7 @@ from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC
|
||||
from ray.rllib.utils.framework import try_import_tf, try_import_torch
|
||||
from ray.rllib.utils.numpy import LARGE_INTEGER
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
torch, nn = try_import_torch()
|
||||
|
||||
|
||||
|
||||
@@ -7,7 +7,7 @@ from ray.rllib.models.torch.recurrent_net import RecurrentNetwork as TorchRNN
|
||||
from ray.rllib.utils.annotations import override
|
||||
from ray.rllib.utils.framework import try_import_tf, try_import_torch
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
torch, nn = try_import_torch()
|
||||
|
||||
|
||||
|
||||
@@ -8,13 +8,15 @@ from ray.rllib.models.tf.recurrent_net import RecurrentNetwork
|
||||
from ray.rllib.utils.annotations import override
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
|
||||
class SpyLayer(tf.keras.layers.Layer):
|
||||
"""A keras Layer, which intercepts its inputs and stored them as pickled.
|
||||
"""
|
||||
|
||||
output = np.array(0, dtype=np.int64)
|
||||
|
||||
def __init__(self, num_outputs, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
@@ -26,7 +28,7 @@ class SpyLayer(tf.keras.layers.Layer):
|
||||
"""
|
||||
|
||||
del kwargs
|
||||
spy_fn = tf.py_func(
|
||||
spy_fn = tf1.py_func(
|
||||
self.spy,
|
||||
[
|
||||
inputs[0], # observations
|
||||
@@ -36,11 +38,11 @@ class SpyLayer(tf.keras.layers.Layer):
|
||||
inputs[5], # h_out
|
||||
inputs[6], # c_out
|
||||
],
|
||||
tf.int64,
|
||||
tf.int64, # Must match SpyLayer.output's type.
|
||||
stateful=True)
|
||||
|
||||
# Compute outputs
|
||||
with tf.control_dependencies([spy_fn]):
|
||||
with tf1.control_dependencies([spy_fn]):
|
||||
return self.dense(inputs[1])
|
||||
|
||||
@staticmethod
|
||||
@@ -48,7 +50,8 @@ class SpyLayer(tf.keras.layers.Layer):
|
||||
"""The actual spy operation: Store inputs in internal_kv."""
|
||||
|
||||
if len(inputs) == 1:
|
||||
return 0 # don't capture inference inputs
|
||||
# don't capture inference inputs
|
||||
return SpyLayer.output
|
||||
# TF runs this function in an isolated context, so we have to use
|
||||
# redis to communicate back to our suite
|
||||
ray.experimental.internal_kv._internal_kv_put(
|
||||
@@ -61,7 +64,7 @@ class SpyLayer(tf.keras.layers.Layer):
|
||||
}),
|
||||
overwrite=True)
|
||||
RNNSpyModel.capture_index += 1
|
||||
return 0
|
||||
return SpyLayer.output
|
||||
|
||||
|
||||
class RNNSpyModel(RecurrentNetwork):
|
||||
|
||||
@@ -7,7 +7,7 @@ from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
|
||||
from ray.rllib.utils.annotations import override
|
||||
from ray.rllib.utils.framework import try_import_tf, try_import_torch
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
torch, nn = try_import_torch()
|
||||
|
||||
|
||||
@@ -15,7 +15,7 @@ class SharedWeightsModel1(TFModelV2):
|
||||
"""Example of weight sharing between two different TFModelV2s.
|
||||
|
||||
Here, we share the variables defined in the 'shared' variable scope
|
||||
by entering it explicitly with tf.AUTO_REUSE. This creates the
|
||||
by entering it explicitly with tf1.AUTO_REUSE. This creates the
|
||||
variables for the 'fc1' layer in a global scope called 'shared'
|
||||
(outside of the Policy's normal variable scope).
|
||||
"""
|
||||
@@ -26,9 +26,9 @@ class SharedWeightsModel1(TFModelV2):
|
||||
model_config, name)
|
||||
|
||||
inputs = tf.keras.layers.Input(observation_space.shape)
|
||||
with tf.variable_scope(
|
||||
tf.VariableScope(tf.AUTO_REUSE, "shared"),
|
||||
reuse=tf.AUTO_REUSE,
|
||||
with tf1.variable_scope(
|
||||
tf1.VariableScope(tf1.AUTO_REUSE, "shared"),
|
||||
reuse=tf1.AUTO_REUSE,
|
||||
auxiliary_name_scope=False):
|
||||
last_layer = tf.keras.layers.Dense(
|
||||
units=64, activation=tf.nn.relu, name="fc1")(inputs)
|
||||
@@ -60,9 +60,9 @@ class SharedWeightsModel2(TFModelV2):
|
||||
inputs = tf.keras.layers.Input(observation_space.shape)
|
||||
|
||||
# Weights shared with SharedWeightsModel1.
|
||||
with tf.variable_scope(
|
||||
tf.VariableScope(tf.AUTO_REUSE, "shared"),
|
||||
reuse=tf.AUTO_REUSE,
|
||||
with tf1.variable_scope(
|
||||
tf1.VariableScope(tf1.AUTO_REUSE, "shared"),
|
||||
reuse=tf1.AUTO_REUSE,
|
||||
auxiliary_name_scope=False):
|
||||
last_layer = tf.keras.layers.Dense(
|
||||
units=64, activation=tf.nn.relu, name="fc1")(inputs)
|
||||
|
||||
@@ -4,7 +4,7 @@ from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
|
||||
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFCNet
|
||||
from ray.rllib.utils.framework import try_import_tf, try_import_torch
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
torch, nn = try_import_torch()
|
||||
|
||||
|
||||
|
||||
@@ -22,7 +22,7 @@ from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
from ray.rllib.utils.test_utils import check_learning_achieved
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
|
||||
@@ -20,7 +20,7 @@ from ray.rllib.examples.policy.rock_paper_scissors_dummies import \
|
||||
from ray.rllib.utils.framework import try_import_tf, try_import_torch
|
||||
from ray.rllib.utils.test_utils import check_learning_achieved
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
torch, _ = try_import_torch()
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
@@ -5,7 +5,7 @@ from ray.util.debug import log_once
|
||||
from ray.rllib.utils.debug import summarize
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
# Variable scope in which created variables will be placed under
|
||||
TOWER_SCOPE_NAME = "tower"
|
||||
@@ -26,7 +26,7 @@ class LocalSyncParallelOptimizer:
|
||||
`load_data`, so you can make multiple passes (possibly in randomized order)
|
||||
over the same data once loaded.
|
||||
|
||||
This is similar to tf.train.SyncReplicasOptimizer, but works within a
|
||||
This is similar to tf1.train.SyncReplicasOptimizer, but works within a
|
||||
single TensorFlow graph, i.e. implements in-graph replicated training:
|
||||
|
||||
https://www.tensorflow.org/api_docs/python/tf/train/SyncReplicasOptimizer
|
||||
@@ -63,21 +63,21 @@ class LocalSyncParallelOptimizer:
|
||||
self.build_graph = build_graph
|
||||
|
||||
# First initialize the shared loss network
|
||||
with tf.name_scope(TOWER_SCOPE_NAME):
|
||||
with tf1.name_scope(TOWER_SCOPE_NAME):
|
||||
self._shared_loss = build_graph(self.loss_inputs)
|
||||
shared_ops = tf.get_collection(
|
||||
tf.GraphKeys.UPDATE_OPS, scope=tf.get_variable_scope().name)
|
||||
shared_ops = tf1.get_collection(
|
||||
tf1.GraphKeys.UPDATE_OPS, scope=tf1.get_variable_scope().name)
|
||||
|
||||
# Then setup the per-device loss graphs that use the shared weights
|
||||
self._batch_index = tf.placeholder(tf.int32, name="batch_index")
|
||||
self._batch_index = tf1.placeholder(tf.int32, name="batch_index")
|
||||
|
||||
# Dynamic batch size, which may be shrunk if there isn't enough data
|
||||
self._per_device_batch_size = tf.placeholder(
|
||||
self._per_device_batch_size = tf1.placeholder(
|
||||
tf.int32, name="per_device_batch_size")
|
||||
self._loaded_per_device_batch_size = max_per_device_batch_size
|
||||
|
||||
# When loading RNN input, we dynamically determine the max seq len
|
||||
self._max_seq_len = tf.placeholder(tf.int32, name="max_seq_len")
|
||||
self._max_seq_len = tf1.placeholder(tf.int32, name="max_seq_len")
|
||||
self._loaded_max_seq_len = 1
|
||||
|
||||
# Split on the CPU in case the data doesn't fit in GPU memory.
|
||||
@@ -103,15 +103,15 @@ class LocalSyncParallelOptimizer:
|
||||
# gather update ops for any batch norm layers. TODO(ekl) here we will
|
||||
# use all the ops found which won't work for DQN / DDPG, but those
|
||||
# aren't supported with multi-gpu right now anyways.
|
||||
self._update_ops = tf.get_collection(
|
||||
tf.GraphKeys.UPDATE_OPS, scope=tf.get_variable_scope().name)
|
||||
self._update_ops = tf1.get_collection(
|
||||
tf1.GraphKeys.UPDATE_OPS, scope=tf1.get_variable_scope().name)
|
||||
for op in shared_ops:
|
||||
self._update_ops.remove(op) # only care about tower update ops
|
||||
if self._update_ops:
|
||||
logger.debug("Update ops to run on apply gradient: {}".format(
|
||||
self._update_ops))
|
||||
|
||||
with tf.control_dependencies(self._update_ops):
|
||||
with tf1.control_dependencies(self._update_ops):
|
||||
self._train_op = self.optimizer.apply_gradients(avg)
|
||||
|
||||
def load_data(self, sess, inputs, state_inputs):
|
||||
@@ -265,11 +265,11 @@ class LocalSyncParallelOptimizer:
|
||||
def _setup_device(self, device, device_input_placeholders, num_data_in):
|
||||
assert num_data_in <= len(device_input_placeholders)
|
||||
with tf.device(device):
|
||||
with tf.name_scope(TOWER_SCOPE_NAME):
|
||||
with tf1.name_scope(TOWER_SCOPE_NAME):
|
||||
device_input_batches = []
|
||||
device_input_slices = []
|
||||
for i, ph in enumerate(device_input_placeholders):
|
||||
current_batch = tf.Variable(
|
||||
current_batch = tf1.Variable(
|
||||
ph,
|
||||
trainable=False,
|
||||
validate_shape=False,
|
||||
|
||||
@@ -13,7 +13,7 @@ from ray.rllib.utils.annotations import override
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
from ray.rllib.utils.timer import TimerStat
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -84,14 +84,15 @@ class TFMultiGPULearner(LearnerThread):
|
||||
self.par_opt = []
|
||||
with self.local_worker.tf_sess.graph.as_default():
|
||||
with self.local_worker.tf_sess.as_default():
|
||||
with tf.variable_scope(DEFAULT_POLICY_ID, reuse=tf.AUTO_REUSE):
|
||||
with tf1.variable_scope(
|
||||
DEFAULT_POLICY_ID, reuse=tf1.AUTO_REUSE):
|
||||
if self.policy._state_inputs:
|
||||
rnn_inputs = self.policy._state_inputs + [
|
||||
self.policy._seq_lens
|
||||
]
|
||||
else:
|
||||
rnn_inputs = []
|
||||
adam = tf.train.AdamOptimizer(self.lr)
|
||||
adam = tf1.train.AdamOptimizer(self.lr)
|
||||
for _ in range(num_data_loader_buffers):
|
||||
self.par_opt.append(
|
||||
LocalSyncParallelOptimizer(
|
||||
@@ -103,7 +104,7 @@ class TFMultiGPULearner(LearnerThread):
|
||||
self.policy.copy))
|
||||
|
||||
self.sess = self.local_worker.tf_sess
|
||||
self.sess.run(tf.global_variables_initializer())
|
||||
self.sess.run(tf1.global_variables_initializer())
|
||||
|
||||
self.idle_optimizers = queue.Queue()
|
||||
self.ready_optimizers = queue.Queue()
|
||||
|
||||
@@ -20,7 +20,7 @@ from ray.rllib.utils.framework import try_import_tf
|
||||
from ray.rllib.utils.sgd import do_minibatch_sgd, averaged
|
||||
from ray.rllib.utils.types import PolicyID, SampleBatchType
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -137,7 +137,7 @@ class TrainTFMultiGPU:
|
||||
with self.workers.local_worker().tf_sess.as_default():
|
||||
for policy_id in self.policies:
|
||||
policy = self.workers.local_worker().get_policy(policy_id)
|
||||
with tf.variable_scope(policy_id, reuse=tf.AUTO_REUSE):
|
||||
with tf1.variable_scope(policy_id, reuse=tf1.AUTO_REUSE):
|
||||
if policy._state_inputs:
|
||||
rnn_inputs = policy._state_inputs + [
|
||||
policy._seq_lens
|
||||
@@ -152,7 +152,7 @@ class TrainTFMultiGPU:
|
||||
self.per_device_batch_size, policy.copy))
|
||||
|
||||
self.sess = self.workers.local_worker().tf_sess
|
||||
self.sess.run(tf.global_variables_initializer())
|
||||
self.sess.run(tf1.global_variables_initializer())
|
||||
|
||||
def __call__(self,
|
||||
samples: SampleBatchType) -> (SampleBatchType, List[dict]):
|
||||
|
||||
@@ -27,7 +27,7 @@ from ray.rllib.utils.framework import try_import_tf
|
||||
from ray.rllib.utils.spaces.simplex import Simplex
|
||||
from ray.rllib.utils.spaces.space_utils import flatten_space
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
tree = try_import_tree()
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -257,7 +257,7 @@ class ModelCatalog:
|
||||
|
||||
dtype, shape = ModelCatalog.get_action_shape(action_space)
|
||||
|
||||
return tf.placeholder(dtype, shape=shape, name=name)
|
||||
return tf1.placeholder(dtype, shape=shape, name=name)
|
||||
|
||||
@staticmethod
|
||||
@DeveloperAPI
|
||||
|
||||
@@ -8,7 +8,7 @@ from ray.rllib.utils.annotations import PublicAPI, DeveloperAPI
|
||||
from ray.rllib.utils.deprecation import deprecation_warning
|
||||
from ray.rllib.utils.framework import try_import_tf, try_import_torch
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
torch, _ = try_import_torch()
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -38,13 +38,13 @@ class Model:
|
||||
self.action_space = action_space
|
||||
self.num_outputs = num_outputs
|
||||
self.options = options
|
||||
self.scope = tf.get_variable_scope()
|
||||
self.session = tf.get_default_session()
|
||||
self.scope = tf1.get_variable_scope()
|
||||
self.session = tf1.get_default_session()
|
||||
self.input_dict = input_dict
|
||||
if seq_lens is not None:
|
||||
self.seq_lens = seq_lens
|
||||
else:
|
||||
self.seq_lens = tf.placeholder(
|
||||
self.seq_lens = tf1.placeholder(
|
||||
dtype=tf.int32, shape=[None], name="seq_lens")
|
||||
|
||||
self._num_outputs = num_outputs
|
||||
@@ -68,10 +68,10 @@ class Model:
|
||||
input_dict["obs"], num_outputs, options)
|
||||
|
||||
if options.get("free_log_std", False):
|
||||
log_std = tf.get_variable(
|
||||
log_std = tf1.get_variable(
|
||||
name="log_std",
|
||||
shape=[num_outputs],
|
||||
initializer=tf.zeros_initializer)
|
||||
initializer=tf1.zeros_initializer)
|
||||
self.outputs = tf.concat(
|
||||
[self.outputs, 0.0 * self.outputs + log_std], 1)
|
||||
|
||||
@@ -196,7 +196,7 @@ class Model:
|
||||
def flatten(obs, framework):
|
||||
"""Flatten the given tensor."""
|
||||
if framework == "tf":
|
||||
return tf.layers.flatten(obs)
|
||||
return tf1.layers.flatten(obs)
|
||||
elif framework == "torch":
|
||||
assert torch is not None
|
||||
return torch.flatten(obs, start_dim=1)
|
||||
|
||||
@@ -13,7 +13,7 @@ from ray.rllib.utils.framework import try_import_tf, try_import_torch, \
|
||||
from ray.rllib.utils.spaces.repeated import Repeated
|
||||
from ray.rllib.utils.types import ModelConfigDict
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
torch, _ = try_import_torch()
|
||||
|
||||
|
||||
@@ -339,7 +339,7 @@ class NullContextManager:
|
||||
def flatten(obs, framework):
|
||||
"""Flatten the given tensor."""
|
||||
if framework == "tf":
|
||||
return tf.layers.flatten(obs)
|
||||
return tf1.layers.flatten(obs)
|
||||
elif framework == "torch":
|
||||
assert torch is not None
|
||||
return torch.flatten(obs, start_dim=1)
|
||||
|
||||
+3
-3
@@ -13,7 +13,7 @@ from ray.rllib.utils.framework import try_import_torch, try_import_tf
|
||||
from ray.rllib.utils.test_utils import framework_iterator
|
||||
|
||||
torch, nn = try_import_torch()
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
|
||||
class TestModules(unittest.TestCase):
|
||||
@@ -144,7 +144,7 @@ class TestModules(unittest.TestCase):
|
||||
model = TorchMultiHeadAttention(
|
||||
in_dim=D_in, out_dim=D_out, num_heads=2, head_dim=32)
|
||||
|
||||
self.train_torch_layer(model, x, y)
|
||||
self.train_torch_layer(model, x, y, num_epochs=500)
|
||||
|
||||
else: # framework is tensorflow or tensorflow-eager
|
||||
|
||||
@@ -165,7 +165,7 @@ class TestModules(unittest.TestCase):
|
||||
that it trains in a supervised setting."""
|
||||
|
||||
# Checks that torch and tf embedding matrices are the same
|
||||
with tf.Session().as_default() as sess:
|
||||
with tf1.Session().as_default() as sess:
|
||||
assert np.allclose(
|
||||
relative_position_embedding(20, 15).eval(session=sess),
|
||||
relative_position_embedding_torch(20, 15).numpy())
|
||||
@@ -16,7 +16,7 @@ from ray.rllib.utils.numpy import MIN_LOG_NN_OUTPUT, MAX_LOG_NN_OUTPUT, \
|
||||
softmax, SMALL_NUMBER, LARGE_INTEGER
|
||||
from ray.rllib.utils.test_utils import check, framework_iterator
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
torch, _ = try_import_torch()
|
||||
tree = try_import_tree()
|
||||
|
||||
@@ -75,13 +75,13 @@ class TestDistributions(unittest.TestCase):
|
||||
def test_categorical(self):
|
||||
"""Tests the Categorical ActionDistribution (tf only)."""
|
||||
num_samples = 100000
|
||||
logits = tf.placeholder(tf.float32, shape=(None, 10))
|
||||
logits = tf1.placeholder(tf.float32, shape=(None, 10))
|
||||
z = 8 * (np.random.rand(10) - 0.5)
|
||||
data = np.tile(z, (num_samples, 1))
|
||||
c = Categorical(logits, {}) # dummy config dict
|
||||
sample_op = c.sample()
|
||||
sess = tf.Session()
|
||||
sess.run(tf.global_variables_initializer())
|
||||
sess = tf1.Session()
|
||||
sess.run(tf1.global_variables_initializer())
|
||||
samples = sess.run(sample_op, feed_dict={logits: data})
|
||||
counts = np.zeros(10)
|
||||
for sample in samples:
|
||||
|
||||
@@ -17,7 +17,7 @@ from ray.rllib.models.tf.recurrent_net import RecurrentNetwork
|
||||
from ray.rllib.utils.annotations import override
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
|
||||
# TODO(sven): Use RLlib's FCNet instead.
|
||||
|
||||
@@ -4,7 +4,7 @@ from ray.rllib.models.tf.misc import normc_initializer
|
||||
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
|
||||
from ray.rllib.utils.framework import get_activation_fn, try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
|
||||
class FullyConnectedNetwork(TFModelV2):
|
||||
|
||||
@@ -4,7 +4,7 @@ from ray.rllib.utils.annotations import override
|
||||
from ray.rllib.utils.deprecation import deprecation_warning
|
||||
from ray.rllib.utils.framework import get_activation_fn, try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
|
||||
# Deprecated: see as an alternative models/tf.fcnet.py
|
||||
@@ -29,15 +29,15 @@ class FullyConnectedNetwork(Model):
|
||||
activation = get_activation_fn(options.get("fcnet_activation"))
|
||||
|
||||
if len(inputs.shape) > 2:
|
||||
inputs = tf.layers.flatten(inputs)
|
||||
inputs = tf1.layers.flatten(inputs)
|
||||
|
||||
with tf.name_scope("fc_net"):
|
||||
with tf1.name_scope("fc_net"):
|
||||
i = 1
|
||||
last_layer = inputs
|
||||
for size in hiddens:
|
||||
# skip final linear layer
|
||||
if options.get("no_final_linear") and i == len(hiddens):
|
||||
output = tf.layers.dense(
|
||||
output = tf1.layers.dense(
|
||||
last_layer,
|
||||
num_outputs,
|
||||
kernel_initializer=normc_initializer(1.0),
|
||||
@@ -46,7 +46,7 @@ class FullyConnectedNetwork(Model):
|
||||
return output, output
|
||||
|
||||
label = "fc{}".format(i)
|
||||
last_layer = tf.layers.dense(
|
||||
last_layer = tf1.layers.dense(
|
||||
last_layer,
|
||||
size,
|
||||
kernel_initializer=normc_initializer(1.0),
|
||||
@@ -54,7 +54,7 @@ class FullyConnectedNetwork(Model):
|
||||
name=label)
|
||||
i += 1
|
||||
|
||||
output = tf.layers.dense(
|
||||
output = tf1.layers.dense(
|
||||
last_layer,
|
||||
num_outputs,
|
||||
kernel_initializer=normc_initializer(0.01),
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
|
||||
class GRUGate(tf.keras.layers.Layer):
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
"""
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
|
||||
class MultiHeadAttention(tf.keras.layers.Layer):
|
||||
|
||||
@@ -3,7 +3,7 @@ import numpy as np
|
||||
from ray.rllib.utils.framework import get_activation_fn, get_variable, \
|
||||
try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
|
||||
class NoisyLayer(tf.keras.layers.Layer):
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
|
||||
class RelativeMultiHeadAttention(tf.keras.layers.Layer):
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
|
||||
class SkipConnection(tf.keras.layers.Layer):
|
||||
|
||||
@@ -7,7 +7,7 @@ from ray.rllib.utils.annotations import override
|
||||
from ray.rllib.utils.deprecation import deprecation_warning
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
|
||||
# Deprecated: see as an alternative models/tf/recurrent_net.py
|
||||
@@ -45,7 +45,7 @@ class LSTM(Model):
|
||||
last_layer = add_time_dimension(features, self.seq_lens)
|
||||
|
||||
# Setup the LSTM cell
|
||||
lstm = tf.nn.rnn_cell.LSTMCell(cell_size, state_is_tuple=True)
|
||||
lstm = tf1.nn.rnn_cell.LSTMCell(cell_size, state_is_tuple=True)
|
||||
self.state_init = [
|
||||
np.zeros(lstm.state_size.c, np.float32),
|
||||
np.zeros(lstm.state_size.h, np.float32)
|
||||
@@ -55,15 +55,15 @@ class LSTM(Model):
|
||||
if self.state_in:
|
||||
c_in, h_in = self.state_in
|
||||
else:
|
||||
c_in = tf.placeholder(
|
||||
c_in = tf1.placeholder(
|
||||
tf.float32, [None, lstm.state_size.c], name="c")
|
||||
h_in = tf.placeholder(
|
||||
h_in = tf1.placeholder(
|
||||
tf.float32, [None, lstm.state_size.h], name="h")
|
||||
self.state_in = [c_in, h_in]
|
||||
|
||||
# Setup LSTM outputs
|
||||
state_in = tf.nn.rnn_cell.LSTMStateTuple(c_in, h_in)
|
||||
lstm_out, lstm_state = tf.nn.dynamic_rnn(
|
||||
state_in = tf1.nn.rnn_cell.LSTMStateTuple(c_in, h_in)
|
||||
lstm_out, lstm_state = tf1.nn.dynamic_rnn(
|
||||
lstm,
|
||||
last_layer,
|
||||
initial_state=state_in,
|
||||
|
||||
+10
-10
@@ -1,7 +1,7 @@
|
||||
import numpy as np
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
|
||||
def normc_initializer(std=1.0):
|
||||
@@ -24,7 +24,7 @@ def conv2d(x,
|
||||
if dtype is None:
|
||||
dtype = tf.float32
|
||||
|
||||
with tf.variable_scope(name):
|
||||
with tf1.variable_scope(name):
|
||||
stride_shape = [1, stride[0], stride[1], 1]
|
||||
filter_shape = [
|
||||
filter_size[0], filter_size[1],
|
||||
@@ -40,24 +40,24 @@ def conv2d(x,
|
||||
# Initialize weights with random weights.
|
||||
w_bound = np.sqrt(6 / (fan_in + fan_out))
|
||||
|
||||
w = tf.get_variable(
|
||||
w = tf1.get_variable(
|
||||
"W",
|
||||
filter_shape,
|
||||
dtype,
|
||||
tf.random_uniform_initializer(-w_bound, w_bound),
|
||||
tf1.random_uniform_initializer(-w_bound, w_bound),
|
||||
collections=collections)
|
||||
b = tf.get_variable(
|
||||
b = tf1.get_variable(
|
||||
"b", [1, 1, 1, num_filters],
|
||||
initializer=tf.constant_initializer(0.0),
|
||||
initializer=tf1.constant_initializer(0.0),
|
||||
collections=collections)
|
||||
return tf.nn.conv2d(x, w, stride_shape, pad) + b
|
||||
return tf1.nn.conv2d(x, w, stride_shape, pad) + b
|
||||
|
||||
|
||||
def linear(x, size, name, initializer=None, bias_init=0):
|
||||
w = tf.get_variable(
|
||||
w = tf1.get_variable(
|
||||
name + "/w", [x.get_shape()[1], size], initializer=initializer)
|
||||
b = tf.get_variable(
|
||||
name + "/b", [size], initializer=tf.constant_initializer(bias_init))
|
||||
b = tf1.get_variable(
|
||||
name + "/b", [size], initializer=tf1.constant_initializer(bias_init))
|
||||
return tf.matmul(x, w) + b
|
||||
|
||||
|
||||
|
||||
@@ -9,7 +9,7 @@ from ray.rllib.utils.annotations import override
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
from ray.rllib.utils.tf_ops import scope_vars
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -47,7 +47,7 @@ def make_v1_wrapper(legacy_model_cls):
|
||||
# Tracks update ops
|
||||
self._update_ops = None
|
||||
|
||||
with tf.variable_scope(self.name) as scope:
|
||||
with tf1.variable_scope(self.name) as scope:
|
||||
self.variable_scope = scope
|
||||
|
||||
@override(ModelV2)
|
||||
@@ -58,20 +58,20 @@ def make_v1_wrapper(legacy_model_cls):
|
||||
def __call__(self, input_dict, state, seq_lens):
|
||||
if self.cur_instance:
|
||||
# create a weight-sharing model copy
|
||||
with tf.variable_scope(self.cur_instance.scope, reuse=True):
|
||||
with tf1.variable_scope(self.cur_instance.scope, reuse=True):
|
||||
new_instance = self.legacy_model_cls(
|
||||
input_dict, self.obs_space, self.action_space,
|
||||
self.num_outputs, self.model_config, state, seq_lens)
|
||||
else:
|
||||
# create a new model instance
|
||||
with tf.variable_scope(self.name):
|
||||
with tf1.variable_scope(self.name):
|
||||
prev_update_ops = set(
|
||||
tf.get_collection(tf.GraphKeys.UPDATE_OPS))
|
||||
tf1.get_collection(tf1.GraphKeys.UPDATE_OPS))
|
||||
new_instance = self.legacy_model_cls(
|
||||
input_dict, self.obs_space, self.action_space,
|
||||
self.num_outputs, self.model_config, state, seq_lens)
|
||||
self._update_ops = list(
|
||||
set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) -
|
||||
set(tf1.get_collection(tf1.GraphKeys.UPDATE_OPS)) -
|
||||
prev_update_ops)
|
||||
if len(new_instance.state_init) != len(self.get_initial_state()):
|
||||
raise ValueError(
|
||||
@@ -112,8 +112,9 @@ def make_v1_wrapper(legacy_model_cls):
|
||||
def value_function(self):
|
||||
assert self.cur_instance is not None, "must call forward first"
|
||||
|
||||
with tf.variable_scope(self.variable_scope):
|
||||
with tf.variable_scope("value_function", reuse=tf.AUTO_REUSE):
|
||||
with tf1.variable_scope(self.variable_scope):
|
||||
with tf1.variable_scope(
|
||||
"value_function", reuse=tf1.AUTO_REUSE):
|
||||
# Simple case: sharing the feature layer
|
||||
if self.model_config["vf_share_layers"]:
|
||||
return tf.reshape(
|
||||
|
||||
@@ -7,7 +7,7 @@ from ray.rllib.policy.sample_batch import SampleBatch
|
||||
from ray.rllib.utils.annotations import override, DeveloperAPI
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
|
||||
@DeveloperAPI
|
||||
@@ -160,18 +160,17 @@ class LSTMWrapper(RecurrentNetwork):
|
||||
|
||||
# Concat. prev-action/reward if required.
|
||||
if self.model_config["lstm_use_prev_action_reward"]:
|
||||
if self.model_config["lstm_use_prev_action_reward"]:
|
||||
wrapped_out = tf.concat(
|
||||
[
|
||||
wrapped_out,
|
||||
tf.reshape(
|
||||
tf.cast(input_dict[SampleBatch.PREV_ACTIONS],
|
||||
tf.float32), [-1, self.action_dim]),
|
||||
tf.reshape(
|
||||
tf.cast(input_dict[SampleBatch.PREV_REWARDS],
|
||||
tf.float32), [-1, 1]),
|
||||
],
|
||||
axis=1)
|
||||
wrapped_out = tf.concat(
|
||||
[
|
||||
wrapped_out,
|
||||
tf.reshape(
|
||||
tf.cast(input_dict[SampleBatch.PREV_ACTIONS],
|
||||
tf.float32), [-1, self.action_dim]),
|
||||
tf.reshape(
|
||||
tf.cast(input_dict[SampleBatch.PREV_REWARDS],
|
||||
tf.float32), [-1, 1]),
|
||||
],
|
||||
axis=1)
|
||||
|
||||
# Then through our LSTM.
|
||||
input_dict["obs_flat"] = wrapped_out
|
||||
|
||||
@@ -9,7 +9,7 @@ from ray.rllib.utils.annotations import override, DeveloperAPI
|
||||
from ray.rllib.utils.framework import try_import_tf, try_import_tfp
|
||||
from ray.rllib.utils.spaces.space_utils import get_base_struct_from_space
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
tfp = try_import_tfp()
|
||||
tree = try_import_tree()
|
||||
|
||||
@@ -85,7 +85,7 @@ class Categorical(TFActionDistribution):
|
||||
|
||||
@override(TFActionDistribution)
|
||||
def _build_sample_op(self):
|
||||
return tf.squeeze(tf.multinomial(self.inputs, 1), axis=1)
|
||||
return tf.squeeze(tf.random.categorical(self.inputs, 1), axis=1)
|
||||
|
||||
@staticmethod
|
||||
@override(ActionDistribution)
|
||||
|
||||
@@ -2,7 +2,7 @@ from ray.rllib.models.modelv2 import ModelV2
|
||||
from ray.rllib.utils.annotations import override, PublicAPI
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
|
||||
@PublicAPI
|
||||
@@ -39,10 +39,10 @@ class TFModelV2(ModelV2):
|
||||
name,
|
||||
framework="tf")
|
||||
self.var_list = []
|
||||
if tf.executing_eagerly():
|
||||
if tf1.executing_eagerly():
|
||||
self.graph = None
|
||||
else:
|
||||
self.graph = tf.get_default_graph()
|
||||
self.graph = tf1.get_default_graph()
|
||||
|
||||
def context(self):
|
||||
"""Returns a contextmanager for the current TF graph."""
|
||||
|
||||
@@ -3,7 +3,7 @@ from ray.rllib.models.tf.visionnet_v1 import _get_filter_config
|
||||
from ray.rllib.models.tf.misc import normc_initializer
|
||||
from ray.rllib.utils.framework import get_activation_fn, try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
|
||||
class VisionNetwork(TFModelV2):
|
||||
|
||||
@@ -4,7 +4,7 @@ from ray.rllib.utils.annotations import override
|
||||
from ray.rllib.utils.deprecation import deprecation_warning
|
||||
from ray.rllib.utils.framework import get_activation_fn, try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
|
||||
# Deprecated: see as an alternative models/tf.visionnet.py
|
||||
@@ -24,9 +24,9 @@ class VisionNetwork(Model):
|
||||
|
||||
activation = get_activation_fn(options.get("conv_activation"))
|
||||
|
||||
with tf.name_scope("vision_net"):
|
||||
with tf1.name_scope("vision_net"):
|
||||
for i, (out_size, kernel, stride) in enumerate(filters[:-1], 1):
|
||||
inputs = tf.layers.conv2d(
|
||||
inputs = tf1.layers.conv2d(
|
||||
inputs,
|
||||
out_size,
|
||||
kernel,
|
||||
@@ -38,7 +38,7 @@ class VisionNetwork(Model):
|
||||
|
||||
# skip final linear layer
|
||||
if options.get("no_final_linear"):
|
||||
fc_out = tf.layers.conv2d(
|
||||
fc_out = tf1.layers.conv2d(
|
||||
inputs,
|
||||
num_outputs,
|
||||
kernel,
|
||||
@@ -48,7 +48,7 @@ class VisionNetwork(Model):
|
||||
name="fc_out")
|
||||
return flatten(fc_out), flatten(fc_out)
|
||||
|
||||
fc1 = tf.layers.conv2d(
|
||||
fc1 = tf1.layers.conv2d(
|
||||
inputs,
|
||||
out_size,
|
||||
kernel,
|
||||
@@ -56,7 +56,7 @@ class VisionNetwork(Model):
|
||||
activation=activation,
|
||||
padding="valid",
|
||||
name="fc1")
|
||||
fc2 = tf.layers.conv2d(
|
||||
fc2 = tf1.layers.conv2d(
|
||||
fc1,
|
||||
num_outputs, [1, 1],
|
||||
activation=None,
|
||||
|
||||
@@ -6,7 +6,7 @@ from ray.rllib.policy.sample_batch import MultiAgentBatch
|
||||
from ray.rllib.utils.annotations import PublicAPI
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -75,7 +75,7 @@ class InputReader:
|
||||
k: (-1, ) + s[1:]
|
||||
for (k, s) in [(k, batch[k].shape) for k in keys]
|
||||
}
|
||||
queue = tf.FIFOQueue(capacity=queue_size, dtypes=dtypes, names=keys)
|
||||
queue = tf1.FIFOQueue(capacity=queue_size, dtypes=dtypes, names=keys)
|
||||
tensors = queue.dequeue()
|
||||
|
||||
logger.info("Creating TF queue runner for {}".format(self))
|
||||
@@ -92,12 +92,12 @@ class _QueueRunner(threading.Thread):
|
||||
|
||||
def __init__(self, input_reader, queue, keys, dtypes):
|
||||
threading.Thread.__init__(self)
|
||||
self.sess = tf.get_default_session()
|
||||
self.sess = tf1.get_default_session()
|
||||
self.daemon = True
|
||||
self.input_reader = input_reader
|
||||
self.keys = keys
|
||||
self.queue = queue
|
||||
self.placeholders = [tf.placeholder(dtype) for dtype in dtypes]
|
||||
self.placeholders = [tf1.placeholder(dtype) for dtype in dtypes]
|
||||
self.enqueue_op = queue.enqueue(dict(zip(keys, self.placeholders)))
|
||||
|
||||
def enqueue(self, batch):
|
||||
|
||||
@@ -45,7 +45,7 @@ class JsonReader(InputReader):
|
||||
logger.warning(
|
||||
"Treating input directory as glob pattern: {}".format(
|
||||
inputs))
|
||||
if urlparse(inputs).scheme not in ["d", ""]:
|
||||
if urlparse(inputs).scheme not in ["", "c"]:
|
||||
raise ValueError(
|
||||
"Don't know how to glob over `{}`, ".format(inputs) +
|
||||
"please specify a list of files to read instead.")
|
||||
@@ -123,7 +123,7 @@ class JsonReader(InputReader):
|
||||
|
||||
def _next_file(self):
|
||||
path = random.choice(self.files)
|
||||
if urlparse(path).scheme:
|
||||
if urlparse(path).scheme not in ["", "c"]:
|
||||
if smart_open is None:
|
||||
raise ValueError(
|
||||
"You must install the `smart_open` module to read "
|
||||
|
||||
@@ -42,7 +42,7 @@ class JsonWriter(OutputWriter):
|
||||
self.ioctx = ioctx or IOContext()
|
||||
self.max_file_size = max_file_size
|
||||
self.compress_columns = compress_columns
|
||||
if urlparse(path).scheme:
|
||||
if urlparse(path).scheme not in ["", "c"]:
|
||||
self.path_is_uri = True
|
||||
else:
|
||||
path = os.path.abspath(os.path.expanduser(path))
|
||||
|
||||
@@ -15,7 +15,7 @@ from ray.rllib.utils.annotations import override
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
from ray.rllib.utils.timer import TimerStat
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -86,14 +86,15 @@ class TFMultiGPULearner(LearnerThread):
|
||||
self.par_opt = []
|
||||
with self.local_worker.tf_sess.graph.as_default():
|
||||
with self.local_worker.tf_sess.as_default():
|
||||
with tf.variable_scope(DEFAULT_POLICY_ID, reuse=tf.AUTO_REUSE):
|
||||
with tf1.variable_scope(
|
||||
DEFAULT_POLICY_ID, reuse=tf1.AUTO_REUSE):
|
||||
if self.policy._state_inputs:
|
||||
rnn_inputs = self.policy._state_inputs + [
|
||||
self.policy._seq_lens
|
||||
]
|
||||
else:
|
||||
rnn_inputs = []
|
||||
adam = tf.train.AdamOptimizer(self.lr)
|
||||
adam = tf1.train.AdamOptimizer(self.lr)
|
||||
for _ in range(num_data_loader_buffers):
|
||||
self.par_opt.append(
|
||||
LocalSyncParallelOptimizer(
|
||||
@@ -105,7 +106,7 @@ class TFMultiGPULearner(LearnerThread):
|
||||
self.policy.copy))
|
||||
|
||||
self.sess = self.local_worker.tf_sess
|
||||
self.sess.run(tf.global_variables_initializer())
|
||||
self.sess.run(tf1.global_variables_initializer())
|
||||
|
||||
self.idle_optimizers = queue.Queue()
|
||||
self.ready_optimizers = queue.Queue()
|
||||
|
||||
@@ -5,7 +5,7 @@ from ray.util.debug import log_once
|
||||
from ray.rllib.utils.debug import summarize
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
# Variable scope in which created variables will be placed under
|
||||
TOWER_SCOPE_NAME = "tower"
|
||||
@@ -63,21 +63,21 @@ class LocalSyncParallelOptimizer:
|
||||
self.build_graph = build_graph
|
||||
|
||||
# First initialize the shared loss network
|
||||
with tf.name_scope(TOWER_SCOPE_NAME):
|
||||
with tf1.name_scope(TOWER_SCOPE_NAME):
|
||||
self._shared_loss = build_graph(self.loss_inputs)
|
||||
shared_ops = tf.get_collection(
|
||||
tf.GraphKeys.UPDATE_OPS, scope=tf.get_variable_scope().name)
|
||||
shared_ops = tf1.get_collection(
|
||||
tf1.GraphKeys.UPDATE_OPS, scope=tf1.get_variable_scope().name)
|
||||
|
||||
# Then setup the per-device loss graphs that use the shared weights
|
||||
self._batch_index = tf.placeholder(tf.int32, name="batch_index")
|
||||
self._batch_index = tf1.placeholder(tf.int32, name="batch_index")
|
||||
|
||||
# Dynamic batch size, which may be shrunk if there isn't enough data
|
||||
self._per_device_batch_size = tf.placeholder(
|
||||
self._per_device_batch_size = tf1.placeholder(
|
||||
tf.int32, name="per_device_batch_size")
|
||||
self._loaded_per_device_batch_size = max_per_device_batch_size
|
||||
|
||||
# When loading RNN input, we dynamically determine the max seq len
|
||||
self._max_seq_len = tf.placeholder(tf.int32, name="max_seq_len")
|
||||
self._max_seq_len = tf1.placeholder(tf.int32, name="max_seq_len")
|
||||
self._loaded_max_seq_len = 1
|
||||
|
||||
# Split on the CPU in case the data doesn't fit in GPU memory.
|
||||
@@ -103,15 +103,15 @@ class LocalSyncParallelOptimizer:
|
||||
# gather update ops for any batch norm layers. TODO(ekl) here we will
|
||||
# use all the ops found which won't work for DQN / DDPG, but those
|
||||
# aren't supported with multi-gpu right now anyways.
|
||||
self._update_ops = tf.get_collection(
|
||||
tf.GraphKeys.UPDATE_OPS, scope=tf.get_variable_scope().name)
|
||||
self._update_ops = tf1.get_collection(
|
||||
tf1.GraphKeys.UPDATE_OPS, scope=tf1.get_variable_scope().name)
|
||||
for op in shared_ops:
|
||||
self._update_ops.remove(op) # only care about tower update ops
|
||||
if self._update_ops:
|
||||
logger.debug("Update ops to run on apply gradient: {}".format(
|
||||
self._update_ops))
|
||||
|
||||
with tf.control_dependencies(self._update_ops):
|
||||
with tf1.control_dependencies(self._update_ops):
|
||||
self._train_op = self.optimizer.apply_gradients(avg)
|
||||
|
||||
def load_data(self, sess, inputs, state_inputs):
|
||||
@@ -265,11 +265,11 @@ class LocalSyncParallelOptimizer:
|
||||
def _setup_device(self, device, device_input_placeholders, num_data_in):
|
||||
assert num_data_in <= len(device_input_placeholders)
|
||||
with tf.device(device):
|
||||
with tf.name_scope(TOWER_SCOPE_NAME):
|
||||
with tf1.name_scope(TOWER_SCOPE_NAME):
|
||||
device_input_batches = []
|
||||
device_input_slices = []
|
||||
for i, ph in enumerate(device_input_placeholders):
|
||||
current_batch = tf.Variable(
|
||||
current_batch = tf1.Variable(
|
||||
ph,
|
||||
trainable=False,
|
||||
validate_shape=False,
|
||||
|
||||
@@ -16,7 +16,7 @@ from ray.rllib.utils.framework import try_import_tf
|
||||
from ray.rllib.utils.sgd import averaged
|
||||
from ray.rllib.utils.timer import TimerStat
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -115,7 +115,7 @@ class LocalMultiGPUOptimizer(PolicyOptimizer):
|
||||
with self.workers.local_worker().tf_sess.graph.as_default():
|
||||
with self.workers.local_worker().tf_sess.as_default():
|
||||
for policy_id, policy in self.policies.items():
|
||||
with tf.variable_scope(policy_id, reuse=tf.AUTO_REUSE):
|
||||
with tf1.variable_scope(policy_id, reuse=tf1.AUTO_REUSE):
|
||||
if policy._state_inputs:
|
||||
rnn_inputs = policy._state_inputs + [
|
||||
policy._seq_lens
|
||||
@@ -130,7 +130,7 @@ class LocalMultiGPUOptimizer(PolicyOptimizer):
|
||||
self.per_device_batch_size, policy.copy))
|
||||
|
||||
self.sess = self.workers.local_worker().tf_sess
|
||||
self.sess.run(tf.global_variables_initializer())
|
||||
self.sess.run(tf1.global_variables_initializer())
|
||||
|
||||
@override(PolicyOptimizer)
|
||||
def step(self):
|
||||
|
||||
@@ -14,7 +14,7 @@ from ray.rllib.policy.sample_batch import SampleBatch
|
||||
from ray.rllib.tests.mock_worker import _MockWorker
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
|
||||
class LRScheduleTest(unittest.TestCase):
|
||||
@@ -250,7 +250,7 @@ class AsyncSamplesOptimizerTest(unittest.TestCase):
|
||||
|
||||
def _make_envs(self):
|
||||
def make_sess():
|
||||
return tf.Session(config=tf.ConfigProto(device_count={"CPU": 2}))
|
||||
return tf1.Session(config=tf1.ConfigProto(device_count={"CPU": 2}))
|
||||
|
||||
local = RolloutWorker(
|
||||
env_creator=lambda _: gym.make("CartPole-v0"),
|
||||
|
||||
@@ -14,7 +14,7 @@ from ray.rllib.utils.debug import summarize
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
from ray.rllib.utils.tracking_dict import UsageTrackingDict
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -116,7 +116,7 @@ class DynamicTFPolicy(TFPolicy):
|
||||
explore = existing_inputs["is_exploring"]
|
||||
timestep = existing_inputs["timestep"]
|
||||
else:
|
||||
obs = tf.placeholder(
|
||||
obs = tf1.placeholder(
|
||||
tf.float32,
|
||||
shape=[None] + list(obs_space.shape),
|
||||
name="observation")
|
||||
@@ -124,11 +124,11 @@ class DynamicTFPolicy(TFPolicy):
|
||||
if self._obs_include_prev_action_reward:
|
||||
prev_actions = ModelCatalog.get_action_placeholder(
|
||||
action_space, "prev_action")
|
||||
prev_rewards = tf.placeholder(
|
||||
prev_rewards = tf1.placeholder(
|
||||
tf.float32, [None], name="prev_reward")
|
||||
explore = tf.placeholder_with_default(
|
||||
explore = tf1.placeholder_with_default(
|
||||
True, (), name="is_exploring")
|
||||
timestep = tf.placeholder(tf.int32, (), name="timestep")
|
||||
timestep = tf1.placeholder(tf.int32, (), name="timestep")
|
||||
|
||||
self._input_dict = {
|
||||
SampleBatch.CUR_OBS: obs,
|
||||
@@ -137,7 +137,7 @@ class DynamicTFPolicy(TFPolicy):
|
||||
"is_training": self._get_is_training_placeholder(),
|
||||
}
|
||||
# Placeholder for RNN time-chunk valid lengths.
|
||||
self._seq_lens = tf.placeholder(
|
||||
self._seq_lens = tf1.placeholder(
|
||||
dtype=tf.int32, shape=[None], name="seq_lens")
|
||||
|
||||
dist_class = dist_inputs = None
|
||||
@@ -176,7 +176,7 @@ class DynamicTFPolicy(TFPolicy):
|
||||
self._seq_lens = existing_inputs["seq_lens"]
|
||||
else:
|
||||
self._state_in = [
|
||||
tf.placeholder(shape=(None, ) + s.shape, dtype=s.dtype)
|
||||
tf1.placeholder(shape=(None, ) + s.shape, dtype=s.dtype)
|
||||
for s in self.model.get_initial_state()
|
||||
]
|
||||
|
||||
@@ -223,7 +223,7 @@ class DynamicTFPolicy(TFPolicy):
|
||||
explore=explore)
|
||||
|
||||
# Phase 1 init.
|
||||
sess = tf.get_default_session() or tf.Session()
|
||||
sess = tf1.get_default_session() or tf1.Session()
|
||||
if get_batch_divisibility_req:
|
||||
batch_divisibility_req = get_batch_divisibility_req(self)
|
||||
else:
|
||||
@@ -343,7 +343,7 @@ class DynamicTFPolicy(TFPolicy):
|
||||
dummy_batch[k] = fake_array(v)
|
||||
|
||||
# postprocessing might depend on variable init, so run it first here
|
||||
self._sess.run(tf.global_variables_initializer())
|
||||
self._sess.run(tf1.global_variables_initializer())
|
||||
|
||||
postprocessed_batch = self.postprocess_trajectory(
|
||||
SampleBatch(dummy_batch))
|
||||
@@ -380,7 +380,7 @@ class DynamicTFPolicy(TFPolicy):
|
||||
continue
|
||||
shape = (None, ) + v.shape[1:]
|
||||
dtype = np.float32 if v.dtype == np.float64 else v.dtype
|
||||
placeholder = tf.placeholder(dtype, shape=shape, name=k)
|
||||
placeholder = tf1.placeholder(dtype, shape=shape, name=k)
|
||||
train_batch[k] = placeholder
|
||||
|
||||
for i, si in enumerate(self._state_in):
|
||||
@@ -402,7 +402,7 @@ class DynamicTFPolicy(TFPolicy):
|
||||
if self._grad_stats_fn:
|
||||
self._stats_fetches.update(
|
||||
self._grad_stats_fn(self, train_batch, self._grads))
|
||||
self._sess.run(tf.global_variables_initializer())
|
||||
self._sess.run(tf1.global_variables_initializer())
|
||||
|
||||
def _do_loss_init(self, train_batch):
|
||||
loss = self._loss_fn(self, self.model, self.dist_class, train_batch)
|
||||
|
||||
@@ -16,7 +16,7 @@ from ray.rllib.utils.annotations import override
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
from ray.rllib.utils.spaces.space_utils import flatten_to_single_ndarray
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@@ -239,7 +239,7 @@ def build_eager_tf_policy(name,
|
||||
)
|
||||
self.exploration = self._create_exploration()
|
||||
self._state_in = [
|
||||
tf.convert_to_tensor(np.array([s]))
|
||||
tf.convert_to_tensor([s])
|
||||
for s in self.model.get_initial_state()
|
||||
]
|
||||
input_dict = {
|
||||
@@ -266,7 +266,7 @@ def build_eager_tf_policy(name,
|
||||
if optimizer_fn:
|
||||
self._optimizer = optimizer_fn(self, config)
|
||||
else:
|
||||
self._optimizer = tf.train.AdamOptimizer(config["lr"])
|
||||
self._optimizer = tf1.train.AdamOptimizer(config["lr"])
|
||||
|
||||
if after_init:
|
||||
after_init(self, observation_space, action_space, config)
|
||||
@@ -618,8 +618,7 @@ def build_eager_tf_policy(name,
|
||||
SampleBatch.DONES: np.array([False], dtype=np.bool),
|
||||
SampleBatch.REWARDS: np.array([0], dtype=np.float32),
|
||||
}
|
||||
if isinstance(self.action_space, Tuple) or isinstance(
|
||||
self.action_space, Dict):
|
||||
if isinstance(self.action_space, (Dict, Tuple)):
|
||||
dummy_batch[SampleBatch.ACTIONS] = [
|
||||
flatten_to_single_ndarray(self.action_space.sample())
|
||||
]
|
||||
@@ -640,7 +639,7 @@ def build_eager_tf_policy(name,
|
||||
dummy_batch["seq_lens"] = np.array([1], dtype=np.int32)
|
||||
|
||||
# Convert everything to tensors.
|
||||
dummy_batch = tf.nest.map_structure(tf.convert_to_tensor,
|
||||
dummy_batch = tf.nest.map_structure(tf1.convert_to_tensor,
|
||||
dummy_batch)
|
||||
|
||||
# for IMPALA which expects a certain sample batch size.
|
||||
|
||||
@@ -20,7 +20,7 @@ from ray.rllib.utils.annotations import DeveloperAPI
|
||||
from ray.rllib.utils.debug import summarize
|
||||
from ray.rllib.utils.framework import try_import_tf, try_import_torch
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
torch, _ = try_import_torch()
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -203,7 +203,7 @@ def chop_into_sequences(episode_ids,
|
||||
seq_len = 0
|
||||
unique_ids = np.add(
|
||||
np.add(episode_ids, agent_indices),
|
||||
np.array(unroll_ids) << 32)
|
||||
np.array(unroll_ids, dtype=np.int64) << 32)
|
||||
for uid in unique_ids:
|
||||
if (prev_id is not None and uid != prev_id) or \
|
||||
seq_len >= max_seq_len:
|
||||
|
||||
@@ -11,7 +11,7 @@ from ray.rllib.utils.test_utils import check, framework_iterator
|
||||
from ray.rllib.utils.numpy import one_hot, fc, MIN_LOG_NN_OUTPUT, \
|
||||
MAX_LOG_NN_OUTPUT
|
||||
|
||||
tf = try_import_tf()
|
||||
tf1, tf, tfv = try_import_tf()
|
||||
|
||||
|
||||
def do_test_log_likelihood(run,
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user