[RLlib] Tf2x preparation; part 2 (upgrading try_import_tf()). (#9136)

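`try_import_tf()` is now unpacked into three names (`tf1, tf, tfv = try_import_tf()`); TF1-style graph/session APIs (placeholders, sessions, variable scopes, `global_variables`, `train` optimizers) are accessed through `tf1`. Squashed from a series of WIP, fix, test, and LINT commits.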
2026-06-27 19:00:36 +08:00 · 2020-06-30 10:13:20 +02:00
parent fb074da7c3
commit 43043ee4d5
125 changed files with 617 additions and 584 deletions
@@ -4,7 +4,7 @@ import numpy as np
 from ray.rllib.utils import force_list
 from ray.rllib.utils.framework import try_import_tf

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()


 def unflatten(vector, shapes):
@@ -79,24 +79,29 @@ class TensorFlowVariables:
                variable_names.append(tf_obj.node_def.name)
        self.variables = OrderedDict()
        variable_list = [
-            v for v in tf.global_variables()
+            v for v in tf1.global_variables()
            if v.op.node_def.name in variable_names
        ]
        if input_variables is not None:
            variable_list += input_variables
-        for v in variable_list:
-            self.variables[v.op.node_def.name] = v

-        self.placeholders = {}
-        self.assignment_nodes = {}
+        if not tf1.executing_eagerly():
+            for v in variable_list:
+                self.variables[v.op.node_def.name] = v

-        # Create new placeholders to put in custom weights.
-        for k, var in self.variables.items():
-            self.placeholders[k] = tf.placeholder(
-                var.value().dtype,
-                var.get_shape().as_list(),
-                name="Placeholder_" + k)
-            self.assignment_nodes[k] = var.assign(self.placeholders[k])
+            self.placeholders = {}
+            self.assignment_nodes = {}
+
+            # Create new placeholders to put in custom weights.
+            for k, var in self.variables.items():
+                self.placeholders[k] = tf1.placeholder(
+                    var.value().dtype,
+                    var.get_shape().as_list(),
+                    name="Placeholder_" + k)
+                self.assignment_nodes[k] = var.assign(self.placeholders[k])
+        else:
+            for v in variable_list:
+                self.variables[v.name] = v

    def set_session(self, sess):
        """Sets the current session used by the class.
@@ -117,10 +122,12 @@ class TensorFlowVariables:

    def _check_sess(self):
        """Checks if the session is set, and if not throw an error message."""
-        assert self.sess is not None, ("The session is not set. Set the "
-                                       "session either by passing it into the "
-                                       "TensorFlowVariables constructor or by "
-                                       "calling set_session(sess).")
+        if tf1.executing_eagerly():
+            return
+        assert self.sess is not None, \
+            "The session is not set. Set the session either by passing it " \
+            "into the TensorFlowVariables constructor or by calling " \
+            "set_session(sess)."

    def get_flat(self):
        """Gets the weights and returns them as a flat array.
@@ -129,6 +136,11 @@ class TensorFlowVariables:
            1D Array containing the flattened weights.
        """
        self._check_sess()
+        # Eager mode.
+        if not self.sess:
+            return np.concatenate(
+                [v.numpy().flatten() for v in self.variables.values()])
+        # Graph mode.
        return np.concatenate([
            v.eval(session=self.sess).flatten()
            for v in self.variables.values()
@@ -147,12 +159,16 @@ class TensorFlowVariables:
        self._check_sess()
        shapes = [v.get_shape().as_list() for v in self.variables.values()]
        arrays = unflatten(new_weights, shapes)
-        placeholders = [
-            self.placeholders[k] for k, v in self.variables.items()
-        ]
-        self.sess.run(
-            list(self.assignment_nodes.values()),
-            feed_dict=dict(zip(placeholders, arrays)))
+        if not self.sess:
+            for v, a in zip(self.variables.values(), arrays):
+                v.assign(a)
+        else:
+            placeholders = [
+                self.placeholders[k] for k, v in self.variables.items()
+            ]
+            self.sess.run(
+                list(self.assignment_nodes.values()),
+                feed_dict=dict(zip(placeholders, arrays)))

    def get_weights(self):
        """Returns a dictionary containing the weights of the network.
@@ -161,6 +177,10 @@ class TensorFlowVariables:
            Dictionary mapping variable names to their weights.
        """
        self._check_sess()
+        # Eager mode.
+        if not self.sess:
+            return self.variables
+        # Graph mode.
        return self.sess.run(self.variables)

    def set_weights(self, new_weights):
@@ -344,6 +344,7 @@ py_test(
    args = ["--yaml-dir=tuned_examples/sac", "--torch"]
 )

+
 # TD3
 py_test(
    name = "run_regression_tests_pendulum_td3_tf",
@@ -1013,6 +1014,13 @@ py_test(
    srcs = ["models/tests/test_distributions.py"]
 )

+py_test(
+    name = "test_attention_nets",
+    tags = ["models"],
+    size = "small",
+    srcs = ["models/tests/test_attention_nets.py"]
+)
+
 # --------------------------------------------------------------------
 # Optimizers and Memories
 # rllib/execution/
@@ -9,7 +9,7 @@ from ray.rllib.policy.tf_policy import LearningRateSchedule
 from ray.rllib.utils.framework import try_import_tf
 from ray.rllib.utils.tf_ops import explained_variance, make_tf_callable

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()


 class A3CLoss:
@@ -13,7 +13,7 @@ from ray.rllib.utils.filter import get_filter
 from ray.rllib.utils.framework import try_import_tf
 from ray.rllib.utils.spaces.space_utils import unbatch

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()


 class ARSTFPolicy:
@@ -29,8 +29,8 @@ class ARSTFPolicy:
        self.single_threaded = config.get("single_threaded", False)
        self.sess = make_session(single_threaded=self.single_threaded)

-        self.inputs = tf.placeholder(tf.float32,
-                                     [None] + list(self.preprocessor.shape))
+        self.inputs = tf1.placeholder(tf.float32,
+                                      [None] + list(self.preprocessor.shape))

        # Policy network.
        dist_class, dist_dim = ModelCatalog.get_action_dist(
@@ -52,7 +52,7 @@ class ARSTFPolicy:
        self.num_params = sum(
            np.prod(variable.shape.as_list())
            for _, variable in self.variables.variables.items())
-        self.sess.run(tf.global_variables_initializer())
+        self.sess.run(tf1.global_variables_initializer())

    def compute_actions(self,
                        observation,
@@ -3,7 +3,7 @@ import numpy as np
 from ray.rllib.models.tf.tf_modelv2 import TFModelV2
 from ray.rllib.utils.framework import try_import_tf

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()


 class DDPGTFModel(TFModelV2):
@@ -22,7 +22,7 @@ from ray.rllib.utils.framework import try_import_tf
 from ray.rllib.utils.tf_ops import huber_loss, minimize_and_clip, \
    make_tf_callable

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()

 logger = logging.getLogger(__name__)

@@ -126,18 +126,18 @@ def ddpg_actor_critic_loss(policy, model, _, train_batch):
    target_model_out_tp1, _ = policy.target_model(input_dict_next, [], None)

    # Policy network evaluation.
-    with tf.variable_scope(POLICY_SCOPE, reuse=True):
-        # prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
+    with tf1.variable_scope(POLICY_SCOPE, reuse=True):
+        # prev_update_ops = set(tf1.get_collection(tf1.GraphKeys.UPDATE_OPS))
        policy_t = model.get_policy_output(model_out_t)
        # policy_batchnorm_update_ops = list(
-        #    set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops)
+        #  set(tf1.get_collection(tf1.GraphKeys.UPDATE_OPS)) - prev_update_ops)

-    with tf.variable_scope(POLICY_TARGET_SCOPE):
+    with tf1.variable_scope(POLICY_TARGET_SCOPE):
        policy_tp1 = \
            policy.target_model.get_policy_output(target_model_out_tp1)

    # Action outputs.
-    with tf.variable_scope(ACTION_SCOPE, reuse=True):
+    with tf1.variable_scope(ACTION_SCOPE, reuse=True):
        if policy.config["smooth_target_policy"]:
            target_noise_clip = policy.config["target_noise_clip"]
            clipped_normal_sample = tf.clip_by_value(
@@ -154,29 +154,29 @@ def ddpg_actor_critic_loss(policy, model, _, train_batch):
            policy_tp1_smoothed = policy_tp1

    # Q-net(s) evaluation.
-    # prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
-    with tf.variable_scope(Q_SCOPE):
+    # prev_update_ops = set(tf1.get_collection(tf1.GraphKeys.UPDATE_OPS))
+    with tf1.variable_scope(Q_SCOPE):
         # Q-values for given actions & observations in given current state
        q_t = model.get_q_values(model_out_t, train_batch[SampleBatch.ACTIONS])

-    with tf.variable_scope(Q_SCOPE, reuse=True):
+    with tf1.variable_scope(Q_SCOPE, reuse=True):
        # Q-values for current policy (no noise) in given current state
        q_t_det_policy = model.get_q_values(model_out_t, policy_t)

    if twin_q:
-        with tf.variable_scope(TWIN_Q_SCOPE):
+        with tf1.variable_scope(TWIN_Q_SCOPE):
            twin_q_t = model.get_twin_q_values(
                model_out_t, train_batch[SampleBatch.ACTIONS])
    # q_batchnorm_update_ops = list(
-    #     set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops)
+    #     set(tf1.get_collection(tf1.GraphKeys.UPDATE_OPS)) - prev_update_ops)

    # Target q-net(s) evaluation.
-    with tf.variable_scope(Q_TARGET_SCOPE):
+    with tf1.variable_scope(Q_TARGET_SCOPE):
        q_tp1 = policy.target_model.get_q_values(target_model_out_tp1,
                                                 policy_tp1_smoothed)

    if twin_q:
-        with tf.variable_scope(TWIN_Q_TARGET_SCOPE):
+        with tf1.variable_scope(TWIN_Q_TARGET_SCOPE):
            twin_q_tp1 = policy.target_model.get_twin_q_values(
                target_model_out_tp1, policy_tp1_smoothed)

@@ -220,10 +220,10 @@ def ddpg_actor_critic_loss(policy, model, _, train_batch):
    if l2_reg is not None:
        for var in policy.model.policy_variables():
            if "bias" not in var.name:
-                actor_loss += (l2_reg * tf.nn.l2_loss(var))
+                actor_loss += (l2_reg * tf1.nn.l2_loss(var))
        for var in policy.model.q_variables():
            if "bias" not in var.name:
-                critic_loss += (l2_reg * tf.nn.l2_loss(var))
+                critic_loss += (l2_reg * tf1.nn.l2_loss(var))

    # Model self-supervised losses.
    if policy.config["use_state_preprocessor"]:
@@ -259,9 +259,9 @@ def ddpg_actor_critic_loss(policy, model, _, train_batch):

 def make_ddpg_optimizers(policy, config):
    # Create separate optimizers for actor & critic losses.
-    policy._actor_optimizer = tf.train.AdamOptimizer(
+    policy._actor_optimizer = tf1.train.AdamOptimizer(
        learning_rate=config["actor_lr"])
-    policy._critic_optimizer = tf.train.AdamOptimizer(
+    policy._critic_optimizer = tf1.train.AdamOptimizer(
        learning_rate=config["critic_lr"])
    return None

@@ -286,7 +286,7 @@ def build_apply_op(policy, optimizer, grads_and_vars):
    # For policy gradient, update policy net one time v.s.
    # update critic net `policy_delay` time(s).
    should_apply_actor_opt = tf.equal(
-        tf.mod(policy.global_step, policy.config["policy_delay"]), 0)
+        tf.math.floormod(policy.global_step, policy.config["policy_delay"]), 0)

    def make_apply_op():
        return policy._actor_optimizer.apply_gradients(
@@ -299,7 +299,7 @@ def build_apply_op(policy, optimizer, grads_and_vars):
    critic_op = policy._critic_optimizer.apply_gradients(
        policy._critic_grads_and_vars)
    # Increment global step & apply ops.
-    with tf.control_dependencies([tf.assign_add(policy.global_step, 1)]):
+    with tf1.control_dependencies([tf1.assign_add(policy.global_step, 1)]):
        return tf.group(actor_op, critic_op)


@@ -341,7 +341,7 @@ def build_ddpg_stats(policy, batch):

 def before_init_fn(policy, obs_space, action_space, config):
    # Create global step for counting the number of update operations.
-    policy.global_step = tf.train.get_or_create_global_step()
+    policy.global_step = tf1.train.get_or_create_global_step()


 class ComputeTDErrorMixin:
@@ -49,10 +49,10 @@ def ddpg_actor_critic_loss(policy, model, _, train_batch):
    target_model_out_tp1, _ = policy.target_model(input_dict_next, [], None)

    # Policy network evaluation.
-    # prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
+    # prev_update_ops = set(tf1.get_collection(tf1.GraphKeys.UPDATE_OPS))
    policy_t = model.get_policy_output(model_out_t)
    # policy_batchnorm_update_ops = list(
-    #    set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops)
+    #    set(tf1.get_collection(tf1.GraphKeys.UPDATE_OPS)) - prev_update_ops)

    policy_tp1 = \
        policy.target_model.get_policy_output(target_model_out_tp1)
@@ -73,7 +73,7 @@ def ddpg_actor_critic_loss(policy, model, _, train_batch):
        policy_tp1_smoothed = policy_tp1

    # Q-net(s) evaluation.
-    # prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
+    # prev_update_ops = set(tf1.get_collection(tf1.GraphKeys.UPDATE_OPS))
     # Q-values for given actions & observations in given current state
    q_t = model.get_q_values(model_out_t, train_batch[SampleBatch.ACTIONS])

@@ -86,7 +86,7 @@ def ddpg_actor_critic_loss(policy, model, _, train_batch):
        twin_q_t = model.get_twin_q_values(model_out_t,
                                           train_batch[SampleBatch.ACTIONS])
    # q_batchnorm_update_ops = list(
-    #     set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops)
+    #     set(tf1.get_collection(tf1.GraphKeys.UPDATE_OPS)) - prev_update_ops)

    # Target q-net(s) evaluation.
    q_tp1 = policy.target_model.get_q_values(target_model_out_tp1,
@@ -4,7 +4,7 @@ from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
 from ray.rllib.utils.annotations import override
 from ray.rllib.utils.framework import try_import_tf

-tf = try_import_tf()
+_, tf, _ = try_import_tf()


 class NoopModel(TFModelV2):
@@ -6,7 +6,7 @@ from ray.rllib.utils.framework import try_import_tf
 from ray.rllib.utils.test_utils import check, check_compute_single_action, \
    framework_iterator

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()


 class TestTD3(unittest.TestCase):
@@ -32,8 +32,9 @@ class TestTD3(unittest.TestCase):

        # Test against all frameworks.
        for _ in framework_iterator(config, frameworks="tf"):
+            lcl_config = config.copy()
            # Default GaussianNoise setup.
-            trainer = td3.TD3Trainer(config=config, env="Pendulum-v0")
+            trainer = td3.TD3Trainer(config=lcl_config, env="Pendulum-v0")
            # Setting explore=False should always return the same action.
            a_ = trainer.compute_action(obs, explore=False)
            for _ in range(50):
@@ -44,9 +45,10 @@ class TestTD3(unittest.TestCase):
            for _ in range(50):
                actions.append(trainer.compute_action(obs))
            check(np.std(actions), 0.0, false=True)
+            trainer.stop()

            # Check randomness at beginning.
-            config["exploration_config"] = {
+            lcl_config["exploration_config"] = {
                # Act randomly at beginning ...
                "random_timesteps": 30,
                # Then act very closely to deterministic actions thereafter.
@@ -54,7 +56,7 @@ class TestTD3(unittest.TestCase):
                "initial_scale": 0.001,
                "final_scale": 0.001,
            }
-            trainer = td3.TD3Trainer(config=config, env="Pendulum-v0")
+            trainer = td3.TD3Trainer(config=lcl_config, env="Pendulum-v0")
            # ts=1 (get a deterministic action as per explore=False).
            deterministic_action = trainer.compute_action(obs, explore=False)
            # ts=2-5 (in random window).
@@ -73,6 +75,7 @@ class TestTD3(unittest.TestCase):
            for _ in range(50):
                a = trainer.compute_action(obs, explore=False)
                check(a, deterministic_action)
+            trainer.stop()


 if __name__ == "__main__":
@@ -3,7 +3,7 @@ import numpy as np
 from ray.rllib.models.tf.tf_modelv2 import TFModelV2
 from ray.rllib.utils.framework import try_import_tf

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()


 class DistributionalQTFModel(TFModelV2):
@@ -155,7 +155,7 @@ class DistributionalQTFModel(TFModelV2):
                    units=num_atoms, activation=None)(state_out)
            return state_score

-        if tf.executing_eagerly():
+        if tf1.executing_eagerly():
            from tensorflow.python.ops import variable_scope
            # Have to use a variable store to reuse variables in eager mode
            store = variable_scope.EagerVariableStore()
@@ -163,30 +163,32 @@ class DistributionalQTFModel(TFModelV2):
            # Save the scope objects, since in eager we will execute this
            # path repeatedly and there is no guarantee it will always be run
            # in the same original scope.
-            with tf.variable_scope(name + "/action_value") as action_scope:
+            with tf1.variable_scope(name + "/action_value") as action_scope:
                pass
-            with tf.variable_scope(name + "/state_value") as state_scope:
+            with tf1.variable_scope(name + "/state_value") as state_scope:
                pass

            def build_action_value_in_scope(model_out):
                with store.as_default():
-                    with tf.variable_scope(action_scope, reuse=tf.AUTO_REUSE):
+                    with tf1.variable_scope(
+                            action_scope, reuse=tf1.AUTO_REUSE):
                        return build_action_value(model_out)

            def build_state_score_in_scope(model_out):
                with store.as_default():
-                    with tf.variable_scope(state_scope, reuse=tf.AUTO_REUSE):
+                    with tf1.variable_scope(
+                            state_scope, reuse=tf1.AUTO_REUSE):
                        return build_state_score(model_out)
        else:

            def build_action_value_in_scope(model_out):
-                with tf.variable_scope(
-                        name + "/action_value", reuse=tf.AUTO_REUSE):
+                with tf1.variable_scope(
+                        name + "/action_value", reuse=tf1.AUTO_REUSE):
                    return build_action_value(model_out)

            def build_state_score_in_scope(model_out):
-                with tf.variable_scope(
-                        name + "/state_value", reuse=tf.AUTO_REUSE):
+                with tf1.variable_scope(
+                        name + "/state_value", reuse=tf1.AUTO_REUSE):
                    return build_state_score(model_out)

        q_out = build_action_value_in_scope(self.model_out)
@@ -241,33 +243,33 @@ class DistributionalQTFModel(TFModelV2):
        epsilon_w = tf.matmul(
            a=tf.expand_dims(epsilon_in, -1), b=tf.expand_dims(epsilon_out, 0))
        epsilon_b = epsilon_out
-        sigma_w = tf.get_variable(
+        sigma_w = tf1.get_variable(
            name=prefix + "_sigma_w",
            shape=[in_size, out_size],
            dtype=tf.float32,
-            initializer=tf.random_uniform_initializer(
+            initializer=tf1.random_uniform_initializer(
                minval=-1.0 / np.sqrt(float(in_size)),
                maxval=1.0 / np.sqrt(float(in_size))))
        # TF noise generation can be unreliable on GPU
        # If generating the noise on the CPU,
        # lowering sigma0 to 0.1 may be helpful
-        sigma_b = tf.get_variable(
+        sigma_b = tf1.get_variable(
            name=prefix + "_sigma_b",
            shape=[out_size],
            dtype=tf.float32,  # 0.5~GPU, 0.1~CPU
-            initializer=tf.constant_initializer(
+            initializer=tf1.constant_initializer(
                sigma0 / np.sqrt(float(in_size))))

-        w = tf.get_variable(
+        w = tf1.get_variable(
            name=prefix + "_fc_w",
            shape=[in_size, out_size],
            dtype=tf.float32,
-            initializer=tf.initializers.glorot_uniform())
-        b = tf.get_variable(
+            initializer=tf.initializers.GlorotUniform())
+        b = tf1.get_variable(
            name=prefix + "_fc_b",
            shape=[out_size],
            dtype=tf.float32,
-            initializer=tf.zeros_initializer())
+            initializer=tf.initializers.Zeros())

        action_activation = \
            tf.keras.layers.Lambda(lambda x: tf.matmul(
@@ -17,7 +17,7 @@ from ray.rllib.utils.tf_ops import huber_loss, reduce_mean_ignore_inf, \
    minimize_and_clip
 from ray.rllib.utils.tf_ops import make_tf_callable

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()

 Q_SCOPE = "q_func"
 Q_TARGET_SCOPE = "target_q_func"
@@ -253,7 +253,7 @@ def build_q_losses(policy, model, _, train_batch):


 def adam_optimizer(policy, config):
-    return tf.train.AdamOptimizer(
+    return tf1.train.AdamOptimizer(
        learning_rate=policy.cur_lr, epsilon=config["adam_epsilon"])


@@ -1,7 +1,7 @@
 from ray.rllib.models.tf.tf_modelv2 import TFModelV2
 from ray.rllib.utils.framework import try_import_tf

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()


 class SimpleQModel(TFModelV2):
@@ -15,7 +15,7 @@ from ray.rllib.policy.tf_policy_template import build_tf_policy
 from ray.rllib.utils.framework import try_import_tf
 from ray.rllib.utils.tf_ops import huber_loss, make_tf_callable

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()
 logger = logging.getLogger(__name__)

 Q_SCOPE = "q_func"
@@ -7,7 +7,7 @@ from ray.rllib.utils.framework import try_import_tf
 from ray.rllib.utils.test_utils import check, check_compute_single_action, \
    framework_iterator

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()


 class TestDQN(unittest.TestCase):
@@ -11,7 +11,7 @@ from ray.rllib.utils.numpy import fc, one_hot, huber_loss
 from ray.rllib.utils.test_utils import check, check_compute_single_action, \
    framework_iterator

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()


 class TestSimpleQ(unittest.TestCase):
@@ -14,7 +14,7 @@ from ray.rllib.utils.framework import try_import_tf
 from ray.rllib.utils.spaces.space_utils import get_base_struct_from_space, \
    unbatch

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()
 tree = try_import_tree()


@@ -60,9 +60,9 @@ def rollout(policy, env, timestep_limit=None, add_noise=False, offset=0.0):

 def make_session(single_threaded):
    if not single_threaded:
-        return tf.Session()
-    return tf.Session(
-        config=tf.ConfigProto(
+        return tf1.Session()
+    return tf1.Session(
+        config=tf1.ConfigProto(
            inter_op_parallelism_threads=1, intra_op_parallelism_threads=1))


@@ -77,8 +77,8 @@ class ESTFPolicy:
                                             self.preprocessor.shape)
        self.single_threaded = config.get("single_threaded", False)
        self.sess = make_session(single_threaded=self.single_threaded)
-        self.inputs = tf.placeholder(tf.float32,
-                                     [None] + list(self.preprocessor.shape))
+        self.inputs = tf1.placeholder(tf.float32,
+                                      [None] + list(self.preprocessor.shape))

        # Policy network.
        dist_class, dist_dim = ModelCatalog.get_action_dist(
@@ -98,7 +98,7 @@ class ESTFPolicy:
        self.num_params = sum(
            np.prod(variable.shape.as_list())
            for _, variable in self.variables.variables.items())
-        self.sess.run(tf.global_variables_initializer())
+        self.sess.run(tf1.global_variables_initializer())

    def compute_actions(self,
                        observation,
@@ -6,7 +6,7 @@ from ray.rllib.utils.framework import try_import_tf
 from ray.rllib.utils.test_utils import check_compute_single_action, \
    framework_iterator

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()


 class TestIMPALA(unittest.TestCase):
@@ -30,7 +30,7 @@ from ray.rllib.utils.framework import try_import_tf, try_import_torch
 from ray.rllib.utils.numpy import softmax
 from ray.rllib.utils.test_utils import check, framework_iterator

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()
 torch, nn = try_import_torch()


@@ -185,20 +185,20 @@ class VtraceTest(unittest.TestCase):
                # can deal with that.
                inputs_ = {
                    # T, B, NUM_ACTIONS
-                    "behaviour_policy_logits": tf.placeholder(
+                    "behaviour_policy_logits": tf1.placeholder(
                        dtype=tf.float32, shape=[None, None, None]),
                    # T, B, NUM_ACTIONS
-                    "target_policy_logits": tf.placeholder(
+                    "target_policy_logits": tf1.placeholder(
                        dtype=tf.float32, shape=[None, None, None]),
-                    "actions": tf.placeholder(
+                    "actions": tf1.placeholder(
                        dtype=tf.int32, shape=[None, None]),
-                    "discounts": tf.placeholder(
+                    "discounts": tf1.placeholder(
                        dtype=tf.float32, shape=[None, None]),
-                    "rewards": tf.placeholder(
+                    "rewards": tf1.placeholder(
                        dtype=tf.float32, shape=[None, None]),
-                    "values": tf.placeholder(
+                    "values": tf1.placeholder(
                        dtype=tf.float32, shape=[None, None]),
-                    "bootstrap_value": tf.placeholder(
+                    "bootstrap_value": tf1.placeholder(
                        dtype=tf.float32, shape=[None]),
                }
            else:
@@ -282,15 +282,15 @@ class VtraceTest(unittest.TestCase):
            vtrace = vtrace_tf if fw != "torch" else vtrace_torch
            if fw == "tf":
                inputs_ = {
-                    "log_rhos": tf.placeholder(
+                    "log_rhos": tf1.placeholder(
                        dtype=tf.float32, shape=[None, None, 1]),
-                    "discounts": tf.placeholder(
+                    "discounts": tf1.placeholder(
                        dtype=tf.float32, shape=[None, None, 1]),
-                    "rewards": tf.placeholder(
+                    "rewards": tf1.placeholder(
                        dtype=tf.float32, shape=[None, None, 42]),
-                    "values": tf.placeholder(
+                    "values": tf1.placeholder(
                        dtype=tf.float32, shape=[None, None, 42]),
-                    "bootstrap_value": tf.placeholder(
+                    "bootstrap_value": tf1.placeholder(
                        dtype=tf.float32, shape=[None, 42])
                }
            else:
@@ -310,16 +310,16 @@ class VtraceTest(unittest.TestCase):
            vtrace = vtrace_tf if fw != "torch" else vtrace_torch
            if fw == "tf":
                inputs_ = {
-                    "log_rhos": tf.placeholder(
+                    "log_rhos": tf1.placeholder(
                        dtype=tf.float32, shape=[None, None, 1]),
-                    "discounts": tf.placeholder(
+                    "discounts": tf1.placeholder(
                        dtype=tf.float32, shape=[None, None, 1]),
-                    "rewards": tf.placeholder(
+                    "rewards": tf1.placeholder(
                        dtype=tf.float32, shape=[None, None, 42]),
-                    "values": tf.placeholder(
+                    "values": tf1.placeholder(
                        dtype=tf.float32, shape=[None, None, 42]),
                    # Should be [None, 42].
-                    "bootstrap_value": tf.placeholder(
+                    "bootstrap_value": tf1.placeholder(
                        dtype=tf.float32, shape=[None])
                }
            else:
@@ -33,7 +33,7 @@ import collections
 from ray.rllib.models.tf.tf_action_dist import Categorical
 from ray.rllib.utils.framework import try_import_tf

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()

 VTraceFromLogitsReturns = collections.namedtuple("VTraceFromLogitsReturns", [
    "vs", "pg_advantages", "log_rhos", "behaviour_action_log_probs",
@@ -222,7 +222,7 @@ def multi_from_logits(behaviour_policy_logits,
        behaviour_policy_logits[i].shape.assert_has_rank(3)
        target_policy_logits[i].shape.assert_has_rank(3)

-    with tf.name_scope(
+    with tf1.name_scope(
            name,
            values=[
                behaviour_policy_logits, target_policy_logits, actions,
@@ -332,21 +332,22 @@ def from_importance_weights(log_rhos,
    if clip_pg_rho_threshold is not None:
        clip_pg_rho_threshold.shape.assert_has_rank(0)

-    with tf.name_scope(
+    with tf1.name_scope(
            name,
            values=[log_rhos, discounts, rewards, values, bootstrap_value]):
-        rhos = tf.exp(log_rhos)
+        rhos = tf.math.exp(log_rhos)
        if clip_rho_threshold is not None:
            clipped_rhos = tf.minimum(
                clip_rho_threshold, rhos, name="clipped_rhos")

-            tf.summary.histogram("clipped_rhos_1000", tf.minimum(1000.0, rhos))
-            tf.summary.scalar(
+            tf1.summary.histogram(
+                    "clipped_rhos_1000", tf.minimum(1000.0, rhos))
+            tf1.summary.scalar(
                "num_of_clipped_rhos",
                tf.reduce_sum(
                    tf.cast(
                        tf.equal(clipped_rhos, clip_rho_threshold), tf.int32)))
-            tf.summary.scalar("size_of_clipped_rhos", tf.size(clipped_rhos))
+            tf1.summary.scalar("size_of_clipped_rhos", tf.size(clipped_rhos))
        else:
            clipped_rhos = rhos

@@ -16,7 +16,7 @@ from ray.rllib.policy.tf_policy import LearningRateSchedule, \
 from ray.rllib.utils.framework import try_import_tf
 from ray.rllib.utils.tf_ops import explained_variance

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()

 logger = logging.getLogger(__name__)

@@ -253,10 +253,11 @@ def postprocess_trajectory(policy,

 def choose_optimizer(policy, config):
    if policy.config["opt_type"] == "adam":
-        return tf.train.AdamOptimizer(policy.cur_lr)
+        return tf1.train.AdamOptimizer(policy.cur_lr)
    else:
-        return tf.train.RMSPropOptimizer(policy.cur_lr, config["decay"],
-                                         config["momentum"], config["epsilon"])
+        return tf1.train.RMSPropOptimizer(
+            policy.cur_lr,
+            config["decay"], config["momentum"], config["epsilon"])


 def clip_gradients(policy, optimizer, loss):
@@ -9,7 +9,7 @@ from ray.rllib.agents.ppo.ppo_tf_policy import postprocess_ppo_gae, \
    vf_preds_fetches, clip_gradients, setup_config, ValueNetworkMixin
 from ray.rllib.utils.framework import get_activation_fn

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()

 logger = logging.getLogger(__name__)

@@ -33,7 +33,7 @@ def PPOLoss(dist_class,
        pi_new_logp = curr_dist.logp(actions)
        pi_old_logp = prev_dist.logp(actions)

-        logp_ratio = tf.exp(pi_new_logp - pi_old_logp)
+        logp_ratio = tf.math.exp(pi_new_logp - pi_old_logp)
        if clip_loss:
            return tf.minimum(
                advantages * logp_ratio,
@@ -49,10 +49,10 @@ def PPOLoss(dist_class,

    def vf_loss(value_fn, value_targets, vf_preds, vf_clip_param=0.1):
        # GAE Value Function Loss
-        vf_loss1 = tf.square(value_fn - value_targets)
+        vf_loss1 = tf.math.square(value_fn - value_targets)
        vf_clipped = vf_preds + tf.clip_by_value(value_fn - vf_preds,
                                                 -vf_clip_param, vf_clip_param)
-        vf_loss2 = tf.square(vf_clipped - value_targets)
+        vf_loss2 = tf.math.square(vf_clipped - value_targets)
        vf_loss = tf.maximum(vf_loss1, vf_loss2)
        return vf_loss

@@ -104,7 +104,7 @@ class WorkerLoss(object):
            vf_clip_param=vf_clip_param,
            vf_loss_coeff=vf_loss_coeff,
            clip_loss=clip_loss)
-        self.loss = tf.Print(self.loss, ["Worker Adapt Loss", self.loss])
+        self.loss = tf1.Print(self.loss, ["Worker Adapt Loss", self.loss])


 # This is the Meta-Update computation graph for main (meta-update step)
@@ -230,7 +230,7 @@ class MAMLLoss(object):
            tf.multiply(self.cur_kl_coeff, mean_inner_kl))
        self.loss = tf.reduce_mean(tf.stack(ppo_obj,
                                            axis=0)) + self.inner_kl_loss
-        self.loss = tf.Print(
+        self.loss = tf1.Print(
            self.loss,
            ["Meta-Loss", self.loss, "Inner KL", self.mean_inner_kl])

@@ -309,7 +309,7 @@ class MAMLLoss(object):
 def maml_loss(policy, model, dist_class, train_batch):
    logits, state = model.from_batch(train_batch)

-    policy._loss_input_dict["split"] = tf.placeholder(
+    policy._loss_input_dict["split"] = tf1.placeholder(
        tf.int32,
        name="Meta-Update-Splitting",
        shape=(policy.config["inner_adaptation_steps"] + 1,
@@ -333,8 +333,8 @@ def maml_loss(policy, model, dist_class, train_batch):
            vf_loss_coeff=policy.config["vf_loss_coeff"],
            clip_loss=False)
    else:
-        policy.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
-                                            tf.get_variable_scope().name)
+        policy.var_list = tf1.get_collection(tf1.GraphKeys.TRAINABLE_VARIABLES,
+                                             tf1.get_variable_scope().name)
        policy.loss_obj = MAMLLoss(
            model=model,
            dist_class=dist_class,
@@ -380,8 +380,8 @@ class KLCoeffMixin:
        self.kl_coeff_val = [config["kl_coeff"]
                             ] * config["inner_adaptation_steps"]
        self.kl_target = self.config["kl_target"]
-        self.kl_coeff = tf.get_variable(
-            initializer=tf.constant_initializer(self.kl_coeff_val),
+        self.kl_coeff = tf1.get_variable(
+            initializer=tf.keras.initializers.Constant(self.kl_coeff_val),
            name="kl_coeff",
            shape=(config["inner_adaptation_steps"]),
            trainable=False,
@@ -404,8 +404,8 @@ def maml_optimizer_fn(policy, config):
    Meta-Policy uses Adam optimizer for meta-update
    """
    if not config["worker_index"]:
-        return tf.train.AdamOptimizer(learning_rate=config["lr"])
-    return tf.train.GradientDescentOptimizer(learning_rate=config["inner_lr"])
+        return tf1.train.AdamOptimizer(learning_rate=config["lr"])
+    return tf1.train.GradientDescentOptimizer(learning_rate=config["inner_lr"])


 def setup_mixins(policy, obs_space, action_space, config):
@@ -6,7 +6,7 @@ from ray.rllib.policy.tf_policy_template import build_tf_policy
 from ray.rllib.utils.framework import try_import_tf
 from ray.rllib.utils.tf_ops import explained_variance, make_tf_callable

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()


 class ValueNetworkMixin:
@@ -37,13 +37,13 @@ class ReweightedImitationLoss:
        # advantage estimation
        adv = cumulative_rewards - state_values
        # update averaged advantage norm
-        update_adv_norm = tf.assign_add(
+        update_adv_norm = tf1.assign_add(
            ref=policy._ma_adv_norm,
            value=1e-6 * (
                    tf.reduce_mean(tf.math.square(adv)) - policy._ma_adv_norm))

        # exponentially weighted advantages
-        with tf.control_dependencies([update_adv_norm]):
+        with tf1.control_dependencies([update_adv_norm]):
            exp_advs = tf.math.exp(beta * tf.math.divide(
                adv, 1e-8 + tf.math.sqrt(policy._ma_adv_norm)))

@@ -125,7 +125,7 @@ def setup_mixins(policy, obs_space, action_space, config):
    ValueNetworkMixin.__init__(policy)
    # Set up a tf-var for the moving avg (do this here to make it work with
    # eager mode).
-    policy._ma_adv_norm = tf.get_variable(
+    policy._ma_adv_norm = tf1.get_variable(
        name="moving_average_of_advantage_norm",
        dtype=tf.float32,
        initializer=100.0,
@@ -6,7 +6,7 @@ from ray.rllib.utils.framework import try_import_tf
 from ray.rllib.utils.test_utils import check_compute_single_action, \
    framework_iterator

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()


 class TestMARWIL(unittest.TestCase):
@@ -5,7 +5,7 @@ from ray.rllib.policy.tf_policy_template import build_tf_policy
 from ray.rllib.policy.sample_batch import SampleBatch
 from ray.rllib.utils.framework import try_import_tf

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()


 def post_process_advantages(policy,
@@ -21,7 +21,7 @@ from ray.rllib.utils.annotations import override
 from ray.rllib.utils.framework import try_import_tf
 from ray.rllib.utils.tf_ops import explained_variance, make_tf_callable

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()

 POLICY_SCOPE = "func"
 TARGET_POLICY_SCOPE = "target_func"
@@ -65,7 +65,7 @@ class PPOSurrogateLoss:
        def reduce_mean_valid(t):
            return tf.reduce_mean(tf.boolean_mask(t, valid_mask))

-        logp_ratio = tf.exp(actions_logp - prev_actions_logp)
+        logp_ratio = tf.math.exp(actions_logp - prev_actions_logp)

        surrogate_loss = tf.minimum(
            advantages * logp_ratio,
@@ -170,7 +170,7 @@ class VTraceSurrogateLoss:
                                              tf.float32))

        self.is_ratio = tf.clip_by_value(
-            tf.exp(prev_actions_logp - old_policy_actions_logp), 0.0, 2.0)
+            tf.math.exp(prev_actions_logp - old_policy_actions_logp), 0.0, 2.0)
        logp_ratio = self.is_ratio * tf.exp(actions_logp - prev_actions_logp)

        advantages = self.vtrace_returns.pg_advantages
@@ -7,9 +7,6 @@ from ray.rllib.execution.rollout_ops import ParallelRollouts, ConcatBatches, \
    StandardizeFields, SelectExperiences
 from ray.rllib.execution.train_ops import TrainOneStep, TrainTFMultiGPU
 from ray.rllib.execution.metric_ops import StandardMetricsReporting
-from ray.rllib.utils.framework import try_import_tf
-
-tf = try_import_tf()

 logger = logging.getLogger(__name__)

@@ -10,7 +10,7 @@ from ray.rllib.policy.tf_policy_template import build_tf_policy
 from ray.rllib.utils.framework import try_import_tf
 from ray.rllib.utils.tf_ops import explained_variance, make_tf_callable

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()

 logger = logging.getLogger(__name__)

@@ -174,7 +174,7 @@ def postprocess_ppo_gae(policy,
    else:
        next_state = []
        for i in range(policy.num_state_tensors()):
-            next_state.append([sample_batch["state_out_{}".format(i)][-1]])
+            next_state.append(sample_batch["state_out_{}".format(i)][-1])
        last_r = policy._value(sample_batch[SampleBatch.NEXT_OBS][-1],
                               sample_batch[SampleBatch.ACTIONS][-1],
                               sample_batch[SampleBatch.REWARDS][-1],
@@ -206,7 +206,7 @@ class KLCoeffMixin:
        # KL Coefficient
        self.kl_coeff_val = config["kl_coeff"]
        self.kl_target = config["kl_target"]
-        self.kl_coeff = tf.get_variable(
+        self.kl_coeff = tf1.get_variable(
            initializer=tf.constant_initializer(self.kl_coeff_val),
            name="kl_coeff",
            shape=(),
@@ -194,7 +194,7 @@ class ValueNetworkMixin:
                    SampleBatch.PREV_REWARDS: convert_to_torch_tensor(
                        np.asarray([prev_reward])),
                    "is_training": False,
-                }, [convert_to_torch_tensor(np.asarray(s)) for s in state],
+                }, [convert_to_torch_tensor(np.asarray([s])) for s in state],
                    convert_to_torch_tensor(np.asarray([1])))
                return self.model.value_function()[0]

@@ -2,12 +2,9 @@ import unittest

 import ray
 import ray.rllib.agents.ppo as ppo
-from ray.rllib.utils.framework import try_import_tf
 from ray.rllib.utils.test_utils import check_compute_single_action, \
    framework_iterator

-tf = try_import_tf()
-

 class TestAPPO(unittest.TestCase):
    @classmethod
@@ -2,12 +2,9 @@ import unittest

 import ray
 import ray.rllib.agents.ppo as ppo
-from ray.rllib.utils.framework import try_import_tf
 from ray.rllib.utils.test_utils import check_compute_single_action, \
    framework_iterator

-tf = try_import_tf()
-

 class TestDDPPO(unittest.TestCase):
    @classmethod
@@ -13,12 +13,10 @@ from ray.rllib.models.tf.tf_action_dist import Categorical
 from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
 from ray.rllib.models.torch.torch_action_dist import TorchCategorical
 from ray.rllib.policy.sample_batch import SampleBatch
-from ray.rllib.utils.framework import try_import_tf
 from ray.rllib.utils.numpy import fc
 from ray.rllib.utils.test_utils import check, framework_iterator, \
    check_compute_single_action

-tf = try_import_tf()

 # Fake CartPole episode of n time steps.
 FAKE_BATCH = {
@@ -40,7 +38,7 @@ FAKE_BATCH = {
 class TestPPO(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
-        ray.init()
+        ray.init(local_mode=True)

    @classmethod
    def tearDownClass(cls):
@@ -4,7 +4,7 @@ import numpy as np
 from ray.rllib.models.tf.tf_modelv2 import TFModelV2
 from ray.rllib.utils.framework import try_import_tf

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()


 class SACTFModel(TFModelV2):
@@ -17,7 +17,7 @@ from ray.rllib.utils.error import UnsupportedSpaceException
 from ray.rllib.utils.framework import try_import_tf, try_import_tfp
 from ray.rllib.utils.tf_ops import minimize_and_clip

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()
 tfp = try_import_tfp()

 logger = logging.getLogger(__name__)
@@ -138,10 +138,10 @@ def sac_actor_critic_loss(policy, model, _, train_batch):
    if model.discrete:
        # Get all action probs directly from pi and form their logp.
        log_pis_t = tf.nn.log_softmax(model.get_policy_output(model_out_t), -1)
-        policy_t = tf.exp(log_pis_t)
+        policy_t = tf.math.exp(log_pis_t)
        log_pis_tp1 = tf.nn.log_softmax(
            model.get_policy_output(model_out_tp1), -1)
-        policy_tp1 = tf.exp(log_pis_tp1)
+        policy_tp1 = tf.math.exp(log_pis_tp1)
        # Q-values.
        q_t = model.get_q_values(model_out_t)
        # Target Q-values.
@@ -219,20 +219,20 @@ def sac_actor_critic_loss(policy, model, _, train_batch):
        policy.config["gamma"]**policy.config["n_step"] * q_tp1_best_masked)

    # Compute the TD-error (potentially clipped).
-    base_td_error = tf.abs(q_t_selected - q_t_selected_target)
+    base_td_error = tf.math.abs(q_t_selected - q_t_selected_target)
    if policy.config["twin_q"]:
-        twin_td_error = tf.abs(twin_q_t_selected - q_t_selected_target)
+        twin_td_error = tf.math.abs(twin_q_t_selected - q_t_selected_target)
        td_error = 0.5 * (base_td_error + twin_td_error)
    else:
        td_error = base_td_error

    critic_loss = [
-        tf.losses.mean_squared_error(
+        tf1.losses.mean_squared_error(
            labels=q_t_selected_target, predictions=q_t_selected, weights=0.5)
    ]
    if policy.config["twin_q"]:
        critic_loss.append(
-            tf.losses.mean_squared_error(
+            tf1.losses.mean_squared_error(
                labels=q_t_selected_target,
                predictions=twin_q_t_selected,
                weights=0.5))
@@ -274,7 +274,7 @@ def sac_actor_critic_loss(policy, model, _, train_batch):

    # in a custom apply op we handle the losses separately, but return them
    # combined in one loss for now
-    return actor_loss + tf.add_n(critic_loss) + alpha_loss
+    return actor_loss + tf.math.add_n(critic_loss) + alpha_loss


 def gradients(policy, optimizer, loss):
@@ -358,7 +358,7 @@ def apply_gradients(policy, optimizer, grads_and_vars):

    alpha_apply_ops = policy._alpha_optimizer.apply_gradients(
        policy._alpha_grads_and_vars,
-        global_step=tf.train.get_or_create_global_step())
+        global_step=tf1.train.get_or_create_global_step())
    return tf.group([actor_apply_ops, alpha_apply_ops] + critic_apply_ops)


@@ -381,20 +381,20 @@ def stats(policy, train_batch):
 class ActorCriticOptimizerMixin:
    def __init__(self, config):
        # create global step for counting the number of update operations
-        self.global_step = tf.train.get_or_create_global_step()
+        self.global_step = tf1.train.get_or_create_global_step()

        # use separate optimizers for actor & critic
-        self._actor_optimizer = tf.train.AdamOptimizer(
+        self._actor_optimizer = tf1.train.AdamOptimizer(
            learning_rate=config["optimization"]["actor_learning_rate"])
        self._critic_optimizer = [
-            tf.train.AdamOptimizer(
+            tf1.train.AdamOptimizer(
                learning_rate=config["optimization"]["critic_learning_rate"])
        ]
        if config["twin_q"]:
            self._critic_optimizer.append(
-                tf.train.AdamOptimizer(learning_rate=config["optimization"][
+                tf1.train.AdamOptimizer(learning_rate=config["optimization"][
                    "critic_learning_rate"]))
-        self._alpha_optimizer = tf.train.AdamOptimizer(
+        self._alpha_optimizer = tf1.train.AdamOptimizer(
            learning_rate=config["optimization"]["entropy_learning_rate"])


@@ -11,13 +11,12 @@ from ray.rllib.models.tf.tf_action_dist import SquashedGaussian
 from ray.rllib.models.torch.torch_action_dist import TorchSquashedGaussian
 from ray.rllib.execution.replay_buffer import LocalReplayBuffer
 from ray.rllib.policy.sample_batch import SampleBatch
-from ray.rllib.utils.framework import try_import_tf, try_import_torch
+from ray.rllib.utils.framework import try_import_torch
 from ray.rllib.utils.numpy import fc, relu
 from ray.rllib.utils.test_utils import check, check_compute_single_action, \
    framework_iterator
 from ray.rllib.utils.torch_ops import convert_to_torch_tensor

-tf = try_import_tf()
 torch, _ = try_import_torch()


@@ -35,7 +35,7 @@ from ray.tune.resources import Resources
 from ray.tune.logger import Logger, UnifiedLogger
 from ray.tune.result import DEFAULT_RESULTS_DIR

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()

 logger = logging.getLogger(__name__)

@@ -595,12 +595,12 @@ class Trainer(Trainable):
            self.config.pop("eager")

        # Enable eager/tracing support.
-        if tf and self.config["framework"] == "tfe":
-            if not tf.executing_eagerly():
-                tf.enable_eager_execution()
+        if tf1 and self.config["framework"] == "tfe":
+            if not tf1.executing_eagerly():
+                tf1.enable_eager_execution()
            logger.info("Executing eagerly, with eager_tracing={}".format(
                self.config["eager_tracing"]))
-        if tf and not tf.executing_eagerly() and \
+        if tf1 and not tf1.executing_eagerly() and \
                self.config["framework"] != "torch":
            logger.info("Tip: set framework=tfe or the --eager flag to enable "
                        "TensorFlow eager execution")
@@ -634,8 +634,8 @@ class Trainer(Trainable):
            logging.getLogger("ray.rllib").setLevel(self.config["log_level"])

        def get_scope():
-            if tf and not tf.executing_eagerly():
-                return tf.Graph().as_default()
+            if tf1 and not tf1.executing_eagerly():
+                return tf1.Graph().as_default()
            else:
                return open(os.devnull)  # fake a no-op scope

@@ -12,14 +12,13 @@ from ray.rllib.execution.metric_ops import StandardMetricsReporting
 from ray.rllib.models.catalog import ModelCatalog
 from ray.rllib.models.model import restore_original_dimensions
 from ray.rllib.models.torch.torch_action_dist import TorchCategorical
-from ray.rllib.utils.framework import try_import_tf, try_import_torch
+from ray.rllib.utils.framework import try_import_torch
 from ray.tune.registry import ENV_CREATOR, _global_registry

 from ray.rllib.contrib.alpha_zero.core.alpha_zero_policy import AlphaZeroPolicy
 from ray.rllib.contrib.alpha_zero.core.mcts import MCTS
 from ray.rllib.contrib.alpha_zero.core.ranked_rewards import get_r2_env_wrapper

-tf = try_import_tf()
 torch, nn = try_import_torch()

 logger = logging.getLogger(__name__)
@@ -15,7 +15,7 @@ import numpy as np

 logger = logging.getLogger(__name__)

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()
 tfp = try_import_tfp()


@@ -49,7 +49,7 @@ class MADDPGTFPolicy(MADDPGPostprocessing, TFPolicy):
        # _____ Initial Configuration
        config = dict(ray.rllib.contrib.maddpg.DEFAULT_CONFIG, **config)
        self.config = config
-        self.global_step = tf.train.get_or_create_global_step()
+        self.global_step = tf1.train.get_or_create_global_step()

        # FIXME: Get done from info is required since agentwise done is not
        # supported now.
@@ -88,7 +88,7 @@ class MADDPGTFPolicy(MADDPGPostprocessing, TFPolicy):
        # Placeholders for policy evaluation and updates
        def _make_ph_n(space_n, name=""):
            return [
-                tf.placeholder(
+                tf1.placeholder(
                    tf.float32,
                    shape=(None, ) + space.shape,
                    name=name + "_%d" % i) for i, space in enumerate(space_n)
@@ -98,9 +98,9 @@ class MADDPGTFPolicy(MADDPGPostprocessing, TFPolicy):
        act_ph_n = _make_ph_n(act_space_n, "actions")
        new_obs_ph_n = _make_ph_n(obs_space_n, "new_obs")
        new_act_ph_n = _make_ph_n(act_space_n, "new_actions")
-        rew_ph = tf.placeholder(
+        rew_ph = tf1.placeholder(
            tf.float32, shape=None, name="rewards_{}".format(agent_id))
-        done_ph = tf.placeholder(
+        done_ph = tf1.placeholder(
            tf.float32, shape=None, name="dones_{}".format(agent_id))

        if config["use_local_critic"]:
@@ -190,12 +190,12 @@ class MADDPGTFPolicy(MADDPGPostprocessing, TFPolicy):

        # _____ Optimizers
        self.optimizers = {
-            "critic": tf.train.AdamOptimizer(config["critic_lr"]),
-            "actor": tf.train.AdamOptimizer(config["actor_lr"])
+            "critic": tf1.train.AdamOptimizer(config["critic_lr"]),
+            "actor": tf1.train.AdamOptimizer(config["actor_lr"])
        }

        # _____ Build variable update ops.
-        self.tau = tf.placeholder_with_default(
+        self.tau = tf1.placeholder_with_default(
            config["tau"], shape=(), name="tau")

        def _make_target_update_op(vs, target_vs, tau):
@@ -213,7 +213,7 @@ class MADDPGTFPolicy(MADDPGPostprocessing, TFPolicy):
            for v in variables.values():
                vs += v
            phs = [
-                tf.placeholder(
+                tf1.placeholder(
                    tf.float32,
                    shape=v.get_shape(),
                    name=v.name.split(":")[0] + "_ph") for v in vs
@@ -230,7 +230,7 @@ class MADDPGTFPolicy(MADDPGPostprocessing, TFPolicy):

        # _____ TensorFlow Initialization

-        self.sess = tf.get_default_session()
+        self.sess = tf1.get_default_session()

        def _make_loss_inputs(placeholders):
            return [(ph.name.split("/")[-1].split(":")[0], ph)
@@ -251,7 +251,7 @@ class MADDPGTFPolicy(MADDPGPostprocessing, TFPolicy):
            loss_inputs=loss_inputs,
            dist_inputs=actor_feature)

-        self.sess.run(tf.global_variables_initializer())
+        self.sess.run(tf1.global_variables_initializer())

        # Hard initial update
        self.update_target(1.0)
@@ -280,8 +280,8 @@ class MADDPGTFPolicy(MADDPGPostprocessing, TFPolicy):
        critic_apply_op = self.optimizers["critic"].apply_gradients(
            self.gvs["critic"])

-        with tf.control_dependencies([tf.assign_add(self.global_step, 1)]):
-            with tf.control_dependencies([critic_apply_op]):
+        with tf1.control_dependencies([tf1.assign_add(self.global_step, 1)]):
+            with tf1.control_dependencies([critic_apply_op]):
                actor_apply_op = self.optimizers["actor"].apply_gradients(
                    self.gvs["actor"])

@@ -324,7 +324,7 @@ class MADDPGTFPolicy(MADDPGPostprocessing, TFPolicy):
                              hiddens,
                              activation=None,
                              scope=None):
-        with tf.variable_scope(scope, reuse=tf.AUTO_REUSE) as scope:
+        with tf1.variable_scope(scope, reuse=tf1.AUTO_REUSE) as scope:
            if use_state_preprocessor:
                model_n = [
                    ModelCatalog.get_model({
@@ -341,11 +341,12 @@ class MADDPGTFPolicy(MADDPGPostprocessing, TFPolicy):
                out = tf.concat(obs_n + act_n, axis=1)

            for hidden in hiddens:
-                out = tf.layers.dense(out, units=hidden, activation=activation)
+                out = tf1.layers.dense(
+                    out, units=hidden, activation=activation)
            feature = out
-            out = tf.layers.dense(feature, units=1, activation=None)
+            out = tf1.layers.dense(feature, units=1, activation=None)

-        return out, feature, model_n, tf.global_variables(scope.name)
+        return out, feature, model_n, tf1.global_variables(scope.name)

    def _build_actor_network(self,
                             obs,
@@ -355,7 +356,7 @@ class MADDPGTFPolicy(MADDPGPostprocessing, TFPolicy):
                             hiddens,
                             activation=None,
                             scope=None):
-        with tf.variable_scope(scope, reuse=tf.AUTO_REUSE) as scope:
+        with tf1.variable_scope(scope, reuse=tf1.AUTO_REUSE) as scope:
            if use_state_preprocessor:
                model = ModelCatalog.get_model({
                    "obs": obs,
@@ -367,13 +368,14 @@ class MADDPGTFPolicy(MADDPGPostprocessing, TFPolicy):
                out = obs

            for hidden in hiddens:
-                out = tf.layers.dense(out, units=hidden, activation=activation)
-            feature = tf.layers.dense(
+                out = tf1.layers.dense(
+                    out, units=hidden, activation=activation)
+            feature = tf1.layers.dense(
                out, units=act_space.shape[0], activation=None)
            sampler = tfp.distributions.RelaxedOneHotCategorical(
                temperature=1.0, logits=feature).sample()

-        return sampler, feature, model, tf.global_variables(scope.name)
+        return sampler, feature, model, tf1.global_variables(scope.name)

    def update_target(self, tau=None):
        if tau is not None:
@@ -50,7 +50,7 @@ if TYPE_CHECKING:
 # Generic type var for foreach_* methods.
 T = TypeVar("T")

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()
 torch, _ = try_import_torch()

 logger = logging.getLogger(__name__)
@@ -283,12 +283,12 @@ class RolloutWorker(ParallelIteratorWorker):
        ParallelIteratorWorker.__init__(self, gen_rollouts, False)

        policy_config: TrainerConfigDict = policy_config or {}
-        if (tf and policy_config.get("framework") == "tfe"
+        if (tf1 and policy_config.get("framework") == "tfe"
                and not policy_config.get("no_eager_on_workers")
                # This eager check is necessary for certain all-framework tests
                # that use tf's eager_mode() context generator.
-                and not tf.executing_eagerly()):
-            tf.enable_eager_execution()
+                and not tf1.executing_eagerly()):
+            tf1.enable_eager_execution()

        if log_level:
            logging.getLogger("ray.rllib").setLevel(log_level)
@@ -382,21 +382,21 @@ class RolloutWorker(ParallelIteratorWorker):
                torch.manual_seed(seed)
            except AssertionError:
                logger.info("Could not seed torch")
-        if _has_tensorflow_graph(policy_dict) and not (tf and
-                                                       tf.executing_eagerly()):
-            if not tf:
+        if _has_tensorflow_graph(policy_dict) and not (
+                tf1 and tf1.executing_eagerly()):
+            if not tf1:
                raise ImportError("Could not import tensorflow")
-            with tf.Graph().as_default():
+            with tf1.Graph().as_default():
                if tf_session_creator:
                    self.tf_sess = tf_session_creator()
                else:
-                    self.tf_sess = tf.Session(
-                        config=tf.ConfigProto(
-                            gpu_options=tf.GPUOptions(allow_growth=True)))
+                    self.tf_sess = tf1.Session(
+                        config=tf1.ConfigProto(
+                            gpu_options=tf1.GPUOptions(allow_growth=True)))
                with self.tf_sess.as_default():
                    # set graph-level seed
                    if seed is not None:
-                        tf.set_random_seed(seed)
+                        tf1.set_random_seed(seed)
                    self.policy_map, self.preprocessors = \
                        self._build_policy_map(policy_dict, policy_config)
            if (ray.is_initialized()
@@ -406,7 +406,7 @@ class RolloutWorker(ParallelIteratorWorker):
                        "Creating policy evaluation worker {}".format(
                            worker_index) +
                        " on CPU (please ignore any CUDA init errors)")
-                elif not tf.test.is_gpu_available():
+                elif not tf1.test.is_gpu_available():
                    raise RuntimeError(
                        "GPUs were assigned to this worker by Ray, but "
                        "TensorFlow reports GPU acceleration is disabled. "
@@ -956,7 +956,7 @@ class RolloutWorker(ParallelIteratorWorker):
                    "Found raw Tuple|Dict space as input to policy. "
                    "Please preprocess these observations with a "
                    "Tuple|DictFlatteningPreprocessor.")
-            if tf and tf.executing_eagerly():
+            if tf1 and tf1.executing_eagerly():
                if hasattr(cls, "as_eager"):
                    cls = cls.as_eager()
                    if policy_config["eager_tracing"]:
@@ -966,8 +966,8 @@ class RolloutWorker(ParallelIteratorWorker):
                else:
                    raise ValueError("This policy does not support eager "
                                     "execution: {}".format(cls))
-            if tf:
-                with tf.variable_scope(name):
+            if tf1:
+                with tf1.variable_scope(name):
                    policy_map[name] = cls(obs_space, act_space, merged_conf)
            else:
                policy_map[name] = cls(obs_space, act_space, merged_conf)
@@ -14,7 +14,7 @@ from ray.rllib.utils import merge_dicts
 from ray.rllib.utils.framework import try_import_tf
 from ray.rllib.utils.types import PolicyID, TrainerConfigDict, EnvType

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()

 logger = logging.getLogger(__name__)

@@ -202,8 +202,8 @@ class WorkerSet:
        def session_creator():
            logger.debug("Creating TF session {}".format(
                config["tf_session_args"]))
-            return tf.Session(
-                config=tf.ConfigProto(**config["tf_session_args"]))
+            return tf1.Session(
+                config=tf1.ConfigProto(**config["tf_session_args"]))

        if isinstance(config["input"], FunctionType):
            input_creator = config["input"]
@@ -11,7 +11,7 @@ from ray.rllib.utils.framework import try_import_tf
 from ray.rllib.utils.test_utils import check_learning_achieved
 from ray.tune import registry

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()

 parser = argparse.ArgumentParser()
 parser.add_argument("--run", type=str, default="PPO")
@@ -4,7 +4,7 @@ import numpy as np
 from rllib.models.tf.attention_net import TrXLNet
 from ray.rllib.utils.framework import try_import_tf

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()


 def bit_shift_generator(seq_length, shift, batch_size):
@@ -10,7 +10,7 @@ from ray.rllib.models import ModelCatalog
 from ray.rllib.utils.framework import try_import_tf
 from ray.rllib.utils.test_utils import check_learning_achieved

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()

 parser = argparse.ArgumentParser()
 parser.add_argument("--run", type=str, default="PPO")
@@ -39,7 +39,7 @@ from ray.rllib.utils.test_utils import check_learning_achieved
 from ray.rllib.utils.tf_ops import explained_variance, make_tf_callable
 from ray.rllib.utils.torch_ops import convert_to_torch_tensor

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()
 torch, nn = try_import_torch()

 OPPONENT_OBS = "opponent_obs"
@@ -23,7 +23,7 @@ from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC
 from ray.rllib.utils.framework import try_import_tf, try_import_torch
 from ray.rllib.utils.test_utils import check_learning_achieved

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()
 torch, nn = try_import_torch()

 parser = argparse.ArgumentParser()
@@ -12,7 +12,7 @@ from ray.rllib.models.tf.tf_modelv2 import TFModelV2
 from ray.rllib.models.tf.visionnet import VisionNetwork as MyVisionNetwork
 from ray.rllib.utils.framework import try_import_tf

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()

 parser = argparse.ArgumentParser()
 parser.add_argument("--run", type=str, default="DQN")  # Try PG, PPO, DQN
@@ -21,7 +21,7 @@ from ray.rllib.examples.models.custom_loss_model import CustomLossModel, \
 from ray.rllib.models import ModelCatalog
 from ray.rllib.utils.framework import try_import_tf

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()

 parser = argparse.ArgumentParser()
 parser.add_argument("--torch", action="store_true")
@@ -7,7 +7,7 @@ from ray.rllib.evaluation.postprocessing import discount
 from ray.rllib.policy.tf_policy_template import build_tf_policy
 from ray.rllib.utils.framework import try_import_tf

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()

 parser = argparse.ArgumentParser()
 parser.add_argument("--stop-iters", type=int, default=200)
@@ -11,7 +11,7 @@ from ray.rllib.policy.tf_policy_template import build_tf_policy
 from ray.rllib.utils.framework import try_import_tf
 from ray.rllib.utils.test_utils import check_learning_achieved

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()

 parser = argparse.ArgumentParser()
 parser.add_argument("--stop-iters", type=int, default=200)
@@ -6,7 +6,7 @@ import ray
 from ray.rllib.agents.registry import get_agent_class
 from ray.rllib.utils.framework import try_import_tf

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()

 ray.init(num_cpus=10)

@@ -25,14 +25,14 @@ def train_and_export(algo_name, num_steps, model_dir, ckpt_dir, prefix):

 def restore_saved_model(export_dir):
    signature_key = \
-        tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
-    g = tf.Graph()
+        tf1.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
+    g = tf1.Graph()
    with g.as_default():
-        with tf.Session(graph=g) as sess:
+        with tf1.Session(graph=g) as sess:
            meta_graph_def = \
-                tf.saved_model.load(sess,
-                                    [tf.saved_model.tag_constants.SERVING],
-                                    export_dir)
+                tf1.saved_model.load(sess,
+                                     [tf1.saved_model.tag_constants.SERVING],
+                                     export_dir)
            print("Model restored!")
            print("Signature Def Information:")
            print(meta_graph_def.signature_def[signature_key])
@@ -41,13 +41,13 @@ def restore_saved_model(export_dir):


 def restore_checkpoint(export_dir, prefix):
-    sess = tf.Session()
+    sess = tf1.Session()
    meta_file = "%s.meta" % prefix
-    saver = tf.train.import_meta_graph(os.path.join(export_dir, meta_file))
+    saver = tf1.train.import_meta_graph(os.path.join(export_dir, meta_file))
    saver.restore(sess, os.path.join(export_dir, prefix))
    print("Checkpoint restored!")
    print("Variables Information:")
-    for v in tf.trainable_variables():
+    for v in tf1.trainable_variables():
        value = sess.run(v)
        print(v.name, value)

@@ -13,7 +13,7 @@ from ray.rllib.examples.models.mobilenet_v2_with_lstm_models import \
 from ray.rllib.models import ModelCatalog
 from ray.rllib.utils.framework import try_import_tf

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()

 cnn_shape = (4, 4, 3)
 # The torch version of MobileNetV2 does channels first.
@@ -3,7 +3,7 @@ from ray.rllib.models.torch.torch_action_dist import TorchCategorical, \
    TorchDistributionWrapper
 from ray.rllib.utils.framework import try_import_tf, try_import_torch

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()
 torch, nn = try_import_torch()


@@ -7,7 +7,7 @@ from ray.rllib.models.torch.misc import SlimFC
 from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
 from ray.rllib.utils.framework import try_import_tf, try_import_torch

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()
 torch, nn = try_import_torch()


@@ -9,7 +9,7 @@ from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
 from ray.rllib.utils.annotations import override
 from ray.rllib.utils.framework import try_import_tf, try_import_torch

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()
 torch, nn = try_import_torch()


@@ -39,27 +39,27 @@ class BatchNormModel(TFModelV2):
    def forward(self, input_dict, state, seq_lens):
        last_layer = input_dict["obs"]
        hiddens = [256, 256]
-        with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
+        with tf1.variable_scope("model", reuse=tf1.AUTO_REUSE):
            for i, size in enumerate(hiddens):
-                last_layer = tf.layers.dense(
+                last_layer = tf1.layers.dense(
                    last_layer,
                    size,
                    kernel_initializer=normc_initializer(1.0),
                    activation=tf.nn.tanh,
                    name="fc{}".format(i))
                # Add a batch norm layer
-                last_layer = tf.layers.batch_normalization(
+                last_layer = tf1.layers.batch_normalization(
                    last_layer,
                    training=input_dict["is_training"],
                    name="bn_{}".format(i))

-            output = tf.layers.dense(
+            output = tf1.layers.dense(
                last_layer,
                self.num_outputs,
                kernel_initializer=normc_initializer(0.01),
                activation=None,
                name="out")
-            self._value_out = tf.layers.dense(
+            self._value_out = tf1.layers.dense(
                last_layer,
                1,
                kernel_initializer=normc_initializer(1.0),
@@ -67,8 +67,8 @@ class BatchNormModel(TFModelV2):
                name="vf")
        if not self._registered:
            self.register_variables(
-                tf.get_collection(
-                    tf.GraphKeys.TRAINABLE_VARIABLES, scope=".+/model/.+"))
+                tf1.get_collection(
+                    tf1.GraphKeys.TRAINABLE_VARIABLES, scope=".+/model/.+"))
            self._registered = True

        return output, []
@@ -9,7 +9,7 @@ from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC
 from ray.rllib.utils.annotations import override
 from ray.rllib.utils.framework import try_import_tf, try_import_torch

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()
 torch, nn = try_import_torch()


@@ -10,7 +10,7 @@ from ray.rllib.utils.annotations import override
 from ray.rllib.utils.framework import try_import_tf, try_import_torch
 from ray.rllib.offline import JsonReader

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()
 torch, nn = try_import_torch()


@@ -73,7 +73,7 @@ class DeprecatedCustomLossModelV1(Model):

    def _build_layers_v2(self, input_dict, num_outputs, options):
        self.obs_in = input_dict["obs"]
-        with tf.variable_scope("shared", reuse=tf.AUTO_REUSE):
+        with tf1.variable_scope("shared", reuse=tf1.AUTO_REUSE):
            self.fcnet = FullyConnectedNetwork(input_dict, self.obs_space,
                                               self.action_space, num_outputs,
                                               options)
@@ -6,7 +6,7 @@ from ray.rllib.models.tf.tf_modelv2 import TFModelV2
 from ray.rllib.utils.annotations import override
 from ray.rllib.utils.framework import try_import_tf

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()


 class EagerModel(TFModelV2):
@@ -34,7 +34,7 @@ class EagerModel(TFModelV2):

        def lambda_(x):
            eager_out = tf.py_function(self.forward_eager, [x], tf.float32)
-            with tf.control_dependencies([eager_out]):
+            with tf1.control_dependencies([eager_out]):
                eager_out.set_shape(x.shape)
                return eager_out

@@ -5,7 +5,7 @@ from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
 from ray.rllib.utils.annotations import override
 from ray.rllib.utils.framework import try_import_tf, try_import_torch

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()
 torch, nn = try_import_torch()


@@ -25,11 +25,11 @@ class FastModel(TFModelV2):

    @override(ModelV2)
    def forward(self, input_dict, state, seq_lens):
-        with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
-            bias = tf.get_variable(
+        with tf1.variable_scope("model", reuse=tf1.AUTO_REUSE):
+            bias = tf1.get_variable(
                dtype=tf.float32,
                name="bias",
-                initializer=tf.zeros_initializer,
+                initializer=tf.keras.initializers.Zeros(),
                shape=())
            output = bias + \
                tf.zeros([tf.shape(input_dict["obs"])[0], self.num_outputs])
@@ -37,8 +37,8 @@ class FastModel(TFModelV2):

        if not self._registered:
            self.register_variables(
-                tf.get_collection(
-                    tf.GraphKeys.TRAINABLE_VARIABLES, scope=".+/model/.+"))
+                tf1.get_collection(
+                    tf1.GraphKeys.TRAINABLE_VARIABLES, scope=".+/model/.+"))
            self._registered = True

        return output, []
@@ -7,7 +7,7 @@ from ray.rllib.models.torch.recurrent_net import RecurrentNetwork as TorchRNN
 from ray.rllib.utils.annotations import override
 from ray.rllib.utils.framework import try_import_tf, try_import_torch

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()
 torch, nn = try_import_torch()


@@ -9,7 +9,7 @@ from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC
 from ray.rllib.utils.framework import try_import_tf, try_import_torch
 from ray.rllib.utils.numpy import LARGE_INTEGER

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()
 torch, nn = try_import_torch()


@@ -7,7 +7,7 @@ from ray.rllib.models.torch.recurrent_net import RecurrentNetwork as TorchRNN
 from ray.rllib.utils.annotations import override
 from ray.rllib.utils.framework import try_import_tf, try_import_torch

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()
 torch, nn = try_import_torch()


@@ -8,13 +8,15 @@ from ray.rllib.models.tf.recurrent_net import RecurrentNetwork
 from ray.rllib.utils.annotations import override
 from ray.rllib.utils.framework import try_import_tf

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()


 class SpyLayer(tf.keras.layers.Layer):
    """A keras Layer, which intercepts its inputs and stored them as pickled.
    """

+    output = np.array(0, dtype=np.int64)
+
    def __init__(self, num_outputs, **kwargs):
        super().__init__(**kwargs)

@@ -26,7 +28,7 @@ class SpyLayer(tf.keras.layers.Layer):
        """

        del kwargs
-        spy_fn = tf.py_func(
+        spy_fn = tf1.py_func(
            self.spy,
            [
                inputs[0],  # observations
@@ -36,11 +38,11 @@ class SpyLayer(tf.keras.layers.Layer):
                inputs[5],  # h_out
                inputs[6],  # c_out
            ],
-            tf.int64,
+            tf.int64,  # Must match SpyLayer.output's type.
            stateful=True)

        # Compute outputs
-        with tf.control_dependencies([spy_fn]):
+        with tf1.control_dependencies([spy_fn]):
            return self.dense(inputs[1])

    @staticmethod
@@ -48,7 +50,8 @@ class SpyLayer(tf.keras.layers.Layer):
        """The actual spy operation: Store inputs in internal_kv."""

        if len(inputs) == 1:
-            return 0  # don't capture inference inputs
+            # don't capture inference inputs
+            return SpyLayer.output
        # TF runs this function in an isolated context, so we have to use
        # redis to communicate back to our suite
        ray.experimental.internal_kv._internal_kv_put(
@@ -61,7 +64,7 @@ class SpyLayer(tf.keras.layers.Layer):
            }),
            overwrite=True)
        RNNSpyModel.capture_index += 1
-        return 0
+        return SpyLayer.output


 class RNNSpyModel(RecurrentNetwork):
@@ -7,7 +7,7 @@ from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
 from ray.rllib.utils.annotations import override
 from ray.rllib.utils.framework import try_import_tf, try_import_torch

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()
 torch, nn = try_import_torch()


@@ -15,7 +15,7 @@ class SharedWeightsModel1(TFModelV2):
    """Example of weight sharing between two different TFModelV2s.

    Here, we share the variables defined in the 'shared' variable scope
-    by entering it explicitly with tf.AUTO_REUSE. This creates the
+    by entering it explicitly with tf1.AUTO_REUSE. This creates the
    variables for the 'fc1' layer in a global scope called 'shared'
    (outside of the Policy's normal variable scope).
    """
@@ -26,9 +26,9 @@ class SharedWeightsModel1(TFModelV2):
                         model_config, name)

        inputs = tf.keras.layers.Input(observation_space.shape)
-        with tf.variable_scope(
-                tf.VariableScope(tf.AUTO_REUSE, "shared"),
-                reuse=tf.AUTO_REUSE,
+        with tf1.variable_scope(
+                tf1.VariableScope(tf1.AUTO_REUSE, "shared"),
+                reuse=tf1.AUTO_REUSE,
                auxiliary_name_scope=False):
            last_layer = tf.keras.layers.Dense(
                units=64, activation=tf.nn.relu, name="fc1")(inputs)
@@ -60,9 +60,9 @@ class SharedWeightsModel2(TFModelV2):
        inputs = tf.keras.layers.Input(observation_space.shape)

        # Weights shared with SharedWeightsModel1.
-        with tf.variable_scope(
-                tf.VariableScope(tf.AUTO_REUSE, "shared"),
-                reuse=tf.AUTO_REUSE,
+        with tf1.variable_scope(
+                tf1.VariableScope(tf1.AUTO_REUSE, "shared"),
+                reuse=tf1.AUTO_REUSE,
                auxiliary_name_scope=False):
            last_layer = tf.keras.layers.Dense(
                units=64, activation=tf.nn.relu, name="fc1")(inputs)
@@ -4,7 +4,7 @@ from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
 from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFCNet
 from ray.rllib.utils.framework import try_import_tf, try_import_torch

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()
 torch, nn = try_import_torch()


@@ -22,7 +22,7 @@ from ray.rllib.models import ModelCatalog
 from ray.rllib.utils.framework import try_import_tf
 from ray.rllib.utils.test_utils import check_learning_achieved

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()

 parser = argparse.ArgumentParser()

@@ -20,7 +20,7 @@ from ray.rllib.examples.policy.rock_paper_scissors_dummies import \
 from ray.rllib.utils.framework import try_import_tf, try_import_torch
 from ray.rllib.utils.test_utils import check_learning_achieved

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()
 torch, _ = try_import_torch()

 parser = argparse.ArgumentParser()
@@ -5,7 +5,7 @@ from ray.util.debug import log_once
 from ray.rllib.utils.debug import summarize
 from ray.rllib.utils.framework import try_import_tf

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()

 # Variable scope in which created variables will be placed under
 TOWER_SCOPE_NAME = "tower"
@@ -26,7 +26,7 @@ class LocalSyncParallelOptimizer:
    `load_data`, so you can make multiple passes (possibly in randomized order)
    over the same data once loaded.

-    This is similar to tf.train.SyncReplicasOptimizer, but works within a
+    This is similar to tf1.train.SyncReplicasOptimizer, but works within a
    single TensorFlow graph, i.e. implements in-graph replicated training:

      https://www.tensorflow.org/api_docs/python/tf/train/SyncReplicasOptimizer
@@ -63,21 +63,21 @@ class LocalSyncParallelOptimizer:
        self.build_graph = build_graph

        # First initialize the shared loss network
-        with tf.name_scope(TOWER_SCOPE_NAME):
+        with tf1.name_scope(TOWER_SCOPE_NAME):
            self._shared_loss = build_graph(self.loss_inputs)
-        shared_ops = tf.get_collection(
-            tf.GraphKeys.UPDATE_OPS, scope=tf.get_variable_scope().name)
+        shared_ops = tf1.get_collection(
+            tf1.GraphKeys.UPDATE_OPS, scope=tf1.get_variable_scope().name)

        # Then setup the per-device loss graphs that use the shared weights
-        self._batch_index = tf.placeholder(tf.int32, name="batch_index")
+        self._batch_index = tf1.placeholder(tf.int32, name="batch_index")

        # Dynamic batch size, which may be shrunk if there isn't enough data
-        self._per_device_batch_size = tf.placeholder(
+        self._per_device_batch_size = tf1.placeholder(
            tf.int32, name="per_device_batch_size")
        self._loaded_per_device_batch_size = max_per_device_batch_size

        # When loading RNN input, we dynamically determine the max seq len
-        self._max_seq_len = tf.placeholder(tf.int32, name="max_seq_len")
+        self._max_seq_len = tf1.placeholder(tf.int32, name="max_seq_len")
        self._loaded_max_seq_len = 1

        # Split on the CPU in case the data doesn't fit in GPU memory.
@@ -103,15 +103,15 @@ class LocalSyncParallelOptimizer:
        # gather update ops for any batch norm layers. TODO(ekl) here we will
        # use all the ops found which won't work for DQN / DDPG, but those
        # aren't supported with multi-gpu right now anyways.
-        self._update_ops = tf.get_collection(
-            tf.GraphKeys.UPDATE_OPS, scope=tf.get_variable_scope().name)
+        self._update_ops = tf1.get_collection(
+            tf1.GraphKeys.UPDATE_OPS, scope=tf1.get_variable_scope().name)
        for op in shared_ops:
            self._update_ops.remove(op)  # only care about tower update ops
        if self._update_ops:
            logger.debug("Update ops to run on apply gradient: {}".format(
                self._update_ops))

-        with tf.control_dependencies(self._update_ops):
+        with tf1.control_dependencies(self._update_ops):
            self._train_op = self.optimizer.apply_gradients(avg)

    def load_data(self, sess, inputs, state_inputs):
@@ -265,11 +265,11 @@ class LocalSyncParallelOptimizer:
    def _setup_device(self, device, device_input_placeholders, num_data_in):
        assert num_data_in <= len(device_input_placeholders)
        with tf.device(device):
-            with tf.name_scope(TOWER_SCOPE_NAME):
+            with tf1.name_scope(TOWER_SCOPE_NAME):
                device_input_batches = []
                device_input_slices = []
                for i, ph in enumerate(device_input_placeholders):
-                    current_batch = tf.Variable(
+                    current_batch = tf1.Variable(
                        ph,
                        trainable=False,
                        validate_shape=False,
@@ -13,7 +13,7 @@ from ray.rllib.utils.annotations import override
 from ray.rllib.utils.framework import try_import_tf
 from ray.rllib.utils.timer import TimerStat

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()

 logger = logging.getLogger(__name__)

@@ -84,14 +84,15 @@ class TFMultiGPULearner(LearnerThread):
        self.par_opt = []
        with self.local_worker.tf_sess.graph.as_default():
            with self.local_worker.tf_sess.as_default():
-                with tf.variable_scope(DEFAULT_POLICY_ID, reuse=tf.AUTO_REUSE):
+                with tf1.variable_scope(
+                        DEFAULT_POLICY_ID, reuse=tf1.AUTO_REUSE):
                    if self.policy._state_inputs:
                        rnn_inputs = self.policy._state_inputs + [
                            self.policy._seq_lens
                        ]
                    else:
                        rnn_inputs = []
-                    adam = tf.train.AdamOptimizer(self.lr)
+                    adam = tf1.train.AdamOptimizer(self.lr)
                    for _ in range(num_data_loader_buffers):
                        self.par_opt.append(
                            LocalSyncParallelOptimizer(
@@ -103,7 +104,7 @@ class TFMultiGPULearner(LearnerThread):
                                self.policy.copy))

                self.sess = self.local_worker.tf_sess
-                self.sess.run(tf.global_variables_initializer())
+                self.sess.run(tf1.global_variables_initializer())

        self.idle_optimizers = queue.Queue()
        self.ready_optimizers = queue.Queue()
@@ -20,7 +20,7 @@ from ray.rllib.utils.framework import try_import_tf
 from ray.rllib.utils.sgd import do_minibatch_sgd, averaged
 from ray.rllib.utils.types import PolicyID, SampleBatchType

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()

 logger = logging.getLogger(__name__)

@@ -137,7 +137,7 @@ class TrainTFMultiGPU:
            with self.workers.local_worker().tf_sess.as_default():
                for policy_id in self.policies:
                    policy = self.workers.local_worker().get_policy(policy_id)
-                    with tf.variable_scope(policy_id, reuse=tf.AUTO_REUSE):
+                    with tf1.variable_scope(policy_id, reuse=tf1.AUTO_REUSE):
                        if policy._state_inputs:
                            rnn_inputs = policy._state_inputs + [
                                policy._seq_lens
@@ -152,7 +152,7 @@ class TrainTFMultiGPU:
                                self.per_device_batch_size, policy.copy))

                self.sess = self.workers.local_worker().tf_sess
-                self.sess.run(tf.global_variables_initializer())
+                self.sess.run(tf1.global_variables_initializer())

    def __call__(self,
                 samples: SampleBatchType) -> (SampleBatchType, List[dict]):
@@ -27,7 +27,7 @@ from ray.rllib.utils.framework import try_import_tf
 from ray.rllib.utils.spaces.simplex import Simplex
 from ray.rllib.utils.spaces.space_utils import flatten_space

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()
 tree = try_import_tree()

 logger = logging.getLogger(__name__)
@@ -257,7 +257,7 @@ class ModelCatalog:

        dtype, shape = ModelCatalog.get_action_shape(action_space)

-        return tf.placeholder(dtype, shape=shape, name=name)
+        return tf1.placeholder(dtype, shape=shape, name=name)

    @staticmethod
    @DeveloperAPI
@@ -8,7 +8,7 @@ from ray.rllib.utils.annotations import PublicAPI, DeveloperAPI
 from ray.rllib.utils.deprecation import deprecation_warning
 from ray.rllib.utils.framework import try_import_tf, try_import_torch

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()
 torch, _ = try_import_torch()

 logger = logging.getLogger(__name__)
@@ -38,13 +38,13 @@ class Model:
        self.action_space = action_space
        self.num_outputs = num_outputs
        self.options = options
-        self.scope = tf.get_variable_scope()
-        self.session = tf.get_default_session()
+        self.scope = tf1.get_variable_scope()
+        self.session = tf1.get_default_session()
        self.input_dict = input_dict
        if seq_lens is not None:
            self.seq_lens = seq_lens
        else:
-            self.seq_lens = tf.placeholder(
+            self.seq_lens = tf1.placeholder(
                dtype=tf.int32, shape=[None], name="seq_lens")

        self._num_outputs = num_outputs
@@ -68,10 +68,10 @@ class Model:
                input_dict["obs"], num_outputs, options)

        if options.get("free_log_std", False):
-            log_std = tf.get_variable(
+            log_std = tf1.get_variable(
                name="log_std",
                shape=[num_outputs],
-                initializer=tf.zeros_initializer)
+                initializer=tf1.zeros_initializer)
            self.outputs = tf.concat(
                [self.outputs, 0.0 * self.outputs + log_std], 1)

@@ -196,7 +196,7 @@ class Model:
 def flatten(obs, framework):
    """Flatten the given tensor."""
    if framework == "tf":
-        return tf.layers.flatten(obs)
+        return tf1.layers.flatten(obs)
    elif framework == "torch":
        assert torch is not None
        return torch.flatten(obs, start_dim=1)
@@ -13,7 +13,7 @@ from ray.rllib.utils.framework import try_import_tf, try_import_torch, \
 from ray.rllib.utils.spaces.repeated import Repeated
 from ray.rllib.utils.types import ModelConfigDict

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()
 torch, _ = try_import_torch()


@@ -339,7 +339,7 @@ class NullContextManager:
 def flatten(obs, framework):
    """Flatten the given tensor."""
    if framework == "tf":
-        return tf.layers.flatten(obs)
+        return tf1.layers.flatten(obs)
    elif framework == "torch":
        assert torch is not None
        return torch.flatten(obs, start_dim=1)
@@ -13,7 +13,7 @@ from ray.rllib.utils.framework import try_import_torch, try_import_tf
 from ray.rllib.utils.test_utils import framework_iterator

 torch, nn = try_import_torch()
-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()


 class TestModules(unittest.TestCase):
@@ -144,7 +144,7 @@ class TestModules(unittest.TestCase):
                model = TorchMultiHeadAttention(
                    in_dim=D_in, out_dim=D_out, num_heads=2, head_dim=32)

-                self.train_torch_layer(model, x, y)
+                self.train_torch_layer(model, x, y, num_epochs=500)

            else:  # framework is tensorflow or tensorflow-eager

@@ -165,7 +165,7 @@ class TestModules(unittest.TestCase):
            that it trains in a supervised setting."""

        # Checks that torch and tf embedding matrices are the same
-        with tf.Session().as_default() as sess:
+        with tf1.Session().as_default() as sess:
            assert np.allclose(
                relative_position_embedding(20, 15).eval(session=sess),
                relative_position_embedding_torch(20, 15).numpy())
@@ -16,7 +16,7 @@ from ray.rllib.utils.numpy import MIN_LOG_NN_OUTPUT, MAX_LOG_NN_OUTPUT, \
    softmax, SMALL_NUMBER, LARGE_INTEGER
 from ray.rllib.utils.test_utils import check, framework_iterator

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()
 torch, _ = try_import_torch()
 tree = try_import_tree()

@@ -75,13 +75,13 @@ class TestDistributions(unittest.TestCase):
    def test_categorical(self):
        """Tests the Categorical ActionDistribution (tf only)."""
        num_samples = 100000
-        logits = tf.placeholder(tf.float32, shape=(None, 10))
+        logits = tf1.placeholder(tf.float32, shape=(None, 10))
        z = 8 * (np.random.rand(10) - 0.5)
        data = np.tile(z, (num_samples, 1))
        c = Categorical(logits, {})  # dummy config dict
        sample_op = c.sample()
-        sess = tf.Session()
-        sess.run(tf.global_variables_initializer())
+        sess = tf1.Session()
+        sess.run(tf1.global_variables_initializer())
        samples = sess.run(sample_op, feed_dict={logits: data})
        counts = np.zeros(10)
        for sample in samples:
@@ -17,7 +17,7 @@ from ray.rllib.models.tf.recurrent_net import RecurrentNetwork
 from ray.rllib.utils.annotations import override
 from ray.rllib.utils.framework import try_import_tf

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()


 # TODO(sven): Use RLlib's FCNet instead.
@@ -4,7 +4,7 @@ from ray.rllib.models.tf.misc import normc_initializer
 from ray.rllib.models.tf.tf_modelv2 import TFModelV2
 from ray.rllib.utils.framework import get_activation_fn, try_import_tf

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()


 class FullyConnectedNetwork(TFModelV2):
@@ -4,7 +4,7 @@ from ray.rllib.utils.annotations import override
 from ray.rllib.utils.deprecation import deprecation_warning
 from ray.rllib.utils.framework import get_activation_fn, try_import_tf

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()


 # Deprecated: see as an alternative models/tf.fcnet.py
@@ -29,15 +29,15 @@ class FullyConnectedNetwork(Model):
        activation = get_activation_fn(options.get("fcnet_activation"))

        if len(inputs.shape) > 2:
-            inputs = tf.layers.flatten(inputs)
+            inputs = tf1.layers.flatten(inputs)

-        with tf.name_scope("fc_net"):
+        with tf1.name_scope("fc_net"):
            i = 1
            last_layer = inputs
            for size in hiddens:
                # skip final linear layer
                if options.get("no_final_linear") and i == len(hiddens):
-                    output = tf.layers.dense(
+                    output = tf1.layers.dense(
                        last_layer,
                        num_outputs,
                        kernel_initializer=normc_initializer(1.0),
@@ -46,7 +46,7 @@ class FullyConnectedNetwork(Model):
                    return output, output

                label = "fc{}".format(i)
-                last_layer = tf.layers.dense(
+                last_layer = tf1.layers.dense(
                    last_layer,
                    size,
                    kernel_initializer=normc_initializer(1.0),
@@ -54,7 +54,7 @@ class FullyConnectedNetwork(Model):
                    name=label)
                i += 1

-            output = tf.layers.dense(
+            output = tf1.layers.dense(
                last_layer,
                num_outputs,
                kernel_initializer=normc_initializer(0.01),
@@ -1,6 +1,6 @@
 from ray.rllib.utils.framework import try_import_tf

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()


 class GRUGate(tf.keras.layers.Layer):
@@ -5,7 +5,7 @@
 """
 from ray.rllib.utils.framework import try_import_tf

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()


 class MultiHeadAttention(tf.keras.layers.Layer):
@@ -3,7 +3,7 @@ import numpy as np
 from ray.rllib.utils.framework import get_activation_fn, get_variable, \
    try_import_tf

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()


 class NoisyLayer(tf.keras.layers.Layer):
@@ -1,6 +1,6 @@
 from ray.rllib.utils.framework import try_import_tf

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()


 class RelativeMultiHeadAttention(tf.keras.layers.Layer):
@@ -1,6 +1,6 @@
 from ray.rllib.utils.framework import try_import_tf

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()


 class SkipConnection(tf.keras.layers.Layer):
@@ -7,7 +7,7 @@ from ray.rllib.utils.annotations import override
 from ray.rllib.utils.deprecation import deprecation_warning
 from ray.rllib.utils.framework import try_import_tf

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()


 # Deprecated: see as an alternative models/tf/recurrent_net.py
@@ -45,7 +45,7 @@ class LSTM(Model):
        last_layer = add_time_dimension(features, self.seq_lens)

        # Setup the LSTM cell
-        lstm = tf.nn.rnn_cell.LSTMCell(cell_size, state_is_tuple=True)
+        lstm = tf1.nn.rnn_cell.LSTMCell(cell_size, state_is_tuple=True)
        self.state_init = [
            np.zeros(lstm.state_size.c, np.float32),
            np.zeros(lstm.state_size.h, np.float32)
@@ -55,15 +55,15 @@ class LSTM(Model):
        if self.state_in:
            c_in, h_in = self.state_in
        else:
-            c_in = tf.placeholder(
+            c_in = tf1.placeholder(
                tf.float32, [None, lstm.state_size.c], name="c")
-            h_in = tf.placeholder(
+            h_in = tf1.placeholder(
                tf.float32, [None, lstm.state_size.h], name="h")
            self.state_in = [c_in, h_in]

        # Setup LSTM outputs
-        state_in = tf.nn.rnn_cell.LSTMStateTuple(c_in, h_in)
-        lstm_out, lstm_state = tf.nn.dynamic_rnn(
+        state_in = tf1.nn.rnn_cell.LSTMStateTuple(c_in, h_in)
+        lstm_out, lstm_state = tf1.nn.dynamic_rnn(
            lstm,
            last_layer,
            initial_state=state_in,
@@ -1,7 +1,7 @@
 import numpy as np
 from ray.rllib.utils.framework import try_import_tf

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()


 def normc_initializer(std=1.0):
@@ -24,7 +24,7 @@ def conv2d(x,
    if dtype is None:
        dtype = tf.float32

-    with tf.variable_scope(name):
+    with tf1.variable_scope(name):
        stride_shape = [1, stride[0], stride[1], 1]
        filter_shape = [
            filter_size[0], filter_size[1],
@@ -40,24 +40,24 @@ def conv2d(x,
        # Initialize weights with random weights.
        w_bound = np.sqrt(6 / (fan_in + fan_out))

-        w = tf.get_variable(
+        w = tf1.get_variable(
            "W",
            filter_shape,
            dtype,
-            tf.random_uniform_initializer(-w_bound, w_bound),
+            tf1.random_uniform_initializer(-w_bound, w_bound),
            collections=collections)
-        b = tf.get_variable(
+        b = tf1.get_variable(
            "b", [1, 1, 1, num_filters],
-            initializer=tf.constant_initializer(0.0),
+            initializer=tf1.constant_initializer(0.0),
            collections=collections)
-        return tf.nn.conv2d(x, w, stride_shape, pad) + b
+        return tf1.nn.conv2d(x, w, stride_shape, pad) + b


 def linear(x, size, name, initializer=None, bias_init=0):
-    w = tf.get_variable(
+    w = tf1.get_variable(
        name + "/w", [x.get_shape()[1], size], initializer=initializer)
-    b = tf.get_variable(
-        name + "/b", [size], initializer=tf.constant_initializer(bias_init))
+    b = tf1.get_variable(
+        name + "/b", [size], initializer=tf1.constant_initializer(bias_init))
    return tf.matmul(x, w) + b


@@ -9,7 +9,7 @@ from ray.rllib.utils.annotations import override
 from ray.rllib.utils.framework import try_import_tf
 from ray.rllib.utils.tf_ops import scope_vars

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()

 logger = logging.getLogger(__name__)

@@ -47,7 +47,7 @@ def make_v1_wrapper(legacy_model_cls):
            # Tracks update ops
            self._update_ops = None

-            with tf.variable_scope(self.name) as scope:
+            with tf1.variable_scope(self.name) as scope:
                self.variable_scope = scope

        @override(ModelV2)
@@ -58,20 +58,20 @@ def make_v1_wrapper(legacy_model_cls):
        def __call__(self, input_dict, state, seq_lens):
            if self.cur_instance:
                # create a weight-sharing model copy
-                with tf.variable_scope(self.cur_instance.scope, reuse=True):
+                with tf1.variable_scope(self.cur_instance.scope, reuse=True):
                    new_instance = self.legacy_model_cls(
                        input_dict, self.obs_space, self.action_space,
                        self.num_outputs, self.model_config, state, seq_lens)
            else:
                # create a new model instance
-                with tf.variable_scope(self.name):
+                with tf1.variable_scope(self.name):
                    prev_update_ops = set(
-                        tf.get_collection(tf.GraphKeys.UPDATE_OPS))
+                        tf1.get_collection(tf1.GraphKeys.UPDATE_OPS))
                    new_instance = self.legacy_model_cls(
                        input_dict, self.obs_space, self.action_space,
                        self.num_outputs, self.model_config, state, seq_lens)
                    self._update_ops = list(
-                        set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) -
+                        set(tf1.get_collection(tf1.GraphKeys.UPDATE_OPS)) -
                        prev_update_ops)
            if len(new_instance.state_init) != len(self.get_initial_state()):
                raise ValueError(
@@ -112,8 +112,9 @@ def make_v1_wrapper(legacy_model_cls):
        def value_function(self):
            assert self.cur_instance is not None, "must call forward first"

-            with tf.variable_scope(self.variable_scope):
-                with tf.variable_scope("value_function", reuse=tf.AUTO_REUSE):
+            with tf1.variable_scope(self.variable_scope):
+                with tf1.variable_scope(
+                        "value_function", reuse=tf1.AUTO_REUSE):
                    # Simple case: sharing the feature layer
                    if self.model_config["vf_share_layers"]:
                        return tf.reshape(
@@ -7,7 +7,7 @@ from ray.rllib.policy.sample_batch import SampleBatch
 from ray.rllib.utils.annotations import override, DeveloperAPI
 from ray.rllib.utils.framework import try_import_tf

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()


@DeveloperAPI
@@ -160,18 +160,17 @@ class LSTMWrapper(RecurrentNetwork):

        # Concat. prev-action/reward if required.
        if self.model_config["lstm_use_prev_action_reward"]:
-            if self.model_config["lstm_use_prev_action_reward"]:
-                wrapped_out = tf.concat(
-                    [
-                        wrapped_out,
-                        tf.reshape(
-                            tf.cast(input_dict[SampleBatch.PREV_ACTIONS],
-                                    tf.float32), [-1, self.action_dim]),
-                        tf.reshape(
-                            tf.cast(input_dict[SampleBatch.PREV_REWARDS],
-                                    tf.float32), [-1, 1]),
-                    ],
-                    axis=1)
+            wrapped_out = tf.concat(
+                [
+                    wrapped_out,
+                    tf.reshape(
+                        tf.cast(input_dict[SampleBatch.PREV_ACTIONS],
+                                tf.float32), [-1, self.action_dim]),
+                    tf.reshape(
+                        tf.cast(input_dict[SampleBatch.PREV_REWARDS],
+                                tf.float32), [-1, 1]),
+                ],
+                axis=1)

        # Then through our LSTM.
        input_dict["obs_flat"] = wrapped_out
@@ -9,7 +9,7 @@ from ray.rllib.utils.annotations import override, DeveloperAPI
 from ray.rllib.utils.framework import try_import_tf, try_import_tfp
 from ray.rllib.utils.spaces.space_utils import get_base_struct_from_space

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()
 tfp = try_import_tfp()
 tree = try_import_tree()

@@ -85,7 +85,7 @@ class Categorical(TFActionDistribution):

    @override(TFActionDistribution)
    def _build_sample_op(self):
-        return tf.squeeze(tf.multinomial(self.inputs, 1), axis=1)
+        return tf.squeeze(tf.random.categorical(self.inputs, 1), axis=1)

    @staticmethod
    @override(ActionDistribution)
@@ -2,7 +2,7 @@ from ray.rllib.models.modelv2 import ModelV2
 from ray.rllib.utils.annotations import override, PublicAPI
 from ray.rllib.utils.framework import try_import_tf

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()


@PublicAPI
@@ -39,10 +39,10 @@ class TFModelV2(ModelV2):
            name,
            framework="tf")
        self.var_list = []
-        if tf.executing_eagerly():
+        if tf1.executing_eagerly():
            self.graph = None
        else:
-            self.graph = tf.get_default_graph()
+            self.graph = tf1.get_default_graph()

    def context(self):
        """Returns a contextmanager for the current TF graph."""
@@ -3,7 +3,7 @@ from ray.rllib.models.tf.visionnet_v1 import _get_filter_config
 from ray.rllib.models.tf.misc import normc_initializer
 from ray.rllib.utils.framework import get_activation_fn, try_import_tf

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()


 class VisionNetwork(TFModelV2):
@@ -4,7 +4,7 @@ from ray.rllib.utils.annotations import override
 from ray.rllib.utils.deprecation import deprecation_warning
 from ray.rllib.utils.framework import get_activation_fn, try_import_tf

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()


 # Deprecated: see as an alternative models/tf.visionnet.py
@@ -24,9 +24,9 @@ class VisionNetwork(Model):

        activation = get_activation_fn(options.get("conv_activation"))

-        with tf.name_scope("vision_net"):
+        with tf1.name_scope("vision_net"):
            for i, (out_size, kernel, stride) in enumerate(filters[:-1], 1):
-                inputs = tf.layers.conv2d(
+                inputs = tf1.layers.conv2d(
                    inputs,
                    out_size,
                    kernel,
@@ -38,7 +38,7 @@ class VisionNetwork(Model):

            # skip final linear layer
            if options.get("no_final_linear"):
-                fc_out = tf.layers.conv2d(
+                fc_out = tf1.layers.conv2d(
                    inputs,
                    num_outputs,
                    kernel,
@@ -48,7 +48,7 @@ class VisionNetwork(Model):
                    name="fc_out")
                return flatten(fc_out), flatten(fc_out)

-            fc1 = tf.layers.conv2d(
+            fc1 = tf1.layers.conv2d(
                inputs,
                out_size,
                kernel,
@@ -56,7 +56,7 @@ class VisionNetwork(Model):
                activation=activation,
                padding="valid",
                name="fc1")
-            fc2 = tf.layers.conv2d(
+            fc2 = tf1.layers.conv2d(
                fc1,
                num_outputs, [1, 1],
                activation=None,
@@ -6,7 +6,7 @@ from ray.rllib.policy.sample_batch import MultiAgentBatch
 from ray.rllib.utils.annotations import PublicAPI
 from ray.rllib.utils.framework import try_import_tf

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()

 logger = logging.getLogger(__name__)

@@ -75,7 +75,7 @@ class InputReader:
            k: (-1, ) + s[1:]
            for (k, s) in [(k, batch[k].shape) for k in keys]
        }
-        queue = tf.FIFOQueue(capacity=queue_size, dtypes=dtypes, names=keys)
+        queue = tf1.FIFOQueue(capacity=queue_size, dtypes=dtypes, names=keys)
        tensors = queue.dequeue()

        logger.info("Creating TF queue runner for {}".format(self))
@@ -92,12 +92,12 @@ class _QueueRunner(threading.Thread):

    def __init__(self, input_reader, queue, keys, dtypes):
        threading.Thread.__init__(self)
-        self.sess = tf.get_default_session()
+        self.sess = tf1.get_default_session()
        self.daemon = True
        self.input_reader = input_reader
        self.keys = keys
        self.queue = queue
-        self.placeholders = [tf.placeholder(dtype) for dtype in dtypes]
+        self.placeholders = [tf1.placeholder(dtype) for dtype in dtypes]
        self.enqueue_op = queue.enqueue(dict(zip(keys, self.placeholders)))

    def enqueue(self, batch):
@@ -45,7 +45,7 @@ class JsonReader(InputReader):
                logger.warning(
                    "Treating input directory as glob pattern: {}".format(
                        inputs))
-            if urlparse(inputs).scheme not in ["d", ""]:
+            if urlparse(inputs).scheme not in ["", "c"]:
                raise ValueError(
                    "Don't know how to glob over `{}`, ".format(inputs) +
                    "please specify a list of files to read instead.")
@@ -123,7 +123,7 @@ class JsonReader(InputReader):

    def _next_file(self):
        path = random.choice(self.files)
-        if urlparse(path).scheme:
+        if urlparse(path).scheme not in ["", "c"]:
            if smart_open is None:
                raise ValueError(
                    "You must install the `smart_open` module to read "
@@ -42,7 +42,7 @@ class JsonWriter(OutputWriter):
        self.ioctx = ioctx or IOContext()
        self.max_file_size = max_file_size
        self.compress_columns = compress_columns
-        if urlparse(path).scheme:
+        if urlparse(path).scheme not in ["", "c"]:
            self.path_is_uri = True
        else:
            path = os.path.abspath(os.path.expanduser(path))
@@ -15,7 +15,7 @@ from ray.rllib.utils.annotations import override
 from ray.rllib.utils.framework import try_import_tf
 from ray.rllib.utils.timer import TimerStat

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()

 logger = logging.getLogger(__name__)

@@ -86,14 +86,15 @@ class TFMultiGPULearner(LearnerThread):
        self.par_opt = []
        with self.local_worker.tf_sess.graph.as_default():
            with self.local_worker.tf_sess.as_default():
-                with tf.variable_scope(DEFAULT_POLICY_ID, reuse=tf.AUTO_REUSE):
+                with tf1.variable_scope(
+                        DEFAULT_POLICY_ID, reuse=tf1.AUTO_REUSE):
                    if self.policy._state_inputs:
                        rnn_inputs = self.policy._state_inputs + [
                            self.policy._seq_lens
                        ]
                    else:
                        rnn_inputs = []
-                    adam = tf.train.AdamOptimizer(self.lr)
+                    adam = tf1.train.AdamOptimizer(self.lr)
                    for _ in range(num_data_loader_buffers):
                        self.par_opt.append(
                            LocalSyncParallelOptimizer(
@@ -105,7 +106,7 @@ class TFMultiGPULearner(LearnerThread):
                                self.policy.copy))

                self.sess = self.local_worker.tf_sess
-                self.sess.run(tf.global_variables_initializer())
+                self.sess.run(tf1.global_variables_initializer())

        self.idle_optimizers = queue.Queue()
        self.ready_optimizers = queue.Queue()
@@ -5,7 +5,7 @@ from ray.util.debug import log_once
 from ray.rllib.utils.debug import summarize
 from ray.rllib.utils.framework import try_import_tf

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()

 # Variable scope in which created variables will be placed under
 TOWER_SCOPE_NAME = "tower"
@@ -63,21 +63,21 @@ class LocalSyncParallelOptimizer:
        self.build_graph = build_graph

        # First initialize the shared loss network
-        with tf.name_scope(TOWER_SCOPE_NAME):
+        with tf1.name_scope(TOWER_SCOPE_NAME):
            self._shared_loss = build_graph(self.loss_inputs)
-        shared_ops = tf.get_collection(
-            tf.GraphKeys.UPDATE_OPS, scope=tf.get_variable_scope().name)
+        shared_ops = tf1.get_collection(
+            tf1.GraphKeys.UPDATE_OPS, scope=tf1.get_variable_scope().name)

        # Then setup the per-device loss graphs that use the shared weights
-        self._batch_index = tf.placeholder(tf.int32, name="batch_index")
+        self._batch_index = tf1.placeholder(tf.int32, name="batch_index")

        # Dynamic batch size, which may be shrunk if there isn't enough data
-        self._per_device_batch_size = tf.placeholder(
+        self._per_device_batch_size = tf1.placeholder(
            tf.int32, name="per_device_batch_size")
        self._loaded_per_device_batch_size = max_per_device_batch_size

        # When loading RNN input, we dynamically determine the max seq len
-        self._max_seq_len = tf.placeholder(tf.int32, name="max_seq_len")
+        self._max_seq_len = tf1.placeholder(tf.int32, name="max_seq_len")
        self._loaded_max_seq_len = 1

        # Split on the CPU in case the data doesn't fit in GPU memory.
@@ -103,15 +103,15 @@ class LocalSyncParallelOptimizer:
        # gather update ops for any batch norm layers. TODO(ekl) here we will
        # use all the ops found which won't work for DQN / DDPG, but those
        # aren't supported with multi-gpu right now anyways.
-        self._update_ops = tf.get_collection(
-            tf.GraphKeys.UPDATE_OPS, scope=tf.get_variable_scope().name)
+        self._update_ops = tf1.get_collection(
+            tf1.GraphKeys.UPDATE_OPS, scope=tf1.get_variable_scope().name)
        for op in shared_ops:
            self._update_ops.remove(op)  # only care about tower update ops
        if self._update_ops:
            logger.debug("Update ops to run on apply gradient: {}".format(
                self._update_ops))

-        with tf.control_dependencies(self._update_ops):
+        with tf1.control_dependencies(self._update_ops):
            self._train_op = self.optimizer.apply_gradients(avg)

    def load_data(self, sess, inputs, state_inputs):
@@ -265,11 +265,11 @@ class LocalSyncParallelOptimizer:
    def _setup_device(self, device, device_input_placeholders, num_data_in):
        assert num_data_in <= len(device_input_placeholders)
        with tf.device(device):
-            with tf.name_scope(TOWER_SCOPE_NAME):
+            with tf1.name_scope(TOWER_SCOPE_NAME):
                device_input_batches = []
                device_input_slices = []
                for i, ph in enumerate(device_input_placeholders):
-                    current_batch = tf.Variable(
+                    current_batch = tf1.Variable(
                        ph,
                        trainable=False,
                        validate_shape=False,
@@ -16,7 +16,7 @@ from ray.rllib.utils.framework import try_import_tf
 from ray.rllib.utils.sgd import averaged
 from ray.rllib.utils.timer import TimerStat

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()

 logger = logging.getLogger(__name__)

@@ -115,7 +115,7 @@ class LocalMultiGPUOptimizer(PolicyOptimizer):
        with self.workers.local_worker().tf_sess.graph.as_default():
            with self.workers.local_worker().tf_sess.as_default():
                for policy_id, policy in self.policies.items():
-                    with tf.variable_scope(policy_id, reuse=tf.AUTO_REUSE):
+                    with tf1.variable_scope(policy_id, reuse=tf1.AUTO_REUSE):
                        if policy._state_inputs:
                            rnn_inputs = policy._state_inputs + [
                                policy._seq_lens
@@ -130,7 +130,7 @@ class LocalMultiGPUOptimizer(PolicyOptimizer):
                                self.per_device_batch_size, policy.copy))

                self.sess = self.workers.local_worker().tf_sess
-                self.sess.run(tf.global_variables_initializer())
+                self.sess.run(tf1.global_variables_initializer())

    @override(PolicyOptimizer)
    def step(self):
@@ -14,7 +14,7 @@ from ray.rllib.policy.sample_batch import SampleBatch
 from ray.rllib.tests.mock_worker import _MockWorker
 from ray.rllib.utils.framework import try_import_tf

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()


 class LRScheduleTest(unittest.TestCase):
@@ -250,7 +250,7 @@ class AsyncSamplesOptimizerTest(unittest.TestCase):

    def _make_envs(self):
        def make_sess():
-            return tf.Session(config=tf.ConfigProto(device_count={"CPU": 2}))
+            return tf1.Session(config=tf1.ConfigProto(device_count={"CPU": 2}))

        local = RolloutWorker(
            env_creator=lambda _: gym.make("CartPole-v0"),
@@ -14,7 +14,7 @@ from ray.rllib.utils.debug import summarize
 from ray.rllib.utils.framework import try_import_tf
 from ray.rllib.utils.tracking_dict import UsageTrackingDict

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()

 logger = logging.getLogger(__name__)

@@ -116,7 +116,7 @@ class DynamicTFPolicy(TFPolicy):
            explore = existing_inputs["is_exploring"]
            timestep = existing_inputs["timestep"]
        else:
-            obs = tf.placeholder(
+            obs = tf1.placeholder(
                tf.float32,
                shape=[None] + list(obs_space.shape),
                name="observation")
@@ -124,11 +124,11 @@ class DynamicTFPolicy(TFPolicy):
            if self._obs_include_prev_action_reward:
                prev_actions = ModelCatalog.get_action_placeholder(
                    action_space, "prev_action")
-                prev_rewards = tf.placeholder(
+                prev_rewards = tf1.placeholder(
                    tf.float32, [None], name="prev_reward")
-            explore = tf.placeholder_with_default(
+            explore = tf1.placeholder_with_default(
                True, (), name="is_exploring")
-            timestep = tf.placeholder(tf.int32, (), name="timestep")
+            timestep = tf1.placeholder(tf.int32, (), name="timestep")

        self._input_dict = {
            SampleBatch.CUR_OBS: obs,
@@ -137,7 +137,7 @@ class DynamicTFPolicy(TFPolicy):
            "is_training": self._get_is_training_placeholder(),
        }
        # Placeholder for RNN time-chunk valid lengths.
-        self._seq_lens = tf.placeholder(
+        self._seq_lens = tf1.placeholder(
            dtype=tf.int32, shape=[None], name="seq_lens")

        dist_class = dist_inputs = None
@@ -176,7 +176,7 @@ class DynamicTFPolicy(TFPolicy):
                self._seq_lens = existing_inputs["seq_lens"]
        else:
            self._state_in = [
-                tf.placeholder(shape=(None, ) + s.shape, dtype=s.dtype)
+                tf1.placeholder(shape=(None, ) + s.shape, dtype=s.dtype)
                for s in self.model.get_initial_state()
            ]

@@ -223,7 +223,7 @@ class DynamicTFPolicy(TFPolicy):
                    explore=explore)

        # Phase 1 init.
-        sess = tf.get_default_session() or tf.Session()
+        sess = tf1.get_default_session() or tf1.Session()
        if get_batch_divisibility_req:
            batch_divisibility_req = get_batch_divisibility_req(self)
        else:
@@ -343,7 +343,7 @@ class DynamicTFPolicy(TFPolicy):
            dummy_batch[k] = fake_array(v)

        # postprocessing might depend on variable init, so run it first here
-        self._sess.run(tf.global_variables_initializer())
+        self._sess.run(tf1.global_variables_initializer())

        postprocessed_batch = self.postprocess_trajectory(
            SampleBatch(dummy_batch))
@@ -380,7 +380,7 @@ class DynamicTFPolicy(TFPolicy):
                continue
            shape = (None, ) + v.shape[1:]
            dtype = np.float32 if v.dtype == np.float64 else v.dtype
-            placeholder = tf.placeholder(dtype, shape=shape, name=k)
+            placeholder = tf1.placeholder(dtype, shape=shape, name=k)
            train_batch[k] = placeholder

        for i, si in enumerate(self._state_in):
@@ -402,7 +402,7 @@ class DynamicTFPolicy(TFPolicy):
        if self._grad_stats_fn:
            self._stats_fetches.update(
                self._grad_stats_fn(self, train_batch, self._grads))
-        self._sess.run(tf.global_variables_initializer())
+        self._sess.run(tf1.global_variables_initializer())

    def _do_loss_init(self, train_batch):
        loss = self._loss_fn(self, self.model, self.dist_class, train_batch)
@@ -16,7 +16,7 @@ from ray.rllib.utils.annotations import override
 from ray.rllib.utils.framework import try_import_tf
 from ray.rllib.utils.spaces.space_utils import flatten_to_single_ndarray

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()
 logger = logging.getLogger(__name__)


@@ -239,7 +239,7 @@ def build_eager_tf_policy(name,
                )
            self.exploration = self._create_exploration()
            self._state_in = [
-                tf.convert_to_tensor(np.array([s]))
+                tf.convert_to_tensor([s])
                for s in self.model.get_initial_state()
            ]
            input_dict = {
@@ -266,7 +266,7 @@ def build_eager_tf_policy(name,
            if optimizer_fn:
                self._optimizer = optimizer_fn(self, config)
            else:
-                self._optimizer = tf.train.AdamOptimizer(config["lr"])
+                self._optimizer = tf1.train.AdamOptimizer(config["lr"])

            if after_init:
                after_init(self, observation_space, action_space, config)
@@ -618,8 +618,7 @@ def build_eager_tf_policy(name,
                SampleBatch.DONES: np.array([False], dtype=np.bool),
                SampleBatch.REWARDS: np.array([0], dtype=np.float32),
            }
-            if isinstance(self.action_space, Tuple) or isinstance(
-                    self.action_space, Dict):
+            if isinstance(self.action_space, (Dict, Tuple)):
                dummy_batch[SampleBatch.ACTIONS] = [
                    flatten_to_single_ndarray(self.action_space.sample())
                ]
@@ -640,7 +639,7 @@ def build_eager_tf_policy(name,
                dummy_batch["seq_lens"] = np.array([1], dtype=np.int32)

            # Convert everything to tensors.
-            dummy_batch = tf.nest.map_structure(tf.convert_to_tensor,
+            dummy_batch = tf.nest.map_structure(tf1.convert_to_tensor,
                                                dummy_batch)

            # for IMPALA which expects a certain sample batch size.
@@ -20,7 +20,7 @@ from ray.rllib.utils.annotations import DeveloperAPI
 from ray.rllib.utils.debug import summarize
 from ray.rllib.utils.framework import try_import_tf, try_import_torch

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()
 torch, _ = try_import_torch()

 logger = logging.getLogger(__name__)
@@ -203,7 +203,7 @@ def chop_into_sequences(episode_ids,
    seq_len = 0
    unique_ids = np.add(
        np.add(episode_ids, agent_indices),
-        np.array(unroll_ids) << 32)
+        np.array(unroll_ids, dtype=np.int64) << 32)
    for uid in unique_ids:
        if (prev_id is not None and uid != prev_id) or \
                seq_len >= max_seq_len:
@@ -11,7 +11,7 @@ from ray.rllib.utils.test_utils import check, framework_iterator
 from ray.rllib.utils.numpy import one_hot, fc, MIN_LOG_NN_OUTPUT, \
    MAX_LOG_NN_OUTPUT

-tf = try_import_tf()
+tf1, tf, tfv = try_import_tf()


 def do_test_log_likelihood(run,
--- a/Show More
+++ b/Show More