From f1b56fa5eead0468f8e31f13e69b8cc4715c795d Mon Sep 17 00:00:00 2001
From: Sven <svenmika1977@gmail.com>
Date: Thu, 2 Jan 2020 19:08:03 -0500
Subject: [PATCH] PG unify/cleanup tf vs torch and PG functionality test cases
 (tf + torch). (#6650)

* Unifying the code for PGTrainer/Policy wrt tf vs torch.
Adding loss function test cases for the PGAgent (confirm equivalence of tf and torch).

* Fix LINT line-len errors.

* Fix LINT errors.

* Fix `tf_pg_policy` imports (formerly: `pg_policy`).

* Rename tf_pg_... into pg_tf_... following <alg>_<framework>_... convention, where ...=policy/loss/agent/trainer.
Retire `PGAgent` class (use PGTrainer instead).

* - Move PG test into agents/pg/tests directory.
- All test cases will be located near the classes that are tested and
  then built into the Bazel/Travis test suite.

* Moved post_process_advantages into pg.py (from pg_tf_policy.py), b/c
the function is not a tf-specific one.

* Fix remaining import errors for agents/pg/...

* Fix circular dependency in pg imports.

* Add pg tests to Jenkins test suite.
---
 ci/jenkins_tests/run_rllib_tests.sh           |   3 +
 python/ray/tune/trainable.py                  |   7 +-
 rllib/agents/a3c/a3c.py                       |   2 +-
 rllib/agents/pg/__init__.py                   |   9 +-
 rllib/agents/pg/pg.py                         |  16 +--
 rllib/agents/pg/pg_policy.py                  |  37 ------
 rllib/agents/pg/pg_tf_policy.py               |  30 +++++
 rllib/agents/pg/pg_torch_policy.py            |  35 ++++++
 rllib/agents/pg/tests/__init__.py             |   0
 rllib/agents/pg/tests/test_pg.py              | 108 ++++++++++++++++++
 rllib/agents/pg/torch_pg_policy.py            |  40 -------
 rllib/evaluation/rollout_worker.py            |  12 +-
 .../rock_paper_scissors_multiagent.py         |   2 +-
 rllib/optimizers/async_samples_optimizer.py   |   1 -
 rllib/optimizers/segment_tree.py              |   1 -
 rllib/tests/agents/__init__.py                |   3 +
 rllib/tests/agents/functionality/__init__.py  |   3 +
 rllib/tests/test_external_multi_agent_env.py  |   2 +-
 rllib/tests/test_io.py                        |   2 +-
 rllib/tests/test_multi_agent_env.py           |   2 +-
 rllib/tests/test_nested_spaces.py             |   2 +-
 21 files changed, 215 insertions(+), 102 deletions(-)
 delete mode 100644 rllib/agents/pg/pg_policy.py
 create mode 100644 rllib/agents/pg/pg_tf_policy.py
 create mode 100644 rllib/agents/pg/pg_torch_policy.py
 create mode 100644 rllib/agents/pg/tests/__init__.py
 create mode 100644 rllib/agents/pg/tests/test_pg.py
 delete mode 100644 rllib/agents/pg/torch_pg_policy.py
 create mode 100644 rllib/tests/agents/__init__.py
 create mode 100644 rllib/tests/agents/functionality/__init__.py
diff --git a/ci/jenkins_tests/run_rllib_tests.sh b/ci/jenkins_tests/run_rllib_tests.sh
index 593a4f6c2..f86ec51c2 100755
--- a/ci/jenkins_tests/run_rllib_tests.sh
+++ b/ci/jenkins_tests/run_rllib_tests.sh
@@ -165,6 +165,9 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     --stop '{"training_iteration": 1}' \
     --config '{"num_workers": 2}'
 
+docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
+    /ray/ci/suppress_output python /ray/rllib/agents/pg/tests/test_pg.py
+
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
     /ray/ci/suppress_output /ray/rllib/train.py \
     --env CartPole-v0 \
diff --git a/python/ray/tune/trainable.py b/python/ray/tune/trainable.py
index 9e8c830a0..98d529945 100644
--- a/python/ray/tune/trainable.py
+++ b/python/ray/tune/trainable.py
@@ -124,8 +124,13 @@ class Trainable(object):
 
     @classmethod
     def resource_help(cls, config):
-        """Returns a help string for configuring this trainable's resources."""
+        """
+        Args:
+            config (dict): The Trainer's config dict.
 
+        Returns:
+             str: A help string for configuring this trainable's resources.
+        """
         return ""
 
     def current_ip(self):
diff --git a/rllib/agents/a3c/a3c.py b/rllib/agents/a3c/a3c.py
index d320b9636..9809ef464 100644
--- a/rllib/agents/a3c/a3c.py
+++ b/rllib/agents/a3c/a3c.py
@@ -12,7 +12,7 @@ from ray.rllib.optimizers import AsyncGradientsOptimizer
 DEFAULT_CONFIG = with_common_config({
     # Size of rollout batch
     "sample_batch_size": 10,
-    # Use PyTorch as backend - no LSTM support
+    # Use PyTorch as framework - no LSTM support
     "use_pytorch": False,
     # GAE(gamma) parameter
     "lambda": 1.0,
diff --git a/rllib/agents/pg/__init__.py b/rllib/agents/pg/__init__.py
index eb11c99bf..ae4a55a81 100644
--- a/rllib/agents/pg/__init__.py
+++ b/rllib/agents/pg/__init__.py
@@ -1,6 +1,7 @@
 from ray.rllib.agents.pg.pg import PGTrainer, DEFAULT_CONFIG
-from ray.rllib.utils import renamed_agent
+from ray.rllib.agents.pg.pg_tf_policy import pg_tf_loss, \
+    post_process_advantages
+from ray.rllib.agents.pg.pg_torch_policy import pg_torch_loss
 
-PGAgent = renamed_agent(PGTrainer)
-
-__all__ = ["PGAgent", "PGTrainer", "DEFAULT_CONFIG"]
+__all__ = ["PGTrainer", "pg_tf_loss", "pg_torch_loss",
+           "post_process_advantages", "DEFAULT_CONFIG"]
diff --git a/rllib/agents/pg/pg.py b/rllib/agents/pg/pg.py
index 71e2ab3fb..05bdba7e9 100644
--- a/rllib/agents/pg/pg.py
+++ b/rllib/agents/pg/pg.py
@@ -1,20 +1,16 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
 from ray.rllib.agents.trainer import with_common_config
 from ray.rllib.agents.trainer_template import build_trainer
-from ray.rllib.agents.pg.pg_policy import PGTFPolicy
+from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy
 
 # yapf: disable
 # __sphinx_doc_begin__
 DEFAULT_CONFIG = with_common_config({
-    # No remote workers by default
+    # No remote workers by default.
     "num_workers": 0,
-    # Learning rate
+    # Learning rate.
     "lr": 0.0004,
-    # Use PyTorch as backend
-    "use_pytorch": False,
+    # Use PyTorch as framework?
+    "use_pytorch": False
 })
 # __sphinx_doc_end__
 # yapf: enable
@@ -22,7 +18,7 @@ DEFAULT_CONFIG = with_common_config({
 
 def get_policy_class(config):
     if config["use_pytorch"]:
-        from ray.rllib.agents.pg.torch_pg_policy import PGTorchPolicy
+        from ray.rllib.agents.pg.pg_torch_policy import PGTorchPolicy
         return PGTorchPolicy
     else:
         return PGTFPolicy
diff --git a/rllib/agents/pg/pg_policy.py b/rllib/agents/pg/pg_policy.py
deleted file mode 100644
index 1b8f6a4b6..000000000
--- a/rllib/agents/pg/pg_policy.py
+++ /dev/null
@@ -1,37 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import ray
-from ray.rllib.evaluation.postprocessing import compute_advantages, \
-    Postprocessing
-from ray.rllib.policy.tf_policy_template import build_tf_policy
-from ray.rllib.policy.sample_batch import SampleBatch
-from ray.rllib.utils import try_import_tf
-
-tf = try_import_tf()
-
-
-# The basic policy gradients loss
-def policy_gradient_loss(policy, model, dist_class, train_batch):
-    logits, _ = model.from_batch(train_batch)
-    action_dist = dist_class(logits, model)
-    return -tf.reduce_mean(
-        action_dist.logp(train_batch[SampleBatch.ACTIONS]) *
-        train_batch[Postprocessing.ADVANTAGES])
-
-
-# This adds the "advantages" column to the sampletrain_batch.
-def postprocess_advantages(policy,
-                           sample_batch,
-                           other_agent_batches=None,
-                           episode=None):
-    return compute_advantages(
-        sample_batch, 0.0, policy.config["gamma"], use_gae=False)
-
-
-PGTFPolicy = build_tf_policy(
-    name="PGTFPolicy",
-    get_default_config=lambda: ray.rllib.agents.pg.pg.DEFAULT_CONFIG,
-    postprocess_fn=postprocess_advantages,
-    loss_fn=policy_gradient_loss)
diff --git a/rllib/agents/pg/pg_tf_policy.py b/rllib/agents/pg/pg_tf_policy.py
new file mode 100644
index 000000000..262836db5
--- /dev/null
+++ b/rllib/agents/pg/pg_tf_policy.py
@@ -0,0 +1,30 @@
+import ray
+from ray.rllib.evaluation.postprocessing import Postprocessing, \
+    compute_advantages
+from ray.rllib.policy.tf_policy_template import build_tf_policy
+from ray.rllib.policy.sample_batch import SampleBatch
+from ray.rllib.utils import try_import_tf
+
+tf = try_import_tf()
+
+
+def post_process_advantages(policy, sample_batch, other_agent_batches=None,
+                            episode=None):
+    """This adds the "advantages" column to the sample train_batch."""
+    return compute_advantages(sample_batch, 0.0, policy.config["gamma"],
+                              use_gae=False)
+
+
+def pg_tf_loss(policy, model, dist_class, train_batch):
+    """The basic policy gradients loss."""
+    logits, _ = model.from_batch(train_batch)
+    action_dist = dist_class(logits, model)
+    return -tf.reduce_mean(action_dist.logp(train_batch[SampleBatch.ACTIONS])
+                           * train_batch[Postprocessing.ADVANTAGES])
+
+
+PGTFPolicy = build_tf_policy(
+    name="PGTFPolicy",
+    get_default_config=lambda: ray.rllib.agents.pg.pg.DEFAULT_CONFIG,
+    postprocess_fn=post_process_advantages,
+    loss_fn=pg_tf_loss)
diff --git a/rllib/agents/pg/pg_torch_policy.py b/rllib/agents/pg/pg_torch_policy.py
new file mode 100644
index 000000000..4f6e8f405
--- /dev/null
+++ b/rllib/agents/pg/pg_torch_policy.py
@@ -0,0 +1,35 @@
+import ray
+from ray.rllib.agents.pg.pg_tf_policy import post_process_advantages
+from ray.rllib.evaluation.postprocessing import Postprocessing
+from ray.rllib.policy.sample_batch import SampleBatch
+from ray.rllib.policy.torch_policy_template import build_torch_policy
+from ray.rllib.utils.framework import try_import_torch
+
+torch, _ = try_import_torch()
+
+
+def pg_torch_loss(policy, model, dist_class, train_batch):
+    """The basic policy gradients loss."""
+    logits, _ = model.from_batch(train_batch)
+    action_dist = dist_class(logits, model)
+    log_probs = action_dist.logp(train_batch[SampleBatch.ACTIONS])
+    # Save the error in the policy object.
+    # policy.pi_err = -train_batch[Postprocessing.ADVANTAGES].dot(
+    # log_probs.reshape(-1)) / len(log_probs)
+    policy.pi_err = -torch.mean(
+        log_probs * train_batch[Postprocessing.ADVANTAGES]
+    )
+    return policy.pi_err
+
+
+def pg_loss_stats(policy, train_batch):
+    """ The error is recorded when computing the loss."""
+    return {"policy_loss": policy.pi_err.item()}
+
+
+PGTorchPolicy = build_torch_policy(
+    name="PGTorchPolicy",
+    get_default_config=lambda: ray.rllib.agents.pg.pg.DEFAULT_CONFIG,
+    loss_fn=pg_torch_loss,
+    stats_fn=pg_loss_stats,
+    postprocess_fn=post_process_advantages)
diff --git a/rllib/agents/pg/tests/__init__.py b/rllib/agents/pg/tests/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/rllib/agents/pg/tests/test_pg.py b/rllib/agents/pg/tests/test_pg.py
new file mode 100644
index 000000000..0971843a2
--- /dev/null
+++ b/rllib/agents/pg/tests/test_pg.py
@@ -0,0 +1,108 @@
+import numpy as np
+import unittest
+
+import ray
+import ray.rllib.agents.pg as pg
+from ray.rllib.evaluation.postprocessing import Postprocessing
+from ray.rllib.models.tf.tf_action_dist import Categorical
+from ray.rllib.models.torch.torch_action_dist import TorchCategorical
+from ray.rllib.policy.sample_batch import SampleBatch
+from ray.rllib.utils import check, fc
+
+
+class TestPG(unittest.TestCase):
+
+    ray.init()
+
+    def test_pg_compilation(self):
+        """Test whether a PGTrainer can be built with both frameworks."""
+        config = pg.DEFAULT_CONFIG.copy()
+        config["num_workers"] = 0  # Run locally.
+
+        # tf.
+        trainer = pg.PGTrainer(config=config, env="CartPole-v0")
+
+        num_iterations = 2
+        for i in range(num_iterations):
+            trainer.train()
+
+        # Torch.
+        config["use_pytorch"] = True
+        trainer = pg.PGTrainer(config=config, env="CartPole-v0")
+        for i in range(num_iterations):
+            trainer.train()
+
+    def test_pg_loss_functions(self):
+        """Tests the PG loss function math."""
+        config = pg.DEFAULT_CONFIG.copy()
+        config["num_workers"] = 0  # Run locally.
+        config["eager"] = True
+        config["gamma"] = 0.99
+        config["model"]["fcnet_hiddens"] = [10]
+        config["model"]["fcnet_activation"] = "linear"
+
+        # Fake CartPole episode of n timesteps.
+        train_batch = {
+            SampleBatch.CUR_OBS: np.array([
+                [0.1, 0.2, 0.3, 0.4],
+                [0.5, 0.6, 0.7, 0.8],
+                [0.9, 1.0, 1.1, 1.2]
+            ]),
+            SampleBatch.ACTIONS: np.array([0, 1, 1]),
+            SampleBatch.REWARDS: np.array([1.0, 1.0, 1.0]),
+            SampleBatch.DONES: np.array([False, False, True])
+        }
+
+        # tf.
+        trainer = pg.PGTrainer(config=config, env="CartPole-v0")
+        policy = trainer.get_policy()
+        vars = policy.model.trainable_variables()
+
+        # Post-process (calculate simple (non-GAE) advantages) and attach to
+        # train_batch dict.
+        # A = [0.99^2 * 1.0 + 0.99 * 1.0 + 1.0, 0.99 * 1.0 + 1.0, 1.0] =
+        # [2.9701, 1.99, 1.0]
+        train_batch = pg.post_process_advantages(policy, train_batch)
+        # Check Advantage values.
+        check(train_batch[Postprocessing.ADVANTAGES], [2.9701, 1.99, 1.0])
+
+        # Actual loss results.
+        results = pg.pg_tf_loss(
+            policy, policy.model, dist_class=Categorical,
+            train_batch=train_batch
+        )
+
+        # Calculate expected results.
+        expected_logits = fc(
+            fc(
+                train_batch[SampleBatch.CUR_OBS],
+                vars[0].numpy(), vars[1].numpy()
+            ),
+            vars[2].numpy(), vars[3].numpy()
+        )
+        expected_logp = Categorical(expected_logits, policy.model).logp(
+            train_batch[SampleBatch.ACTIONS]
+        )
+        expected_loss = -np.mean(
+            expected_logp * train_batch[Postprocessing.ADVANTAGES]
+        )
+        check(results.numpy(), expected_loss, decimals=4)
+
+        # Torch.
+        config["use_pytorch"] = True
+        trainer = pg.PGTrainer(config=config, env="CartPole-v0")
+        policy = trainer.get_policy()
+        train_batch = policy._lazy_tensor_dict(train_batch)
+        results = pg.pg_torch_loss(
+            policy, policy.model, dist_class=TorchCategorical,
+            train_batch=train_batch
+        )
+        expected_logits = policy.model._last_output
+        expected_logp = TorchCategorical(expected_logits, policy.model).logp(
+            train_batch[SampleBatch.ACTIONS]
+        )
+        expected_loss = -np.mean(
+            expected_logp.detach().numpy() *
+            train_batch[Postprocessing.ADVANTAGES].numpy()
+        )
+        check(results.detach().numpy(), expected_loss, decimals=4)
diff --git a/rllib/agents/pg/torch_pg_policy.py b/rllib/agents/pg/torch_pg_policy.py
deleted file mode 100644
index 362a86b9e..000000000
--- a/rllib/agents/pg/torch_pg_policy.py
+++ /dev/null
@@ -1,40 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import ray
-from ray.rllib.evaluation.postprocessing import compute_advantages, \
-    Postprocessing
-from ray.rllib.policy.sample_batch import SampleBatch
-from ray.rllib.policy.torch_policy_template import build_torch_policy
-
-
-def pg_torch_loss(policy, model, dist_class, train_batch):
-    logits, _ = model.from_batch(train_batch)
-    action_dist = dist_class(logits, model)
-    log_probs = action_dist.logp(train_batch[SampleBatch.ACTIONS])
-    # save the error in the policy object
-    policy.pi_err = -train_batch[Postprocessing.ADVANTAGES].dot(
-        log_probs.reshape(-1))
-    return policy.pi_err
-
-
-def postprocess_advantages(policy,
-                           sample_batch,
-                           other_agent_batches=None,
-                           episode=None):
-    return compute_advantages(
-        sample_batch, 0.0, policy.config["gamma"], use_gae=False)
-
-
-def pg_loss_stats(policy, train_batch):
-    # the error is recorded when computing the loss
-    return {"policy_loss": policy.pi_err.item()}
-
-
-PGTorchPolicy = build_torch_policy(
-    name="PGTorchPolicy",
-    get_default_config=lambda: ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG,
-    loss_fn=pg_torch_loss,
-    stats_fn=pg_loss_stats,
-    postprocess_fn=postprocess_advantages)
diff --git a/rllib/evaluation/rollout_worker.py b/rllib/evaluation/rollout_worker.py
index f40521ddd..2d2a42899 100644
--- a/rllib/evaluation/rollout_worker.py
+++ b/rllib/evaluation/rollout_worker.py
@@ -666,10 +666,18 @@ class RolloutWorker(EvaluatorInterface):
 
     @DeveloperAPI
     def foreach_trainable_policy(self, func):
-        """Apply the given function to each (policy, policy_id) tuple.
+        """
+        Applies the given function to each (policy, policy_id) tuple, which
+        can be found in `self.policies_to_train`.
 
-        This only applies func to policies in `self.policies_to_train`."""
+        Args:
+            func (callable): A function - taking a Policy and its ID - that is
+                called on all Policies within `self.policies_to_train`.
 
+        Returns:
+            List[any]: The list of n return values of all
+                `func([policy], [ID])`-calls.
+        """
         return [
             func(policy, pid) for pid, policy in self.policy_map.items()
             if pid in self.policies_to_train
diff --git a/rllib/examples/rock_paper_scissors_multiagent.py b/rllib/examples/rock_paper_scissors_multiagent.py
index eefa8085a..411d419b1 100644
--- a/rllib/examples/rock_paper_scissors_multiagent.py
+++ b/rllib/examples/rock_paper_scissors_multiagent.py
@@ -15,7 +15,7 @@ from gym.spaces import Discrete
 
 from ray import tune
 from ray.rllib.agents.pg.pg import PGTrainer
-from ray.rllib.agents.pg.pg_policy import PGTFPolicy
+from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy
 from ray.rllib.policy.policy import Policy
 from ray.rllib.env.multi_agent_env import MultiAgentEnv
 from ray.rllib.utils import try_import_tf
diff --git a/rllib/optimizers/async_samples_optimizer.py b/rllib/optimizers/async_samples_optimizer.py
index d102b7870..8eab14f3f 100644
--- a/rllib/optimizers/async_samples_optimizer.py
+++ b/rllib/optimizers/async_samples_optimizer.py
@@ -26,7 +26,6 @@ class AsyncSamplesOptimizer(PolicyOptimizer):
     This class coordinates the data transfers between the learner thread
     and remote workers (IMPALA actors).
     """
-
     def __init__(self,
                  workers,
                  train_batch_size=500,
diff --git a/rllib/optimizers/segment_tree.py b/rllib/optimizers/segment_tree.py
index 64e8b29f7..8f0a0b5a5 100644
--- a/rllib/optimizers/segment_tree.py
+++ b/rllib/optimizers/segment_tree.py
@@ -142,5 +142,4 @@ class MinSegmentTree(SegmentTree):
 
     def min(self, start=0, end=None):
         """Returns min(arr[start], ...,  arr[end])"""
-
         return super(MinSegmentTree, self).reduce(start, end)
diff --git a/rllib/tests/agents/__init__.py b/rllib/tests/agents/__init__.py
new file mode 100644
index 000000000..878841c18
--- /dev/null
+++ b/rllib/tests/agents/__init__.py
@@ -0,0 +1,3 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/rllib/tests/agents/functionality/__init__.py b/rllib/tests/agents/functionality/__init__.py
new file mode 100644
index 000000000..878841c18
--- /dev/null
+++ b/rllib/tests/agents/functionality/__init__.py
@@ -0,0 +1,3 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
diff --git a/rllib/tests/test_external_multi_agent_env.py b/rllib/tests/test_external_multi_agent_env.py
index be232c0bf..91a087c6b 100644
--- a/rllib/tests/test_external_multi_agent_env.py
+++ b/rllib/tests/test_external_multi_agent_env.py
@@ -8,7 +8,7 @@ import random
 import unittest
 
 import ray
-from ray.rllib.agents.pg.pg_policy import PGTFPolicy
+from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy
 from ray.rllib.optimizers import SyncSamplesOptimizer
 from ray.rllib.evaluation.rollout_worker import RolloutWorker
 from ray.rllib.evaluation.worker_set import WorkerSet
diff --git a/rllib/tests/test_io.py b/rllib/tests/test_io.py
index 359be1404..e189dcce3 100644
--- a/rllib/tests/test_io.py
+++ b/rllib/tests/test_io.py
@@ -15,7 +15,7 @@ import unittest
 
 import ray
 from ray.rllib.agents.pg import PGTrainer
-from ray.rllib.agents.pg.pg_policy import PGTFPolicy
+from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy
 from ray.rllib.evaluation import SampleBatch
 from ray.rllib.offline import IOContext, JsonWriter, JsonReader
 from ray.rllib.offline.json_writer import _to_json
diff --git a/rllib/tests/test_multi_agent_env.py b/rllib/tests/test_multi_agent_env.py
index e69ba6b1f..17e333f07 100644
--- a/rllib/tests/test_multi_agent_env.py
+++ b/rllib/tests/test_multi_agent_env.py
@@ -8,7 +8,7 @@ import unittest
 
 import ray
 from ray.rllib.agents.pg import PGTrainer
-from ray.rllib.agents.pg.pg_policy import PGTFPolicy
+from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy
 from ray.rllib.agents.dqn.dqn_policy import DQNTFPolicy
 from ray.rllib.optimizers import (SyncSamplesOptimizer, SyncReplayOptimizer,
                                   AsyncGradientsOptimizer)
diff --git a/rllib/tests/test_nested_spaces.py b/rllib/tests/test_nested_spaces.py
index d3283d54d..1127565dd 100644
--- a/rllib/tests/test_nested_spaces.py
+++ b/rllib/tests/test_nested_spaces.py
@@ -12,7 +12,7 @@ import unittest
 import ray
 from ray.rllib.agents.a3c import A2CTrainer
 from ray.rllib.agents.pg import PGTrainer
-from ray.rllib.agents.pg.pg_policy import PGTFPolicy
+from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy
 from ray.rllib.env import MultiAgentEnv
 from ray.rllib.env.base_env import BaseEnv
 from ray.rllib.env.vector_env import VectorEnv