[RLlib] Implement PPO torch version. (#6826)

2026-06-27 20:22:39 +08:00 · 2020-01-21 08:06:50 +01:00
parent 574abe844a
commit c957ed58ed
25 changed files with 555 additions and 93 deletions
@@ -34,6 +34,9 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
    --stop '{"training_iteration": 1}' \
    --config '{"num_workers": 2}'

+docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
+    /ray/ci/suppress_output python /ray/rllib/agents/ppo/tests/test_ppo.py
+
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
    /ray/ci/suppress_output /ray/rllib/train.py \
    --env CartPole-v1 \
@@ -305,9 +308,6 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
    /ray/ci/suppress_output python /ray/rllib/tests/test_dependency.py

-docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
-    /ray/ci/suppress_output python /ray/rllib/tests/test_legacy.py
-
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
    /ray/ci/suppress_output python /ray/rllib/tests/test_io.py

@@ -365,9 +365,6 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
    /ray/ci/suppress_output python /ray/rllib/tests/test_supported_spaces.py

-docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
-    /ray/ci/suppress_output python /ray/rllib/tests/test_env_with_subprocess.py
-
 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
    /ray/ci/suppress_output /ray/rllib/tests/test_rollout.sh

@@ -493,3 +490,6 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \

 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
    /ray/ci/suppress_output python /ray/rllib/examples/custom_keras_rnn_model.py --run=PPO --stop=50 --env=RepeatInitialEnv
+
+docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \
+    /ray/ci/suppress_output python /ray/rllib/tests/test_env_with_subprocess.py
@@ -537,7 +537,7 @@ You can use the ``with_updates`` method on Trainers and Policy objects built wit
 .. code-block:: python

    from ray.rllib.agents.ppo import PPOTrainer
-    from ray.rllib.agents.ppo.ppo_policy import PPOTFPolicy
+    from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy

    CustomPolicy = PPOTFPolicy.with_updates(
        name="MyCustomPPOTFPolicy",
@@ -1,7 +1,4 @@
 from ray.rllib.agents.ppo.ppo import PPOTrainer, DEFAULT_CONFIG
 from ray.rllib.agents.ppo.appo import APPOTrainer
-from ray.rllib.utils import renamed_agent

-PPOAgent = renamed_agent(PPOTrainer)
-
-__all__ = ["PPOAgent", "APPOTrainer", "PPOTrainer", "DEFAULT_CONFIG"]
+__all__ = ["APPOTrainer", "PPOTrainer", "DEFAULT_CONFIG"]
@@ -16,7 +16,7 @@ from ray.rllib.evaluation.postprocessing import compute_advantages
 from ray.rllib.utils import try_import_tf
 from ray.rllib.policy.tf_policy_template import build_tf_policy
 from ray.rllib.policy.tf_policy import LearningRateSchedule, TFPolicy
-from ray.rllib.agents.ppo.ppo_policy import KLCoeffMixin, ValueNetworkMixin
+from ray.rllib.agents.ppo.ppo_tf_policy import KLCoeffMixin, ValueNetworkMixin
 from ray.rllib.models import ModelCatalog
 from ray.rllib.utils.annotations import override
 from ray.rllib.utils.explained_variance import explained_variance
@@ -1,12 +1,13 @@
 import logging

 from ray.rllib.agents import with_common_config
-from ray.rllib.agents.ppo.ppo_policy import PPOTFPolicy
+from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy
 from ray.rllib.agents.trainer_template import build_trainer
 from ray.rllib.optimizers import SyncSamplesOptimizer, LocalMultiGPUOptimizer
 from ray.rllib.utils import try_import_tf

 tf = try_import_tf()
+
 logger = logging.getLogger(__name__)

 # yapf: disable
@@ -63,6 +64,8 @@ DEFAULT_CONFIG = with_common_config({
    # usually slower, but you might want to try it if you run into issues with
    # the default optimizer.
    "simple_optimizer": False,
+    # Use PyTorch as framework?
+    "use_pytorch": False
 })
 # __sphinx_doc_end__
 # yapf: enable
@@ -138,23 +141,18 @@ def warn_about_bad_reward_scales(trainer, result):
            "This means that it will take more than "
            "{} iterations for your value ".format(rew_scale) +
            "function to converge. If this is not intended, consider "
-            "increasing `vf_clip_param`."
-        )
+            "increasing `vf_clip_param`.")


 def validate_config(config):
-    # PyTorch check.
-    if config["use_pytorch"]:
-        raise ValueError("PPO does not support PyTorch yet! Use tf instead.")
    if config["entropy_coeff"] < 0:
        raise DeprecationWarning("entropy_coeff must be >= 0")
    if isinstance(config["entropy_coeff"], int):
        config["entropy_coeff"] = float(config["entropy_coeff"])
    if config["sgd_minibatch_size"] > config["train_batch_size"]:
        raise ValueError(
-            "Minibatch size {} must be <= train batch size {}.".
-            format(config["sgd_minibatch_size"], config["train_batch_size"])
-        )
+            "Minibatch size {} must be <= train batch size {}.".format(
+                config["sgd_minibatch_size"], config["train_batch_size"]))
    if config["batch_mode"] == "truncate_episodes" and not config["use_gae"]:
        raise ValueError(
            "Episode truncation is not supported without a value "
@@ -168,14 +166,23 @@ def validate_config(config):
        logger.warning(
            "Using the simple minibatch optimizer. This will significantly "
            "reduce performance, consider simple_optimizer=False.")
-    elif tf and tf.executing_eagerly():
+    elif config["use_pytorch"] or (tf and tf.executing_eagerly()):
        config["simple_optimizer"] = True  # multi-gpu not supported


+def get_policy_class(config):
+    if config.get("use_pytorch") is True:
+        from ray.rllib.agents.ppo.ppo_torch_policy import PPOTorchPolicy
+        return PPOTorchPolicy
+    else:
+        return PPOTFPolicy
+
+
 PPOTrainer = build_trainer(
    name="PPO",
    default_config=DEFAULT_CONFIG,
    default_policy=PPOTFPolicy,
+    get_policy_class=get_policy_class,
    make_policy_optimizer=choose_policy_optimizer,
    validate_config=validate_config,
    after_optimizer_step=update_kl,
@@ -1,11 +1,13 @@
 import logging

 import ray
+from ray.rllib.agents.impala.vtrace_policy import BEHAVIOUR_LOGITS
 from ray.rllib.evaluation.postprocessing import compute_advantages, \
    Postprocessing
 from ray.rllib.policy.sample_batch import SampleBatch
+from ray.rllib.policy.policy import ACTION_LOGP
 from ray.rllib.policy.tf_policy import LearningRateSchedule, \
-    EntropyCoeffSchedule, ACTION_LOGP
+    EntropyCoeffSchedule
 from ray.rllib.policy.tf_policy_template import build_tf_policy
 from ray.rllib.utils.explained_variance import explained_variance
 from ray.rllib.utils.tf_ops import make_tf_callable
@@ -15,13 +17,9 @@ tf = try_import_tf()

 logger = logging.getLogger(__name__)

-# Frozen logits of the policy that computed the action
-BEHAVIOUR_LOGITS = "behaviour_logits"
-

 class PPOLoss:
    def __init__(self,
-                 action_space,
                 dist_class,
                 model,
                 value_targets,
@@ -38,12 +36,10 @@ class PPOLoss:
                 clip_param=0.1,
                 vf_clip_param=0.1,
                 vf_loss_coeff=1.0,
-                 use_gae=True,
-                 model_config=None):
+                 use_gae=True):
        """Constructs the loss for Proximal Policy Objective.

        Arguments:
-            action_space: Environment observation space specification.
            dist_class: action distribution class for logits.
            value_targets (Placeholder): Placeholder for target values; used
                for GAE.
@@ -53,27 +49,32 @@ class PPOLoss:
                from previous model evaluation.
            prev_logits (Placeholder): Placeholder for logits output from
                previous model evaluation.
-            prev_actions_logp (Placeholder): Placeholder for prob output from
-                previous model evaluation.
+            prev_actions_logp (Placeholder): Placeholder for action prob output
+                from the previous (before update) Model evaluation.
            vf_preds (Placeholder): Placeholder for value function output
-                from previous model evaluation.
+                from the previous (before update) Model evaluation.
            curr_action_dist (ActionDistribution): ActionDistribution
                of the current model.
            value_fn (Tensor): Current value function output Tensor.
            cur_kl_coeff (Variable): Variable holding the current PPO KL
                coefficient.
-            valid_mask (Tensor): A bool mask of valid input elements (#2992).
+            valid_mask (Optional[tf.Tensor]): An optional bool mask of valid
+                input elements (for max-len padded sequences (RNNs)).
            entropy_coeff (float): Coefficient of the entropy regularizer.
            clip_param (float): Clip parameter
            vf_clip_param (float): Clip parameter for the value function
            vf_loss_coeff (float): Coefficient of the value function loss
            use_gae (bool): If true, use the Generalized Advantage Estimator.
-            model_config (dict): (Optional) model config for use in specifying
-                action distributions.
        """
+        if valid_mask is not None:

-        def reduce_mean_valid(t):
-            return tf.reduce_mean(tf.boolean_mask(t, valid_mask))
+            def reduce_mean_valid(t):
+                return tf.reduce_mean(tf.boolean_mask(t, valid_mask))
+
+        else:
+
+            def reduce_mean_valid(t):
+                return tf.reduce_mean(t)

        prev_dist = dist_class(prev_logits, model)
        # Make loss functions.
@@ -112,16 +113,13 @@ def ppo_surrogate_loss(policy, model, dist_class, train_batch):
    logits, state = model.from_batch(train_batch)
    action_dist = dist_class(logits, model)

+    mask = None
    if state:
        max_seq_len = tf.reduce_max(train_batch["seq_lens"])
        mask = tf.sequence_mask(train_batch["seq_lens"], max_seq_len)
        mask = tf.reshape(mask, [-1])
-    else:
-        mask = tf.ones_like(
-            train_batch[Postprocessing.ADVANTAGES], dtype=tf.bool)

    policy.loss_obj = PPOLoss(
-        policy.action_space,
        dist_class,
        model,
        train_batch[Postprocessing.VALUE_TARGETS],
@@ -139,7 +137,7 @@ def ppo_surrogate_loss(policy, model, dist_class, train_batch):
        vf_clip_param=policy.config["vf_clip_param"],
        vf_loss_coeff=policy.config["vf_loss_coeff"],
        use_gae=policy.config["use_gae"],
-        model_config=policy.config["model"])
+    )

    return policy.loss_obj.loss

@@ -0,0 +1,223 @@
+import logging
+
+import ray
+from ray.rllib.agents.impala.vtrace_policy import BEHAVIOUR_LOGITS
+from ray.rllib.agents.a3c.a3c_torch_policy import apply_grad_clipping
+from ray.rllib.agents.ppo.ppo_tf_policy import postprocess_ppo_gae, \
+    setup_config
+from ray.rllib.evaluation.postprocessing import Postprocessing
+from ray.rllib.policy.sample_batch import SampleBatch
+from ray.rllib.policy.policy import ACTION_LOGP
+from ray.rllib.policy.torch_policy import EntropyCoeffSchedule, \
+    LearningRateSchedule
+from ray.rllib.policy.torch_policy_template import build_torch_policy
+from ray.rllib.utils.explained_variance import explained_variance
+from ray.rllib.utils.torch_ops import sequence_mask
+from ray.rllib.utils import try_import_torch
+
+torch, nn = try_import_torch()
+
+logger = logging.getLogger(__name__)
+
+
+class PPOLoss:
+    def __init__(self,
+                 dist_class,
+                 model,
+                 value_targets,
+                 advantages,
+                 actions,
+                 prev_logits,
+                 prev_actions_logp,
+                 vf_preds,
+                 curr_action_dist,
+                 value_fn,
+                 cur_kl_coeff,
+                 valid_mask,
+                 entropy_coeff=0,
+                 clip_param=0.1,
+                 vf_clip_param=0.1,
+                 vf_loss_coeff=1.0,
+                 use_gae=True):
+        """Constructs the loss for Proximal Policy Objective.
+
+        Arguments:
+            dist_class: action distribution class for logits.
+            value_targets (Placeholder): Placeholder for target values; used
+                for GAE.
+            actions (Placeholder): Placeholder for actions taken
+                from previous model evaluation.
+            advantages (Placeholder): Placeholder for calculated advantages
+                from previous model evaluation.
+            prev_logits (Placeholder): Placeholder for logits output from
+                previous model evaluation.
+            prev_actions_logp (Placeholder): Placeholder for prob output from
+                previous model evaluation.
+            vf_preds (Placeholder): Placeholder for value function output
+                from previous model evaluation.
+            curr_action_dist (ActionDistribution): ActionDistribution
+                of the current model.
+            value_fn (Tensor): Current value function output Tensor.
+            cur_kl_coeff (Variable): Variable holding the current PPO KL
+                coefficient.
+            valid_mask (Tensor): A bool mask of valid input elements (#2992).
+            entropy_coeff (float): Coefficient of the entropy regularizer.
+            clip_param (float): Clip parameter
+            vf_clip_param (float): Clip parameter for the value function
+            vf_loss_coeff (float): Coefficient of the value function loss
+            use_gae (bool): If true, use the Generalized Advantage Estimator.
+        """
+
+        def reduce_mean_valid(t):
+            return torch.mean(t * valid_mask)
+
+        prev_dist = dist_class(prev_logits, model)
+        # Make loss functions.
+        logp_ratio = torch.exp(
+            curr_action_dist.logp(actions) - prev_actions_logp)
+        action_kl = prev_dist.kl(curr_action_dist)
+        self.mean_kl = reduce_mean_valid(action_kl)
+
+        curr_entropy = curr_action_dist.entropy()
+        self.mean_entropy = reduce_mean_valid(curr_entropy)
+
+        surrogate_loss = torch.min(
+            advantages * logp_ratio,
+            advantages * torch.clamp(logp_ratio, 1 - clip_param,
+                                     1 + clip_param))
+        self.mean_policy_loss = reduce_mean_valid(-surrogate_loss)
+
+        if use_gae:
+            vf_loss1 = torch.pow(value_fn - value_targets, 2.0)
+            vf_clipped = vf_preds + torch.clamp(value_fn - vf_preds,
+                                                -vf_clip_param, vf_clip_param)
+            vf_loss2 = torch.pow(vf_clipped - value_targets, 2.0)
+            vf_loss = torch.max(vf_loss1, vf_loss2)
+            self.mean_vf_loss = reduce_mean_valid(vf_loss)
+            loss = reduce_mean_valid(
+                -surrogate_loss + cur_kl_coeff * action_kl +
+                vf_loss_coeff * vf_loss - entropy_coeff * curr_entropy)
+        else:
+            self.mean_vf_loss = 0.0
+            loss = reduce_mean_valid(-surrogate_loss +
+                                     cur_kl_coeff * action_kl -
+                                     entropy_coeff * curr_entropy)
+        self.loss = loss
+
+
+def ppo_surrogate_loss(policy, model, dist_class, train_batch):
+    logits, state = model.from_batch(train_batch)
+    action_dist = dist_class(logits, model)
+
+    if state:
+        max_seq_len = torch.max(train_batch["seq_lens"])
+        mask = sequence_mask(train_batch["seq_lens"], max_seq_len)
+        mask = torch.reshape(mask, [-1])
+    else:
+        mask = torch.ones_like(
+            train_batch[Postprocessing.ADVANTAGES], dtype=torch.bool)
+
+    policy.loss_obj = PPOLoss(
+        dist_class,
+        model,
+        train_batch[Postprocessing.VALUE_TARGETS],
+        train_batch[Postprocessing.ADVANTAGES],
+        train_batch[SampleBatch.ACTIONS],
+        train_batch[BEHAVIOUR_LOGITS],
+        train_batch[ACTION_LOGP],
+        train_batch[SampleBatch.VF_PREDS],
+        action_dist,
+        model.value_function(),
+        policy.kl_coeff,
+        mask,
+        entropy_coeff=policy.entropy_coeff,
+        clip_param=policy.config["clip_param"],
+        vf_clip_param=policy.config["vf_clip_param"],
+        vf_loss_coeff=policy.config["vf_loss_coeff"],
+        use_gae=policy.config["use_gae"],
+    )
+
+    return policy.loss_obj.loss
+
+
+def kl_and_loss_stats(policy, train_batch):
+    return {
+        "cur_kl_coeff": policy.kl_coeff,
+        "cur_lr": policy.cur_lr,
+        "total_loss": policy.loss_obj.loss.cpu().detach().numpy(),
+        "policy_loss": policy.loss_obj.mean_policy_loss.cpu().detach().numpy(),
+        "vf_loss": policy.loss_obj.mean_vf_loss.cpu().detach().numpy(),
+        "vf_explained_var": explained_variance(
+            train_batch[Postprocessing.VALUE_TARGETS],
+            policy.model.value_function(),
+            framework="torch").cpu().detach().numpy(),
+        "kl": policy.loss_obj.mean_kl.cpu().detach().numpy(),
+        "entropy": policy.loss_obj.mean_entropy.cpu().detach().numpy(),
+        "entropy_coeff": policy.entropy_coeff,
+    }
+
+
+def vf_preds_and_logits_fetches(policy, input_dict, state_batches, model,
+                                action_dist):
+    """Adds value function and logits outputs to experience train_batches."""
+    return {
+        SampleBatch.VF_PREDS: policy.model.value_function(),
+        BEHAVIOUR_LOGITS: policy.model.last_output().numpy(),
+        ACTION_LOGP: action_dist.logp(input_dict[SampleBatch.ACTIONS])
+    }
+
+
+class KLCoeffMixin:
+    def __init__(self, config):
+        # KL Coefficient.
+        self.kl_coeff = config["kl_coeff"]
+        self.kl_target = config["kl_target"]
+
+    def update_kl(self, sampled_kl):
+        if sampled_kl > 2.0 * self.kl_target:
+            self.kl_coeff *= 1.5
+        elif sampled_kl < 0.5 * self.kl_target:
+            self.kl_coeff *= 0.5
+        return self.kl_coeff
+
+
+class ValueNetworkMixin:
+    def __init__(self, obs_space, action_space, config):
+        if config["use_gae"]:
+
+            def value(ob, prev_action, prev_reward, *state):
+                model_out, _ = self.model({
+                    SampleBatch.CUR_OBS: torch.Tensor([ob]),
+                    SampleBatch.PREV_ACTIONS: torch.Tensor([prev_action]),
+                    SampleBatch.PREV_REWARDS: torch.Tensor([prev_reward]),
+                    "is_training": False,
+                }, [torch.Tensor([s]) for s in state], torch.Tensor([1]))
+                return self.model.value_function()[0]
+
+        else:
+
+            def value(ob, prev_action, prev_reward, *state):
+                return 0.0
+
+        self._value = value
+
+
+def setup_mixins(policy, obs_space, action_space, config):
+    ValueNetworkMixin.__init__(policy, obs_space, action_space, config)
+    KLCoeffMixin.__init__(policy, config)
+    EntropyCoeffSchedule.__init__(policy, config["entropy_coeff"],
+                                  config["entropy_coeff_schedule"])
+    LearningRateSchedule.__init__(policy, config["lr"], config["lr_schedule"])
+
+
+PPOTorchPolicy = build_torch_policy(
+    name="PPOTorchPolicy",
+    get_default_config=lambda: ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG,
+    loss_fn=ppo_surrogate_loss,
+    stats_fn=kl_and_loss_stats,
+    extra_action_out_fn=vf_preds_and_logits_fetches,
+    postprocess_fn=postprocess_ppo_gae,
+    extra_grad_process_fn=apply_grad_clipping,
+    before_init=setup_config,
+    after_init=setup_mixins,
+    mixins=[KLCoeffMixin, ValueNetworkMixin])
@@ -0,0 +1,182 @@
+import numpy as np
+import unittest
+
+import ray
+from ray.rllib.agents.impala.vtrace_policy import BEHAVIOUR_LOGITS
+import ray.rllib.agents.ppo as ppo
+from ray.rllib.agents.ppo.ppo_tf_policy import postprocess_ppo_gae as \
+    postprocess_ppo_gae_tf, ppo_surrogate_loss as ppo_surrogate_loss_tf
+from ray.rllib.agents.ppo.ppo_torch_policy import postprocess_ppo_gae as \
+    postprocess_ppo_gae_torch, ppo_surrogate_loss as ppo_surrogate_loss_torch
+from ray.rllib.evaluation.postprocessing import Postprocessing
+from ray.rllib.models.tf.tf_action_dist import Categorical
+from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
+from ray.rllib.models.torch.torch_action_dist import TorchCategorical
+from ray.rllib.policy.policy import ACTION_LOGP
+from ray.rllib.policy.sample_batch import SampleBatch
+from ray.rllib.utils.numpy import fc
+from ray.rllib.utils.test_utils import check
+
+
+class TestPPO(unittest.TestCase):
+
+    ray.init()
+
+    def test_ppo_compilation(self):
+        """Test whether a PPOTrainer can be built with both frameworks."""
+        config = ppo.DEFAULT_CONFIG.copy()
+        config["num_workers"] = 0  # Run locally.
+
+        # tf.
+        trainer = ppo.PPOTrainer(config=config, env="CartPole-v0")
+
+        num_iterations = 2
+        for i in range(num_iterations):
+            trainer.train()
+
+        # Torch.
+        config["use_pytorch"] = True
+        config["simple_optimizer"] = True
+        trainer = ppo.PPOTrainer(config=config, env="CartPole-v0")
+        for i in range(num_iterations):
+            trainer.train()
+
+    def test_ppo_loss_function(self):
+        """Tests the PPO loss function math."""
+        config = ppo.DEFAULT_CONFIG.copy()
+        config["num_workers"] = 0  # Run locally.
+        config["eager"] = True
+        config["gamma"] = 0.99
+        config["model"]["fcnet_hiddens"] = [10]
+        config["model"]["fcnet_activation"] = "linear"
+
+        # Fake CartPole episode of n time steps.
+        train_batch = {
+            SampleBatch.CUR_OBS: np.array(
+                [[0.1, 0.2, 0.3, 0.4], [0.5, 0.6, 0.7, 0.8],
+                 [0.9, 1.0, 1.1, 1.2]],
+                dtype=np.float32),
+            SampleBatch.ACTIONS: np.array([0, 1, 1]),
+            SampleBatch.REWARDS: np.array([1.0, -1.0, .5], dtype=np.float32),
+            SampleBatch.DONES: np.array([False, False, True]),
+            SampleBatch.VF_PREDS: np.array([0.5, 0.6, 0.7], dtype=np.float32),
+            BEHAVIOUR_LOGITS: np.array(
+                [[-2., 0.5], [-3., -0.3], [-0.1, 2.5]], dtype=np.float32),
+            ACTION_LOGP: np.array([-0.5, -0.1, -0.2], dtype=np.float32)
+        }
+
+        # tf.
+        trainer = ppo.PPOTrainer(config=config, env="CartPole-v0")
+        policy = trainer.get_policy()
+
+        # Post-process (calculate simple (non-GAE) advantages) and attach to
+        # train_batch dict.
+        # A = [0.99^2 * 0.5 + 0.99 * -1.0 + 1.0, 0.99 * 0.5 - 1.0, 0.5] =
+        # [0.50005, -0.505, 0.5]
+        train_batch = postprocess_ppo_gae_tf(policy, train_batch)
+        # Check Advantage values.
+        check(train_batch[Postprocessing.VALUE_TARGETS],
+              [0.50005, -0.505, 0.5])
+
+        # Calculate actual PPO loss (results are stored in policy.loss_obj) for
+        # tf.
+        ppo_surrogate_loss_tf(policy, policy.model, Categorical, train_batch)
+
+        vars = policy.model.trainable_variables()
+        expected_logits = fc(
+            fc(train_batch[SampleBatch.CUR_OBS], vars[0].numpy(),
+               vars[1].numpy()), vars[4].numpy(), vars[5].numpy())
+        expected_value_outs = fc(
+            fc(train_batch[SampleBatch.CUR_OBS], vars[2].numpy(),
+               vars[3].numpy()), vars[6].numpy(), vars[7].numpy())
+
+        kl, entropy, pg_loss, vf_loss, overall_loss = \
+            self._ppo_loss_helper(
+                policy, policy.model, Categorical, train_batch,
+                expected_logits, expected_value_outs
+            )
+        check(kl, policy.loss_obj.mean_kl)
+        check(entropy, policy.loss_obj.mean_entropy)
+        check(np.mean(-pg_loss), policy.loss_obj.mean_policy_loss)
+        check(np.mean(vf_loss), policy.loss_obj.mean_vf_loss, decimals=4)
+        check(policy.loss_obj.loss.numpy(), overall_loss, decimals=4)
+
+        # Torch.
+        config["use_pytorch"] = True
+        trainer = ppo.PPOTrainer(config=config, env="CartPole-v0")
+        policy = trainer.get_policy()
+        train_batch = postprocess_ppo_gae_torch(policy, train_batch)
+        train_batch = policy._lazy_tensor_dict(train_batch)
+
+        # Check Advantage values.
+        check(train_batch[Postprocessing.VALUE_TARGETS],
+              [0.50005, -0.505, 0.5])
+
+        # Calculate actual PPO loss (results are stored in policy.loss_obj)
+        # for tf.
+        ppo_surrogate_loss_torch(policy, policy.model, TorchCategorical,
+                                 train_batch)
+
+        kl, entropy, pg_loss, vf_loss, overall_loss = \
+            self._ppo_loss_helper(
+                policy, policy.model, TorchCategorical, train_batch,
+                policy.model._last_output,
+                policy.model.value_function().detach().numpy()
+            )
+        check(kl, policy.loss_obj.mean_kl.detach().numpy())
+        check(entropy, policy.loss_obj.mean_entropy.detach().numpy())
+        check(
+            np.mean(-pg_loss),
+            policy.loss_obj.mean_policy_loss.detach().numpy())
+        check(
+            np.mean(vf_loss),
+            policy.loss_obj.mean_vf_loss.detach().numpy(),
+            decimals=4)
+        check(policy.loss_obj.loss.detach().numpy(), overall_loss, decimals=4)
+
+    def _ppo_loss_helper(self, policy, model, dist_class, train_batch, logits,
+                         vf_outs):
+        """
+        Calculates the expected PPO loss (components) given Policy,
+        Model, distribution, some batch, logits & vf outputs, using numpy.
+        """
+        # Calculate expected PPO loss results.
+        dist = dist_class(logits, policy.model)
+        dist_prev = dist_class(train_batch[BEHAVIOUR_LOGITS], policy.model)
+        expected_logp = dist.logp(train_batch[SampleBatch.ACTIONS])
+        if isinstance(model, TorchModelV2):
+            expected_rho = np.exp(expected_logp.detach().numpy() -
+                                  train_batch.get(ACTION_LOGP))
+            # KL(prev vs current action dist)-loss component.
+            kl = np.mean(dist_prev.kl(dist).detach().numpy())
+            # Entropy-loss component.
+            entropy = np.mean(dist.entropy().detach().numpy())
+        else:
+            expected_rho = np.exp(expected_logp - train_batch[ACTION_LOGP])
+            # KL(prev vs current action dist)-loss component.
+            kl = np.mean(dist_prev.kl(dist))
+            # Entropy-loss component.
+            entropy = np.mean(dist.entropy())
+
+        # Policy loss component.
+        pg_loss = np.minimum(
+            train_batch.get(Postprocessing.ADVANTAGES) * expected_rho,
+            train_batch.get(Postprocessing.ADVANTAGES) * np.clip(
+                expected_rho, 1 - policy.config["clip_param"],
+                1 + policy.config["clip_param"]))
+
+        # Value function loss component.
+        vf_loss1 = np.power(
+            vf_outs - train_batch.get(Postprocessing.VALUE_TARGETS), 2.0)
+        vf_clipped = train_batch.get(SampleBatch.VF_PREDS) + np.clip(
+            vf_outs - train_batch.get(SampleBatch.VF_PREDS),
+            -policy.config["vf_clip_param"], policy.config["vf_clip_param"])
+        vf_loss2 = np.power(
+            vf_clipped - train_batch.get(Postprocessing.VALUE_TARGETS), 2.0)
+        vf_loss = np.maximum(vf_loss1, vf_loss2)
+
+        # Overall loss.
+        overall_loss = np.mean(-pg_loss + policy.kl_coeff * kl +
+                               policy.config["vf_loss_coeff"] * vf_loss -
+                               policy.entropy_coeff * entropy)
+        return kl, entropy, pg_loss, vf_loss, overall_loss
@@ -17,7 +17,8 @@ class Postprocessing:

@DeveloperAPI
 def compute_advantages(rollout, last_r, gamma=0.9, lambda_=1.0, use_gae=True):
-    """Given a rollout, compute its value targets and the advantage.
+    """
+    Given a rollout, compute its value targets and the advantage.

    Args:
        rollout (SampleBatch): SampleBatch of a single trajectory
@@ -43,7 +44,7 @@ def compute_advantages(rollout, last_r, gamma=0.9, lambda_=1.0, use_gae=True):
             np.array([last_r])])
        delta_t = (
            traj[SampleBatch.REWARDS] + gamma * vpred_t[1:] - vpred_t[:-1])
-        # This formula for the advantage comes
+        # This formula for the advantage comes from:
        # "Generalized Advantage Estimation": https://arxiv.org/abs/1506.02438
        traj[Postprocessing.ADVANTAGES] = discount(delta_t, gamma * lambda_)
        traj[Postprocessing.VALUE_TARGETS] = (
@@ -17,9 +17,10 @@ import numpy as np
 from gym.spaces import Discrete

 from ray import tune
+from ray.rllib.agents.impala.vtrace_policy import BEHAVIOUR_LOGITS
 from ray.rllib.agents.ppo.ppo import PPOTrainer
-from ray.rllib.agents.ppo.ppo_policy import PPOTFPolicy, KLCoeffMixin, \
-    PPOLoss, BEHAVIOUR_LOGITS
+from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy, KLCoeffMixin, \
+    PPOLoss
 from ray.rllib.evaluation.postprocessing import compute_advantages, \
    Postprocessing
 from ray.rllib.examples.twostep_game import TwoStepGame
@@ -15,7 +15,7 @@ import ray
 from ray.rllib.agents.dqn.dqn import DQNTrainer
 from ray.rllib.agents.dqn.dqn_policy import DQNTFPolicy
 from ray.rllib.agents.ppo.ppo import PPOTrainer
-from ray.rllib.agents.ppo.ppo_policy import PPOTFPolicy
+from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy
 from ray.rllib.tests.test_multi_agent_env import MultiCartpole
 from ray.tune.logger import pretty_print
 from ray.tune.registry import register_env
@@ -24,6 +24,7 @@ from ray.rllib.models.modelv2 import ModelV2
 from ray.rllib.utils import try_import_tf
 from ray.rllib.utils.annotations import DeveloperAPI, PublicAPI
 from ray.rllib.utils.error import UnsupportedSpaceException
+from ray.rllib.utils.deprecation import deprecation_warning

 tf = try_import_tf()

@@ -105,20 +106,31 @@ class ModelCatalog:

    @staticmethod
    @DeveloperAPI
-    def get_action_dist(action_space, config, dist_type=None, torch=False):
-        """Returns action distribution class and size for the given action space.
+    def get_action_dist(
+        action_space, config, dist_type=None, torch=None,
+        framework="tf"
+    ):
+        """
+        Returns action distribution class and size for the given action space.

        Args:
            action_space (Space): Action space of the target gym env.
            config (dict): Optional model config.
            dist_type (str): Optional identifier of the action distribution.
-            torch (bool):  Optional whether to return PyTorch distribution.
+            torch (bool): Obsoleted: Whether to return PyTorch Model and
+                distribution (use framework="torch" instead).
+            framework (str): One of "tf" or "torch".

        Returns:
            dist_class (ActionDistribution): Python class of the distribution.
            dist_dim (int): The size of the input vector to the distribution.
        """
+        # Obsoleted parameter `torch`:
+        if torch is not None:
+            deprecation_warning("`torch` parameter", "`framework`='tf|torch'")
+            framework = "torch" if torch else "tf"

+        dist = None
        config = config or MODEL_DEFAULTS
        if config.get("custom_action_dist"):
            action_dist_name = config["custom_action_dist"]
@@ -135,15 +147,17 @@ class ModelCatalog:
                    "using a custom action distribution, "
                    "using a Tuple action space, or the multi-agent API.")
            if dist_type is None:
-                dist = TorchDiagGaussian if torch else DiagGaussian
+                dist = DiagGaussian if framework == "tf" else TorchDiagGaussian
            elif dist_type == "deterministic":
                dist = Deterministic
        elif isinstance(action_space, gym.spaces.Discrete):
-            dist = TorchCategorical if torch else Categorical
+            dist = Categorical if framework == "tf" else TorchCategorical
        elif isinstance(action_space, gym.spaces.Tuple):
-            if torch:
-                raise NotImplementedError("Tuple action spaces not supported "
-                                          "for Pytorch.")
+            if framework == "torch":
+                # TODO(sven): implement
+                raise NotImplementedError(
+                    "Tuple action spaces not supported for Pytorch."
+                )
            child_dist = []
            input_lens = []
            for action in action_space.spaces:
@@ -157,26 +171,33 @@ class ModelCatalog:
                action_space=action_space,
                input_lens=input_lens), sum(input_lens)
        elif isinstance(action_space, Simplex):
-            if torch:
-                raise NotImplementedError("Simplex action spaces not "
-                                          "supported for Pytorch.")
+            if framework == "torch":
+                # TODO(sven): implement
+                raise NotImplementedError(
+                    "Simplex action spaces not supported for Pytorch."
+                )
            dist = Dirichlet
        elif isinstance(action_space, gym.spaces.MultiDiscrete):
-            if torch:
-                raise NotImplementedError("MultiDiscrete action spaces not "
-                                          "supported for Pytorch.")
+            if framework == "torch":
+                # TODO(sven): implement
+                raise NotImplementedError(
+                    "MultiDiscrete action spaces not supported for Pytorch."
+                )
            return partial(MultiCategorical, input_lens=action_space.nvec), \
                int(sum(action_space.nvec))
        elif isinstance(action_space, gym.spaces.Dict):
+            # TODO(sven): implement
            raise NotImplementedError(
                "Dict action spaces are not supported, consider using "
-                "gym.spaces.Tuple instead")
+                "gym.spaces.Tuple instead"
+            )
+        else:
+            raise NotImplementedError(
+                "Unsupported args: {} {}".format(action_space, dist_type)
+            )

        return dist, dist.required_model_output_shape(action_space, config)

-        raise NotImplementedError("Unsupported args: {} {}".format(
-            action_space, dist_type))
-
    @staticmethod
    @DeveloperAPI
    def get_action_shape(action_space):
@@ -221,7 +221,7 @@ class Policy(metaclass=ABCMeta):
        Returns:
            weights (obj): Serializable copy or view of model weights
        """
-        pass
+        raise NotImplementedError

    @DeveloperAPI
    def set_weights(self, weights):
@@ -230,7 +230,7 @@ class Policy(metaclass=ABCMeta):
        Arguments:
            weights (obj): Serializable copy or view of model weights
        """
-        pass
+        raise NotImplementedError

    @DeveloperAPI
    def num_state_tensors(self):
@@ -72,9 +72,10 @@ class TorchPolicy(Policy):
            logits, state = model_out
            action_dist = self.dist_class(logits, self.model)
            actions = action_dist.sample()
+            input_dict[SampleBatch.ACTIONS] = actions
            return (actions.cpu().numpy(), [h.cpu().numpy() for h in state],
                    self.extra_action_out(input_dict, state_batches,
-                                          self.model))
+                                          self.model, action_dist))

    @override(Policy)
    def learn_on_batch(self, postprocessed_batch):
@@ -1,6 +1,7 @@
 from ray.rllib.policy.policy import Policy
 from ray.rllib.policy.torch_policy import TorchPolicy
 from ray.rllib.models.catalog import ModelCatalog
+from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
 from ray.rllib.utils import add_mixins
 from ray.rllib.utils.annotations import override, DeveloperAPI

@@ -67,9 +68,13 @@ def build_torch_policy(name,
            if make_model_and_action_dist:
                self.model, self.dist_class = make_model_and_action_dist(
                    self, obs_space, action_space, config)
+                # Make sure, we passed in a correct Model factory.
+                assert isinstance(self.model, TorchModelV2), \
+                    "ERROR: TorchPolicy::make_model_and_action_dist must " \
+                    "return a TorchModelV2 object!"
            else:
                self.dist_class, logit_dim = ModelCatalog.get_action_dist(
-                    action_space, self.config["model"], torch=True)
+                    action_space, self.config["model"], framework="torch")
                self.model = ModelCatalog.get_model_v2(
                    obs_space,
                    action_space,
@@ -30,11 +30,11 @@ class EnvWithSubprocess(gym.Env):
        self.subproc = subprocess.Popen(UNIQUE_CMD.split(" "), shell=False)
        self.config = config
        # Exit handler should be called
+        atexit.register(lambda: self.subproc.kill())
        if config.worker_index == 0:
            atexit.register(lambda: os.unlink(UNIQUE_FILE_0))
        else:
            atexit.register(lambda: os.unlink(UNIQUE_FILE_1))
-        atexit.register(lambda: self.subproc.kill())

    def close(self):
        if self.config.worker_index == 0:
@@ -76,7 +76,7 @@ if __name__ == "__main__":
            },
        },
    })
-    time.sleep(5.0)
+    time.sleep(10.0)
    leaked = leaked_processes()
    assert not leaked, "LEAKED PROCESSES: {}".format(leaked)
    assert not os.path.exists(UNIQUE_FILE_0), "atexit handler not called"
@@ -1,11 +0,0 @@
-from ray.rllib.agents.ppo import PPOAgent
-from ray import tune
-import ray
-
-if __name__ == "__main__":
-    ray.init()
-    # Test legacy *Agent classes work (renamed to Trainer)
-    tune.run(
-        PPOAgent,
-        config={"env": "CartPole-v0"},
-        stop={"training_iteration": 2})
@@ -10,7 +10,7 @@ from ray.rllib.optimizers import (SyncSamplesOptimizer, SyncReplayOptimizer,
                                  AsyncGradientsOptimizer)
 from ray.rllib.tests.test_rollout_worker import (MockEnv, MockEnv2, MockPolicy)
 from ray.rllib.evaluation.rollout_worker import RolloutWorker
-from ray.rllib.policy.policy import Policy
+from ray.rllib.policy.tests.test_policy import TestPolicy
 from ray.rllib.evaluation.metrics import collect_metrics
 from ray.rllib.evaluation.worker_set import WorkerSet
 from ray.rllib.env.base_env import _MultiAgentEnvToBaseEnv
@@ -441,7 +441,7 @@ class TestMultiAgentEnv(unittest.TestCase):
    def test_custom_rnn_state_values(self):
        h = {"some": {"arbitrary": "structure", "here": [1, 2, 3]}}

-        class StatefulPolicy(Policy):
+        class StatefulPolicy(TestPolicy):
            def compute_actions(self,
                                obs_batch,
                                state_batches=None,
@@ -5,7 +5,7 @@ import unittest

 import ray
 from ray.rllib.agents.ppo import PPOTrainer
-from ray.rllib.agents.ppo.ppo_policy import PPOTFPolicy
+from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy
 from ray.rllib.evaluation import SampleBatch
 from ray.rllib.evaluation.rollout_worker import RolloutWorker
 from ray.rllib.evaluation.worker_set import WorkerSet
@@ -10,14 +10,14 @@ from ray.rllib.agents.pg import PGTrainer
 from ray.rllib.agents.a3c import A2CTrainer
 from ray.rllib.evaluation.rollout_worker import RolloutWorker
 from ray.rllib.evaluation.metrics import collect_metrics
-from ray.rllib.policy.policy import Policy
+from ray.rllib.policy.tests.test_policy import TestPolicy
 from ray.rllib.evaluation.postprocessing import compute_advantages
 from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, SampleBatch
 from ray.rllib.env.vector_env import VectorEnv
 from ray.tune.registry import register_env


-class MockPolicy(Policy):
+class MockPolicy(TestPolicy):
    def compute_actions(self,
                        obs_batch,
                        state_batches=None,
@@ -22,7 +22,7 @@ atari-impala:
            [20000000, 0.000000000001],
        ]
        num_gpus: 1
-atari-ppo:
+atari-ppo-tf:
    env: BreakoutNoFrameskip-v4
    run: PPO
    num_samples: 4
@@ -45,6 +45,30 @@ atari-ppo:
        observation_filter: NoFilter
        vf_share_layers: true
        num_gpus: 1
+atari-ppo-torch:
+    env: BreakoutNoFrameskip-v4
+    run: PPO
+    num_samples: 4
+    stop:
+        time_total_s: 3600
+    config:
+        use_pytorch: true,
+        lambda: 0.95
+        kl_coeff: 0.5
+        clip_rewards: True
+        clip_param: 0.1
+        vf_clip_param: 10.0
+        entropy_coeff: 0.01
+        train_batch_size: 5000
+        sample_batch_size: 100
+        sgd_minibatch_size: 500
+        num_sgd_iter: 10
+        num_workers: 10
+        num_envs_per_worker: 5
+        batch_mode: truncate_episodes
+        observation_filter: NoFilter
+        vf_share_layers: true
+        num_gpus: 1
 apex:
    env: BreakoutNoFrameskip-v4
    run: APEX
@@ -1,4 +1,4 @@
-cartpole-ppo:
+cartpole-ppo-tf:
    env: CartPole-v0
    run: PPO
    stop:
@@ -5,9 +5,16 @@ logger = logging.getLogger(__name__)


 def deprecation_warning(old, new=None):
+    """
+    Logs a deprecation warning via the `logger` object.
+
+    Args:
+        old (str): A description of the "thing" that is to be deprecated.
+        new (Optional[str]): A description of the new "thing" that replaces it.
+    """
    logger.warning(
-        "DeprecationWarning: `{}` has been deprecated.".format(old) +
-        (" Use `{}` instead." if new else "") +
+        "DeprecationWarning: `{}` has been deprecated.{}".
+        format(old, (" Use `{}` instead.".format(new) if new else "")) +
        " This will raise an error in the future!"
    )

@@ -12,4 +12,10 @@ def explained_variance(y, pred, framework="tf"):
    else:
        y_var = torch.var(y, dim=[0])
        diff_var = torch.var(y - pred, dim=[0])
-        return max(-1.0, 1 - (diff_var / y_var))
+        min_ = torch.Tensor([-1.0])
+        return torch.max(
+            min_.to(
+                device=torch.device("cuda")
+            ) if torch.cuda.is_available() else min_,
+            1 - (diff_var / y_var)
+        )