From f1b56fa5eead0468f8e31f13e69b8cc4715c795d Mon Sep 17 00:00:00 2001 From: Sven Date: Thu, 2 Jan 2020 19:08:03 -0500 Subject: [PATCH] PG unify/cleanup tf vs torch and PG functionality test cases (tf + torch). (#6650) * Unifying the code for PGTrainer/Policy wrt tf vs torch. Adding loss function test cases for the PGAgent (confirm equivalence of tf and torch). * Fix LINT line-len errors. * Fix LINT errors. * Fix `tf_pg_policy` imports (formerly: `pg_policy`). * Rename tf_pg_... into pg_tf_... following __... convention, where ...=policy/loss/agent/trainer. Retire `PGAgent` class (use PGTrainer instead). * - Move PG test into agents/pg/tests directory. - All test cases will be located near the classes that are tested and then built into the Bazel/Travis test suite. * Moved post_process_advantages into pg.py (from pg_tf_policy.py), b/c the function is not a tf-specific one. * Fix remaining import errors for agents/pg/... * Fix circular dependency in pg imports. * Add pg tests to Jenkins test suite. --- ci/jenkins_tests/run_rllib_tests.sh | 3 + python/ray/tune/trainable.py | 7 +- rllib/agents/a3c/a3c.py | 2 +- rllib/agents/pg/__init__.py | 9 +- rllib/agents/pg/pg.py | 16 +-- rllib/agents/pg/pg_policy.py | 37 ------ rllib/agents/pg/pg_tf_policy.py | 30 +++++ rllib/agents/pg/pg_torch_policy.py | 35 ++++++ rllib/agents/pg/tests/__init__.py | 0 rllib/agents/pg/tests/test_pg.py | 108 ++++++++++++++++++ rllib/agents/pg/torch_pg_policy.py | 40 ------- rllib/evaluation/rollout_worker.py | 12 +- .../rock_paper_scissors_multiagent.py | 2 +- rllib/optimizers/async_samples_optimizer.py | 1 - rllib/optimizers/segment_tree.py | 1 - rllib/tests/agents/__init__.py | 3 + rllib/tests/agents/functionality/__init__.py | 3 + rllib/tests/test_external_multi_agent_env.py | 2 +- rllib/tests/test_io.py | 2 +- rllib/tests/test_multi_agent_env.py | 2 +- rllib/tests/test_nested_spaces.py | 2 +- 21 files changed, 215 insertions(+), 102 deletions(-) delete mode 100644 rllib/agents/pg/pg_policy.py create mode 100644 rllib/agents/pg/pg_tf_policy.py create mode 100644 rllib/agents/pg/pg_torch_policy.py create mode 100644 rllib/agents/pg/tests/__init__.py create mode 100644 rllib/agents/pg/tests/test_pg.py delete mode 100644 rllib/agents/pg/torch_pg_policy.py create mode 100644 rllib/tests/agents/__init__.py create mode 100644 rllib/tests/agents/functionality/__init__.py diff --git a/ci/jenkins_tests/run_rllib_tests.sh b/ci/jenkins_tests/run_rllib_tests.sh index 593a4f6c2..f86ec51c2 100755 --- a/ci/jenkins_tests/run_rllib_tests.sh +++ b/ci/jenkins_tests/run_rllib_tests.sh @@ -165,6 +165,9 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ --stop '{"training_iteration": 1}' \ --config '{"num_workers": 2}' +docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ + /ray/ci/suppress_output python /ray/rllib/agents/pg/tests/test_pg.py + docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/ci/suppress_output /ray/rllib/train.py \ --env CartPole-v0 \ diff --git a/python/ray/tune/trainable.py b/python/ray/tune/trainable.py index 9e8c830a0..98d529945 100644 --- a/python/ray/tune/trainable.py +++ b/python/ray/tune/trainable.py @@ -124,8 +124,13 @@ class Trainable(object): @classmethod def resource_help(cls, config): - """Returns a help string for configuring this trainable's resources.""" + """ + Args: + config (dict): The Trainer's config dict. + Returns: + str: A help string for configuring this trainable's resources. + """ return "" def current_ip(self): diff --git a/rllib/agents/a3c/a3c.py b/rllib/agents/a3c/a3c.py index d320b9636..9809ef464 100644 --- a/rllib/agents/a3c/a3c.py +++ b/rllib/agents/a3c/a3c.py @@ -12,7 +12,7 @@ from ray.rllib.optimizers import AsyncGradientsOptimizer DEFAULT_CONFIG = with_common_config({ # Size of rollout batch "sample_batch_size": 10, - # Use PyTorch as backend - no LSTM support + # Use PyTorch as framework - no LSTM support "use_pytorch": False, # GAE(gamma) parameter "lambda": 1.0, diff --git a/rllib/agents/pg/__init__.py b/rllib/agents/pg/__init__.py index eb11c99bf..ae4a55a81 100644 --- a/rllib/agents/pg/__init__.py +++ b/rllib/agents/pg/__init__.py @@ -1,6 +1,7 @@ from ray.rllib.agents.pg.pg import PGTrainer, DEFAULT_CONFIG -from ray.rllib.utils import renamed_agent +from ray.rllib.agents.pg.pg_tf_policy import pg_tf_loss, \ + post_process_advantages +from ray.rllib.agents.pg.pg_torch_policy import pg_torch_loss -PGAgent = renamed_agent(PGTrainer) - -__all__ = ["PGAgent", "PGTrainer", "DEFAULT_CONFIG"] +__all__ = ["PGTrainer", "pg_tf_loss", "pg_torch_loss", + "post_process_advantages", "DEFAULT_CONFIG"] diff --git a/rllib/agents/pg/pg.py b/rllib/agents/pg/pg.py index 71e2ab3fb..05bdba7e9 100644 --- a/rllib/agents/pg/pg.py +++ b/rllib/agents/pg/pg.py @@ -1,20 +1,16 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - from ray.rllib.agents.trainer import with_common_config from ray.rllib.agents.trainer_template import build_trainer -from ray.rllib.agents.pg.pg_policy import PGTFPolicy +from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy # yapf: disable # __sphinx_doc_begin__ DEFAULT_CONFIG = with_common_config({ - # No remote workers by default + # No remote workers by default. "num_workers": 0, - # Learning rate + # Learning rate. "lr": 0.0004, - # Use PyTorch as backend - "use_pytorch": False, + # Use PyTorch as framework? + "use_pytorch": False }) # __sphinx_doc_end__ # yapf: enable @@ -22,7 +18,7 @@ DEFAULT_CONFIG = with_common_config({ def get_policy_class(config): if config["use_pytorch"]: - from ray.rllib.agents.pg.torch_pg_policy import PGTorchPolicy + from ray.rllib.agents.pg.pg_torch_policy import PGTorchPolicy return PGTorchPolicy else: return PGTFPolicy diff --git a/rllib/agents/pg/pg_policy.py b/rllib/agents/pg/pg_policy.py deleted file mode 100644 index 1b8f6a4b6..000000000 --- a/rllib/agents/pg/pg_policy.py +++ /dev/null @@ -1,37 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import ray -from ray.rllib.evaluation.postprocessing import compute_advantages, \ - Postprocessing -from ray.rllib.policy.tf_policy_template import build_tf_policy -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.utils import try_import_tf - -tf = try_import_tf() - - -# The basic policy gradients loss -def policy_gradient_loss(policy, model, dist_class, train_batch): - logits, _ = model.from_batch(train_batch) - action_dist = dist_class(logits, model) - return -tf.reduce_mean( - action_dist.logp(train_batch[SampleBatch.ACTIONS]) * - train_batch[Postprocessing.ADVANTAGES]) - - -# This adds the "advantages" column to the sampletrain_batch. -def postprocess_advantages(policy, - sample_batch, - other_agent_batches=None, - episode=None): - return compute_advantages( - sample_batch, 0.0, policy.config["gamma"], use_gae=False) - - -PGTFPolicy = build_tf_policy( - name="PGTFPolicy", - get_default_config=lambda: ray.rllib.agents.pg.pg.DEFAULT_CONFIG, - postprocess_fn=postprocess_advantages, - loss_fn=policy_gradient_loss) diff --git a/rllib/agents/pg/pg_tf_policy.py b/rllib/agents/pg/pg_tf_policy.py new file mode 100644 index 000000000..262836db5 --- /dev/null +++ b/rllib/agents/pg/pg_tf_policy.py @@ -0,0 +1,30 @@ +import ray +from ray.rllib.evaluation.postprocessing import Postprocessing, \ + compute_advantages +from ray.rllib.policy.tf_policy_template import build_tf_policy +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils import try_import_tf + +tf = try_import_tf() + + +def post_process_advantages(policy, sample_batch, other_agent_batches=None, + episode=None): + """This adds the "advantages" column to the sample train_batch.""" + return compute_advantages(sample_batch, 0.0, policy.config["gamma"], + use_gae=False) + + +def pg_tf_loss(policy, model, dist_class, train_batch): + """The basic policy gradients loss.""" + logits, _ = model.from_batch(train_batch) + action_dist = dist_class(logits, model) + return -tf.reduce_mean(action_dist.logp(train_batch[SampleBatch.ACTIONS]) + * train_batch[Postprocessing.ADVANTAGES]) + + +PGTFPolicy = build_tf_policy( + name="PGTFPolicy", + get_default_config=lambda: ray.rllib.agents.pg.pg.DEFAULT_CONFIG, + postprocess_fn=post_process_advantages, + loss_fn=pg_tf_loss) diff --git a/rllib/agents/pg/pg_torch_policy.py b/rllib/agents/pg/pg_torch_policy.py new file mode 100644 index 000000000..4f6e8f405 --- /dev/null +++ b/rllib/agents/pg/pg_torch_policy.py @@ -0,0 +1,35 @@ +import ray +from ray.rllib.agents.pg.pg_tf_policy import post_process_advantages +from ray.rllib.evaluation.postprocessing import Postprocessing +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.policy.torch_policy_template import build_torch_policy +from ray.rllib.utils.framework import try_import_torch + +torch, _ = try_import_torch() + + +def pg_torch_loss(policy, model, dist_class, train_batch): + """The basic policy gradients loss.""" + logits, _ = model.from_batch(train_batch) + action_dist = dist_class(logits, model) + log_probs = action_dist.logp(train_batch[SampleBatch.ACTIONS]) + # Save the error in the policy object. + # policy.pi_err = -train_batch[Postprocessing.ADVANTAGES].dot( + # log_probs.reshape(-1)) / len(log_probs) + policy.pi_err = -torch.mean( + log_probs * train_batch[Postprocessing.ADVANTAGES] + ) + return policy.pi_err + + +def pg_loss_stats(policy, train_batch): + """ The error is recorded when computing the loss.""" + return {"policy_loss": policy.pi_err.item()} + + +PGTorchPolicy = build_torch_policy( + name="PGTorchPolicy", + get_default_config=lambda: ray.rllib.agents.pg.pg.DEFAULT_CONFIG, + loss_fn=pg_torch_loss, + stats_fn=pg_loss_stats, + postprocess_fn=post_process_advantages) diff --git a/rllib/agents/pg/tests/__init__.py b/rllib/agents/pg/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/rllib/agents/pg/tests/test_pg.py b/rllib/agents/pg/tests/test_pg.py new file mode 100644 index 000000000..0971843a2 --- /dev/null +++ b/rllib/agents/pg/tests/test_pg.py @@ -0,0 +1,108 @@ +import numpy as np +import unittest + +import ray +import ray.rllib.agents.pg as pg +from ray.rllib.evaluation.postprocessing import Postprocessing +from ray.rllib.models.tf.tf_action_dist import Categorical +from ray.rllib.models.torch.torch_action_dist import TorchCategorical +from ray.rllib.policy.sample_batch import SampleBatch +from ray.rllib.utils import check, fc + + +class TestPG(unittest.TestCase): + + ray.init() + + def test_pg_compilation(self): + """Test whether a PGTrainer can be built with both frameworks.""" + config = pg.DEFAULT_CONFIG.copy() + config["num_workers"] = 0 # Run locally. + + # tf. + trainer = pg.PGTrainer(config=config, env="CartPole-v0") + + num_iterations = 2 + for i in range(num_iterations): + trainer.train() + + # Torch. + config["use_pytorch"] = True + trainer = pg.PGTrainer(config=config, env="CartPole-v0") + for i in range(num_iterations): + trainer.train() + + def test_pg_loss_functions(self): + """Tests the PG loss function math.""" + config = pg.DEFAULT_CONFIG.copy() + config["num_workers"] = 0 # Run locally. + config["eager"] = True + config["gamma"] = 0.99 + config["model"]["fcnet_hiddens"] = [10] + config["model"]["fcnet_activation"] = "linear" + + # Fake CartPole episode of n timesteps. + train_batch = { + SampleBatch.CUR_OBS: np.array([ + [0.1, 0.2, 0.3, 0.4], + [0.5, 0.6, 0.7, 0.8], + [0.9, 1.0, 1.1, 1.2] + ]), + SampleBatch.ACTIONS: np.array([0, 1, 1]), + SampleBatch.REWARDS: np.array([1.0, 1.0, 1.0]), + SampleBatch.DONES: np.array([False, False, True]) + } + + # tf. + trainer = pg.PGTrainer(config=config, env="CartPole-v0") + policy = trainer.get_policy() + vars = policy.model.trainable_variables() + + # Post-process (calculate simple (non-GAE) advantages) and attach to + # train_batch dict. + # A = [0.99^2 * 1.0 + 0.99 * 1.0 + 1.0, 0.99 * 1.0 + 1.0, 1.0] = + # [2.9701, 1.99, 1.0] + train_batch = pg.post_process_advantages(policy, train_batch) + # Check Advantage values. + check(train_batch[Postprocessing.ADVANTAGES], [2.9701, 1.99, 1.0]) + + # Actual loss results. + results = pg.pg_tf_loss( + policy, policy.model, dist_class=Categorical, + train_batch=train_batch + ) + + # Calculate expected results. + expected_logits = fc( + fc( + train_batch[SampleBatch.CUR_OBS], + vars[0].numpy(), vars[1].numpy() + ), + vars[2].numpy(), vars[3].numpy() + ) + expected_logp = Categorical(expected_logits, policy.model).logp( + train_batch[SampleBatch.ACTIONS] + ) + expected_loss = -np.mean( + expected_logp * train_batch[Postprocessing.ADVANTAGES] + ) + check(results.numpy(), expected_loss, decimals=4) + + # Torch. + config["use_pytorch"] = True + trainer = pg.PGTrainer(config=config, env="CartPole-v0") + policy = trainer.get_policy() + train_batch = policy._lazy_tensor_dict(train_batch) + results = pg.pg_torch_loss( + policy, policy.model, dist_class=TorchCategorical, + train_batch=train_batch + ) + expected_logits = policy.model._last_output + expected_logp = TorchCategorical(expected_logits, policy.model).logp( + train_batch[SampleBatch.ACTIONS] + ) + expected_loss = -np.mean( + expected_logp.detach().numpy() * + train_batch[Postprocessing.ADVANTAGES].numpy() + ) + check(results.detach().numpy(), expected_loss, decimals=4) diff --git a/rllib/agents/pg/torch_pg_policy.py b/rllib/agents/pg/torch_pg_policy.py deleted file mode 100644 index 362a86b9e..000000000 --- a/rllib/agents/pg/torch_pg_policy.py +++ /dev/null @@ -1,40 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import ray -from ray.rllib.evaluation.postprocessing import compute_advantages, \ - Postprocessing -from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.torch_policy_template import build_torch_policy - - -def pg_torch_loss(policy, model, dist_class, train_batch): - logits, _ = model.from_batch(train_batch) - action_dist = dist_class(logits, model) - log_probs = action_dist.logp(train_batch[SampleBatch.ACTIONS]) - # save the error in the policy object - policy.pi_err = -train_batch[Postprocessing.ADVANTAGES].dot( - log_probs.reshape(-1)) - return policy.pi_err - - -def postprocess_advantages(policy, - sample_batch, - other_agent_batches=None, - episode=None): - return compute_advantages( - sample_batch, 0.0, policy.config["gamma"], use_gae=False) - - -def pg_loss_stats(policy, train_batch): - # the error is recorded when computing the loss - return {"policy_loss": policy.pi_err.item()} - - -PGTorchPolicy = build_torch_policy( - name="PGTorchPolicy", - get_default_config=lambda: ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, - loss_fn=pg_torch_loss, - stats_fn=pg_loss_stats, - postprocess_fn=postprocess_advantages) diff --git a/rllib/evaluation/rollout_worker.py b/rllib/evaluation/rollout_worker.py index f40521ddd..2d2a42899 100644 --- a/rllib/evaluation/rollout_worker.py +++ b/rllib/evaluation/rollout_worker.py @@ -666,10 +666,18 @@ class RolloutWorker(EvaluatorInterface): @DeveloperAPI def foreach_trainable_policy(self, func): - """Apply the given function to each (policy, policy_id) tuple. + """ + Applies the given function to each (policy, policy_id) tuple, which + can be found in `self.policies_to_train`. - This only applies func to policies in `self.policies_to_train`.""" + Args: + func (callable): A function - taking a Policy and its ID - that is + called on all Policies within `self.policies_to_train`. + Returns: + List[any]: The list of n return values of all + `func([policy], [ID])`-calls. + """ return [ func(policy, pid) for pid, policy in self.policy_map.items() if pid in self.policies_to_train diff --git a/rllib/examples/rock_paper_scissors_multiagent.py b/rllib/examples/rock_paper_scissors_multiagent.py index eefa8085a..411d419b1 100644 --- a/rllib/examples/rock_paper_scissors_multiagent.py +++ b/rllib/examples/rock_paper_scissors_multiagent.py @@ -15,7 +15,7 @@ from gym.spaces import Discrete from ray import tune from ray.rllib.agents.pg.pg import PGTrainer -from ray.rllib.agents.pg.pg_policy import PGTFPolicy +from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy from ray.rllib.policy.policy import Policy from ray.rllib.env.multi_agent_env import MultiAgentEnv from ray.rllib.utils import try_import_tf diff --git a/rllib/optimizers/async_samples_optimizer.py b/rllib/optimizers/async_samples_optimizer.py index d102b7870..8eab14f3f 100644 --- a/rllib/optimizers/async_samples_optimizer.py +++ b/rllib/optimizers/async_samples_optimizer.py @@ -26,7 +26,6 @@ class AsyncSamplesOptimizer(PolicyOptimizer): This class coordinates the data transfers between the learner thread and remote workers (IMPALA actors). """ - def __init__(self, workers, train_batch_size=500, diff --git a/rllib/optimizers/segment_tree.py b/rllib/optimizers/segment_tree.py index 64e8b29f7..8f0a0b5a5 100644 --- a/rllib/optimizers/segment_tree.py +++ b/rllib/optimizers/segment_tree.py @@ -142,5 +142,4 @@ class MinSegmentTree(SegmentTree): def min(self, start=0, end=None): """Returns min(arr[start], ..., arr[end])""" - return super(MinSegmentTree, self).reduce(start, end) diff --git a/rllib/tests/agents/__init__.py b/rllib/tests/agents/__init__.py new file mode 100644 index 000000000..878841c18 --- /dev/null +++ b/rllib/tests/agents/__init__.py @@ -0,0 +1,3 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function diff --git a/rllib/tests/agents/functionality/__init__.py b/rllib/tests/agents/functionality/__init__.py new file mode 100644 index 000000000..878841c18 --- /dev/null +++ b/rllib/tests/agents/functionality/__init__.py @@ -0,0 +1,3 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function diff --git a/rllib/tests/test_external_multi_agent_env.py b/rllib/tests/test_external_multi_agent_env.py index be232c0bf..91a087c6b 100644 --- a/rllib/tests/test_external_multi_agent_env.py +++ b/rllib/tests/test_external_multi_agent_env.py @@ -8,7 +8,7 @@ import random import unittest import ray -from ray.rllib.agents.pg.pg_policy import PGTFPolicy +from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy from ray.rllib.optimizers import SyncSamplesOptimizer from ray.rllib.evaluation.rollout_worker import RolloutWorker from ray.rllib.evaluation.worker_set import WorkerSet diff --git a/rllib/tests/test_io.py b/rllib/tests/test_io.py index 359be1404..e189dcce3 100644 --- a/rllib/tests/test_io.py +++ b/rllib/tests/test_io.py @@ -15,7 +15,7 @@ import unittest import ray from ray.rllib.agents.pg import PGTrainer -from ray.rllib.agents.pg.pg_policy import PGTFPolicy +from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy from ray.rllib.evaluation import SampleBatch from ray.rllib.offline import IOContext, JsonWriter, JsonReader from ray.rllib.offline.json_writer import _to_json diff --git a/rllib/tests/test_multi_agent_env.py b/rllib/tests/test_multi_agent_env.py index e69ba6b1f..17e333f07 100644 --- a/rllib/tests/test_multi_agent_env.py +++ b/rllib/tests/test_multi_agent_env.py @@ -8,7 +8,7 @@ import unittest import ray from ray.rllib.agents.pg import PGTrainer -from ray.rllib.agents.pg.pg_policy import PGTFPolicy +from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy from ray.rllib.agents.dqn.dqn_policy import DQNTFPolicy from ray.rllib.optimizers import (SyncSamplesOptimizer, SyncReplayOptimizer, AsyncGradientsOptimizer) diff --git a/rllib/tests/test_nested_spaces.py b/rllib/tests/test_nested_spaces.py index d3283d54d..1127565dd 100644 --- a/rllib/tests/test_nested_spaces.py +++ b/rllib/tests/test_nested_spaces.py @@ -12,7 +12,7 @@ import unittest import ray from ray.rllib.agents.a3c import A2CTrainer from ray.rllib.agents.pg import PGTrainer -from ray.rllib.agents.pg.pg_policy import PGTFPolicy +from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy from ray.rllib.env import MultiAgentEnv from ray.rllib.env.base_env import BaseEnv from ray.rllib.env.vector_env import VectorEnv