From d24f19fd1e41638b56c6feb135c011a50e48936a Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Sat, 7 Jul 2018 13:29:20 -0700 Subject: [PATCH] [rllib] Fix stats collection and some docs bugs since the refactoring (#2361) * fix * fix pbt example * fix * fix * single thread by default * vec * fix * fix --- doc/source/rllib-models.rst | 2 +- python/ray/rllib/{README.rst => README.md} | 4 ++-- python/ray/rllib/agents/a3c/a3c.py | 6 ++--- python/ray/rllib/agents/agent.py | 10 ++++++-- python/ray/rllib/agents/dqn/dqn.py | 12 ++++++++-- python/ray/rllib/agents/pg/pg.py | 6 ++--- python/ray/rllib/agents/ppo/ppo.py | 10 +++++--- python/ray/rllib/evaluation/metrics.py | 2 -- .../ray/rllib/optimizers/policy_optimizer.py | 5 ++++ .../optimizers/sync_samples_optimizer.py | 3 +-- .../ray/rllib/test/test_supported_spaces.py | 9 ++++++++ .../ray/rllib/tuned_examples/pong-apex.yaml | 2 ++ python/ray/tune/examples/pbt_ppo_example.py | 23 ++++++++----------- 13 files changed, 60 insertions(+), 34 deletions(-) rename python/ray/rllib/{README.rst => README.md} (83%) diff --git a/doc/source/rllib-models.rst b/doc/source/rllib-models.rst index 4978dd4e2..938824ec5 100644 --- a/doc/source/rllib-models.rst +++ b/doc/source/rllib-models.rst @@ -51,7 +51,7 @@ For a full example of a custom model in code, see the `Carla RLlib model `__ and registered in the model catalog: +Similarly, custom preprocessors should subclass the RLlib `preprocessor class `__ and be registered in the model catalog: .. code-block:: python diff --git a/python/ray/rllib/README.rst b/python/ray/rllib/README.md similarity index 83% rename from python/ray/rllib/README.rst rename to python/ray/rllib/README.md index 32571cf14..2dfb979f5 100644 --- a/python/ray/rllib/README.rst +++ b/python/ray/rllib/README.md @@ -3,9 +3,9 @@ RLlib: Scalable Reinforcement Learning RLlib is an open-source library for reinforcement learning that offers both a collection of reference algorithms and scalable primitives for composing new ones. 
-For an overview of RLlib, see the `documentation <http://ray.readthedocs.io/en/latest/rllib.html>`__. +For an overview of RLlib, see the [documentation](http://ray.readthedocs.io/en/latest/rllib.html). -If you've found RLlib useful for your research, you can cite the `paper <https://arxiv.org/abs/1712.09381>`__ as follows: +If you've found RLlib useful for your research, you can cite the [paper](https://arxiv.org/abs/1712.09381) as follows: ``` @inproceedings{liang2018rllib, diff --git a/python/ray/rllib/agents/a3c/a3c.py b/python/ray/rllib/agents/a3c/a3c.py index d5f201459..264b70825 100644 --- a/python/ray/rllib/agents/a3c/a3c.py +++ b/python/ray/rllib/agents/a3c/a3c.py @@ -9,7 +9,6 @@ import ray from ray.rllib.agents.agent import Agent, with_common_config from ray.rllib.optimizers import AsyncGradientsOptimizer from ray.rllib.utils import FilterManager -from ray.rllib.evaluation.metrics import collect_metrics from ray.tune.trial import Resources DEFAULT_CONFIG = with_common_config({ @@ -98,12 +97,13 @@ class A3CAgent(Agent): self.config["optimizer"]) def _train(self): + prev_steps = self.optimizer.num_steps_sampled self.optimizer.step() FilterManager.synchronize( self.local_evaluator.filters, self.remote_evaluators) - result = collect_metrics(self.local_evaluator, self.remote_evaluators) + result = self.optimizer.collect_metrics() result = result._replace( - info=self.optimizer.stats()) + timesteps_this_iter=self.optimizer.num_steps_sampled - prev_steps) return result def _stop(self): diff --git a/python/ray/rllib/agents/agent.py b/python/ray/rllib/agents/agent.py index 9739d1f64..b92fccb9e 100644 --- a/python/ray/rllib/agents/agent.py +++ b/python/ray/rllib/agents/agent.py @@ -39,8 +39,14 @@ COMMON_CONFIG = { "model": {}, # Arguments to pass to the rllib optimizer "optimizer": {}, - # Override default TF session args if non-empty - "tf_session_args": {}, + # Configure TF for single-process operation by default + "tf_session_args": { + "intra_op_parallelism_threads": 1, + "inter_op_parallelism_threads": 1, + "gpu_options": { 
"allow_growth": True, + }, + }, # Whether to LZ4 compress observations "compress_observations": False, diff --git a/python/ray/rllib/agents/dqn/dqn.py b/python/ray/rllib/agents/dqn/dqn.py index f60b43fbe..adb4e427b 100644 --- a/python/ray/rllib/agents/dqn/dqn.py +++ b/python/ray/rllib/agents/dqn/dqn.py @@ -185,9 +185,17 @@ class DQNAgent(Agent): e.foreach_policy.remote(lambda p, _: p.set_epsilon(exp_val)) exp_vals.append(exp_val) - result = collect_metrics( - self.local_evaluator, self.remote_evaluators) + if self.config["per_worker_exploration"]: + # Only collect metrics from the third of workers with lowest eps + result = collect_metrics( + self.local_evaluator, + self.remote_evaluators[-len(self.remote_evaluators) // 3:]) + else: + result = collect_metrics( + self.local_evaluator, self.remote_evaluators) + return result._replace( + timesteps_this_iter=self.global_timestep - start_timestep, info=dict({ "min_exploration": min(exp_vals), "max_exploration": max(exp_vals), diff --git a/python/ray/rllib/agents/pg/pg.py b/python/ray/rllib/agents/pg/pg.py index b971c8126..0bd4c33b4 100644 --- a/python/ray/rllib/agents/pg/pg.py +++ b/python/ray/rllib/agents/pg/pg.py @@ -4,7 +4,6 @@ from __future__ import print_function from ray.rllib.agents.agent import Agent, with_common_config from ray.rllib.agents.pg.pg_policy_graph import PGPolicyGraph -from ray.rllib.evaluation.metrics import collect_metrics from ray.rllib.optimizers import SyncSamplesOptimizer from ray.tune.trial import Resources @@ -49,6 +48,7 @@ class PGAgent(Agent): self.config["optimizer"]) def _train(self): + prev_steps = self.optimizer.num_steps_sampled self.optimizer.step() - return collect_metrics( - self.optimizer.local_evaluator, self.optimizer.remote_evaluators) + return self.optimizer.collect_metrics()._replace( + timesteps_this_iter=self.optimizer.num_steps_sampled - prev_steps) diff --git a/python/ray/rllib/agents/ppo/ppo.py b/python/ray/rllib/agents/ppo/ppo.py index 0ed3e03be..a55e084ad 100644 --- 
a/python/ray/rllib/agents/ppo/ppo.py +++ b/python/ray/rllib/agents/ppo/ppo.py @@ -9,7 +9,6 @@ import pickle import ray from ray.rllib.agents import Agent, with_common_config from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicyGraph -from ray.rllib.evaluation.metrics import collect_metrics from ray.rllib.utils import FilterManager from ray.rllib.optimizers.multi_gpu_optimizer import LocalMultiGPUOptimizer from ray.tune.trial import Resources @@ -81,6 +80,8 @@ class PPOAgent(Agent): "timesteps_per_batch": self.config["timesteps_per_batch"]}) def _train(self): + prev_steps = self.optimizer.num_steps_sampled + def postprocess_samples(batch): # Divide by the maximum of value.std() and 1e-4 # to guard against the case where all values are equal @@ -92,6 +93,7 @@ class PPOAgent(Agent): if not self.config["use_gae"]: batch.data["value_targets"] = dummy batch.data["vf_preds"] = dummy + extra_fetches = self.optimizer.step(postprocess_fn=postprocess_samples) kl = np.array(extra_fetches["kl"]).mean(axis=1)[-1] total_loss = np.array(extra_fetches["total_loss"]).mean(axis=1)[-1] @@ -112,8 +114,10 @@ class PPOAgent(Agent): FilterManager.synchronize( self.local_evaluator.filters, self.remote_evaluators) - res = collect_metrics(self.local_evaluator, self.remote_evaluators) - res = res._replace(info=info) + res = self.optimizer.collect_metrics() + res = res._replace( + timesteps_this_iter=self.optimizer.num_steps_sampled - prev_steps, + info=dict(info, **res.info)) return res def _stop(self): diff --git a/python/ray/rllib/evaluation/metrics.py b/python/ray/rllib/evaluation/metrics.py index 5c5f0cfba..b3bc415a3 100644 --- a/python/ray/rllib/evaluation/metrics.py +++ b/python/ray/rllib/evaluation/metrics.py @@ -33,7 +33,6 @@ def collect_metrics(local_evaluator, remote_evaluators=[]): max_reward = float('nan') avg_reward = np.mean(episode_rewards) avg_length = np.mean(episode_lengths) - timesteps = np.sum(episode_lengths) for policy_id, rewards in policy_rewards.copy().items(): 
policy_rewards[policy_id] = np.mean(rewards) @@ -44,5 +43,4 @@ def collect_metrics(local_evaluator, remote_evaluators=[]): episode_reward_mean=avg_reward, episode_len_mean=avg_length, episodes_total=len(episode_lengths), - timesteps_this_iter=timesteps, policy_reward_mean=dict(policy_rewards)) diff --git a/python/ray/rllib/optimizers/policy_optimizer.py b/python/ray/rllib/optimizers/policy_optimizer.py index 5d78e5e82..c95136829 100644 --- a/python/ray/rllib/optimizers/policy_optimizer.py +++ b/python/ray/rllib/optimizers/policy_optimizer.py @@ -3,6 +3,7 @@ from __future__ import division from __future__ import print_function import ray +from ray.rllib.evaluation.metrics import collect_metrics from ray.rllib.evaluation.sample_batch import MultiAgentBatch @@ -104,6 +105,10 @@ class PolicyOptimizer(object): for i, ev in enumerate(self.remote_evaluators)]) return local_result + remote_results + def collect_metrics(self): + res = collect_metrics(self.local_evaluator, self.remote_evaluators) + return res._replace(info=self.stats()) + def _check_not_multiagent(self, sample_batch): if isinstance(sample_batch, MultiAgentBatch): raise NotImplementedError( diff --git a/python/ray/rllib/optimizers/sync_samples_optimizer.py b/python/ray/rllib/optimizers/sync_samples_optimizer.py index ba6eb4cef..5f4ba7164 100644 --- a/python/ray/rllib/optimizers/sync_samples_optimizer.py +++ b/python/ray/rllib/optimizers/sync_samples_optimizer.py @@ -17,12 +17,11 @@ class SyncSamplesOptimizer(PolicyOptimizer): model weights are then broadcast to all remote evaluators. 
""" - def _init(self, batch_size=32): + def _init(self): self.update_weights_timer = TimerStat() self.sample_timer = TimerStat() self.grad_timer = TimerStat() self.throughput = RunningStat() - self.batch_size = batch_size def step(self): with self.update_weights_timer: diff --git a/python/ray/rllib/test/test_supported_spaces.py b/python/ray/rllib/test/test_supported_spaces.py index cb14fa93b..1189168e8 100644 --- a/python/ray/rllib/test/test_supported_spaces.py +++ b/python/ray/rllib/test/test_supported_spaces.py @@ -5,6 +5,7 @@ import gym from gym.spaces import Box, Discrete, Tuple from gym.envs.registration import EnvSpec import numpy as np +import sys import ray from ray.rllib.agents.agent import get_agent_class @@ -117,4 +118,12 @@ class ModelSupportedSpaces(unittest.TestCase): if __name__ == "__main__": + if len(sys.argv) > 1 and sys.argv[1] == "--smoke": + ACTION_SPACES_TO_TEST = { + "discrete": Discrete(5), + } + OBSERVATION_SPACES_TO_TEST = { + "vector": Box(0.0, 1.0, (5,), dtype=np.float32), + "atari": Box(0.0, 1.0, (210, 160, 3), dtype=np.float32), + } unittest.main(verbosity=2) diff --git a/python/ray/rllib/tuned_examples/pong-apex.yaml b/python/ray/rllib/tuned_examples/pong-apex.yaml index 27b45f288..f2955da6c 100644 --- a/python/ray/rllib/tuned_examples/pong-apex.yaml +++ b/python/ray/rllib/tuned_examples/pong-apex.yaml @@ -7,5 +7,7 @@ pong-apex: config: target_network_update_freq: 50000 num_workers: 32 + ## can also enable vectorization within processes + # num_envs: 4 lr: .0001 gamma: 0.99 diff --git a/python/ray/tune/examples/pbt_ppo_example.py b/python/ray/tune/examples/pbt_ppo_example.py index 991499986..24dd9acff 100755 --- a/python/ray/tune/examples/pbt_ppo_example.py +++ b/python/ray/tune/examples/pbt_ppo_example.py @@ -52,28 +52,23 @@ if __name__ == "__main__": "env": "Humanoid-v1", "repeat": 8, "config": { - "kl_coeff": - 1.0, - "num_workers": - 8, - "devices": ["/gpu:0"], + "kl_coeff": 1.0, + "num_workers": 8, + "num_gpus": 1, "model": { 
"free_log_std": True }, # These params are tuned from a fixed starting value. - "lambda": - 0.95, - "clip_param": - 0.2, - "sgd_stepsize": - 1e-4, + "lambda": 0.95, + "clip_param": 0.2, + "sgd_stepsize": 1e-4, # These params start off randomly drawn from a set. "num_sgd_iter": - lambda spec: random.choice([10, 20, 30]), + lambda spec: random.choice([10, 20, 30]), "sgd_batchsize": - lambda spec: random.choice([128, 512, 2048]), + lambda spec: random.choice([128, 512, 2048]), "timesteps_per_batch": - lambda spec: random.choice([10000, 20000, 40000]) + lambda spec: random.choice([10000, 20000, 40000]) }, }, },