From 152375aa8a5f08d678e4f7703d1576bb45590b46 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Tue, 29 Jan 2019 21:19:53 -0800 Subject: [PATCH] [rllib] Add evaluation option to DQN agent (#3835) * add eval * interval * multiagent minor fix * Update rllib.rst * Update ddpg.py * Update qmix.py --- doc/source/rllib.rst | 1 + python/ray/rllib/agents/agent.py | 18 ++++++--- python/ray/rllib/agents/ddpg/ddpg.py | 9 +++++ python/ray/rllib/agents/dqn/dqn.py | 39 +++++++++++++++++++ python/ray/rllib/agents/ppo/ppo.py | 11 +++++- python/ray/rllib/agents/qmix/qmix.py | 9 +++++ .../ray/rllib/evaluation/policy_evaluator.py | 4 +- python/ray/rllib/test/test_evaluators.py | 17 ++++++++ 8 files changed, 99 insertions(+), 9 deletions(-) diff --git a/doc/source/rllib.rst b/doc/source/rllib.rst index bcb4b605f..289558911 100644 --- a/doc/source/rllib.rst +++ b/doc/source/rllib.rst @@ -100,6 +100,7 @@ Development ----------- * `Development Install `__ +* `API Stability `__ * `Features `__ * `Benchmarks `__ * `Contributing Algorithms `__ diff --git a/python/ray/rllib/agents/agent.py b/python/ray/rllib/agents/agent.py index c45817ee9..8122e3491 100644 --- a/python/ray/rllib/agents/agent.py +++ b/python/ray/rllib/agents/agent.py @@ -440,7 +440,10 @@ class Agent(Trainable): self.local_evaluator.set_weights(weights) @DeveloperAPI - def make_local_evaluator(self, env_creator, policy_graph): + def make_local_evaluator(self, + env_creator, + policy_graph, + extra_config=None): """Convenience method to return configured local evaluator.""" return self._make_evaluator( @@ -448,11 +451,14 @@ class Agent(Trainable): env_creator, policy_graph, 0, - # important: allow local tf to use more CPUs for optimization - merge_dicts(self.config, { - "tf_session_args": self. - config["local_evaluator_tf_session_args"] - })) + merge_dicts( + # important: allow local tf to use more CPUs for optimization + merge_dicts( + self.config, { + "tf_session_args": self. + config["local_evaluator_tf_session_args"] + }), + extra_config or {})) @DeveloperAPI def make_remote_evaluators(self, env_creator, policy_graph, count): diff --git a/python/ray/rllib/agents/ddpg/ddpg.py b/python/ray/rllib/agents/ddpg/ddpg.py index 04aba0e3e..e2cb92ab0 100644 --- a/python/ray/rllib/agents/ddpg/ddpg.py +++ b/python/ray/rllib/agents/ddpg/ddpg.py @@ -27,6 +27,15 @@ DEFAULT_CONFIG = with_common_config({ # target noise limit (bound) "noise_clip": 0.5, + # === Evaluation === + # Evaluate with epsilon=0 every `evaluation_interval` training iterations. + # The evaluation stats will be reported under the "evaluation" metric key. + # Note that evaluation is currently not parallelized, and that for Ape-X + # metrics are already only reported for the lowest epsilon workers. + "evaluation_interval": None, + # Number of episodes to run per evaluation period. + "evaluation_num_episodes": 10, + # === Model === # Hidden layer sizes of the policy network "actor_hiddens": [64, 64], diff --git a/python/ray/rllib/agents/dqn/dqn.py b/python/ray/rllib/agents/dqn/dqn.py index e6d0263c8..c48f3dd69 100644 --- a/python/ray/rllib/agents/dqn/dqn.py +++ b/python/ray/rllib/agents/dqn/dqn.py @@ -2,14 +2,18 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import logging import time from ray.rllib import optimizers from ray.rllib.agents.agent import Agent, with_common_config from ray.rllib.agents.dqn.dqn_policy_graph import DQNPolicyGraph +from ray.rllib.evaluation.metrics import collect_metrics from ray.rllib.utils.annotations import override from ray.rllib.utils.schedules import ConstantSchedule, LinearSchedule +logger = logging.getLogger(__name__) + OPTIMIZER_SHARED_CONFIGS = [ "buffer_size", "prioritized_replay", "prioritized_replay_alpha", "prioritized_replay_beta", "schedule_max_timesteps", @@ -41,6 +45,15 @@ DEFAULT_CONFIG = with_common_config({ # N-step Q learning "n_step": 1, + # === Evaluation === + # Evaluate with epsilon=0 every `evaluation_interval` training iterations. + # The evaluation stats will be reported under the "evaluation" metric key. + # Note that evaluation is currently not parallelized, and that for Ape-X + # metrics are already only reported for the lowest epsilon workers. + "evaluation_interval": None, + # Number of episodes to run per evaluation period. + "evaluation_num_episodes": 10, + # === Exploration === # Max num timesteps for annealing schedules. Exploration is annealed from # 1.0 to exploration_fraction over this number of timesteps scaled by @@ -145,6 +158,16 @@ class DQNAgent(Agent): self.local_evaluator = self.make_local_evaluator( self.env_creator, self._policy_graph) + if self.config["evaluation_interval"]: + self.evaluation_ev = self.make_local_evaluator( + self.env_creator, + self._policy_graph, + extra_config={ + "batch_mode": "complete_episodes", + "batch_steps": 1, + }) + self.evaluation_metrics = self._evaluate() + def create_remote_evaluators(): return self.make_remote_evaluators(self.env_creator, self._policy_graph, @@ -206,6 +229,12 @@ class DQNAgent(Agent): "max_exploration": max(exp_vals), "num_target_updates": self.num_target_updates, }, **self.optimizer.stats())) + + if self.config["evaluation_interval"]: + if self.iteration % self.config["evaluation_interval"] == 0: + self.evaluation_metrics = self._evaluate() + result.update(self.evaluation_metrics) + return result def update_target_if_needed(self): @@ -220,6 +249,16 @@ class DQNAgent(Agent): def global_timestep(self): return self.optimizer.num_steps_sampled + def _evaluate(self): + logger.info("Evaluating current policy for {} episodes".format( + self.config["evaluation_num_episodes"])) + self.evaluation_ev.restore(self.local_evaluator.save()) + self.evaluation_ev.foreach_policy(lambda p, _: p.set_epsilon(0)) + for _ in range(self.config["evaluation_num_episodes"]): + self.evaluation_ev.sample() + metrics = collect_metrics(self.evaluation_ev) + return {"evaluation": metrics} + def _make_exploration_schedule(self, worker_index): # Use either a different `eps` per worker, or a linear schedule. if self.config["per_worker_exploration"]: diff --git a/python/ray/rllib/agents/ppo/ppo.py b/python/ray/rllib/agents/ppo/ppo.py index 59eba8ace..3e892e9de 100644 --- a/python/ray/rllib/agents/ppo/ppo.py +++ b/python/ray/rllib/agents/ppo/ppo.py @@ -104,9 +104,16 @@ class PPOAgent(Agent): self.local_evaluator.for_policy( lambda pi: pi.update_kl(fetches["kl"])) else: + + def update(pi, pi_id): + if pi_id in fetches: + pi.update_kl(fetches[pi_id]["kl"]) + else: + logger.debug( + "No data for {}, not updating kl".format(pi_id)) + # multi-agent - self.local_evaluator.foreach_trainable_policy( - lambda pi, pi_id: pi.update_kl(fetches[pi_id]["kl"])) + self.local_evaluator.foreach_trainable_policy(update) res = self.optimizer.collect_metrics( self.config["collect_metrics_timeout"]) res.update( diff --git a/python/ray/rllib/agents/qmix/qmix.py b/python/ray/rllib/agents/qmix/qmix.py index 2bc4c6b23..037f7b80c 100644 --- a/python/ray/rllib/agents/qmix/qmix.py +++ b/python/ray/rllib/agents/qmix/qmix.py @@ -19,6 +19,15 @@ DEFAULT_CONFIG = with_common_config({ # Optimize over complete episodes by default. "batch_mode": "complete_episodes", + # === Evaluation === + # Evaluate with epsilon=0 every `evaluation_interval` training iterations. + # The evaluation stats will be reported under the "evaluation" metric key. + # Note that evaluation is currently not parallelized, and that for Ape-X + # metrics are already only reported for the lowest epsilon workers. + "evaluation_interval": None, + # Number of episodes to run per evaluation period. + "evaluation_num_episodes": 10, + # === Exploration === # Max num timesteps for annealing schedules. Exploration is annealed from # 1.0 to exploration_fraction over this number of timesteps scaled by diff --git a/python/ray/rllib/evaluation/policy_evaluator.py b/python/ray/rllib/evaluation/policy_evaluator.py index cd0b97e5b..5902e63ec 100644 --- a/python/ray/rllib/evaluation/policy_evaluator.py +++ b/python/ray/rllib/evaluation/policy_evaluator.py @@ -256,7 +256,8 @@ class PolicyEvaluator(EvaluatorInterface): policy_dict = _validate_and_canonicalize(policy_graph, self.env) self.policies_to_train = policies_to_train or list(policy_dict.keys()) if _has_tensorflow_graph(policy_dict): - if (ray.worker._mode() != ray.worker.LOCAL_MODE + if (ray.is_initialized() + and ray.worker._mode() != ray.worker.LOCAL_MODE and not ray.get_gpu_ids()): logger.info("Creating policy evaluation worker {}".format( worker_index) + @@ -591,6 +592,7 @@ class PolicyEvaluator(EvaluatorInterface): preprocessors = {} for name, (cls, obs_space, act_space, conf) in sorted(policy_dict.items()): + logger.debug("Creating policy graph for {}".format(name)) merged_conf = merge_dicts(policy_config, conf) if self.preprocessing_enabled: preprocessor = ModelCatalog.get_preprocessor_for_space( diff --git a/python/ray/rllib/test/test_evaluators.py b/python/ray/rllib/test/test_evaluators.py index c7a72d7a5..ad76c7d4f 100644 --- a/python/ray/rllib/test/test_evaluators.py +++ b/python/ray/rllib/test/test_evaluators.py @@ -4,6 +4,8 @@ from __future__ import print_function import unittest +import ray +from ray.rllib.agents.dqn import DQNAgent from ray.rllib.agents.dqn.dqn_policy_graph import _adjust_nstep @@ -22,6 +24,21 @@ class DQNTest(unittest.TestCase): self.assertEqual(rewards, [91.0, 171.0, 271.0, 271.0, 271.0, 190.0, 100.0]) + def testEvaluationOption(self): + ray.init() + agent = DQNAgent(env="CartPole-v0", config={"evaluation_interval": 2}) + r0 = agent.train() + r1 = agent.train() + r2 = agent.train() + r3 = agent.train() + r4 = agent.train() + self.assertTrue("evaluation" in r0) + self.assertTrue("episode_reward_mean" in r0["evaluation"]) + self.assertEqual(r0["evaluation"], r1["evaluation"]) + self.assertNotEqual(r1["evaluation"], r2["evaluation"]) + self.assertEqual(r2["evaluation"], r3["evaluation"]) + self.assertNotEqual(r3["evaluation"], r4["evaluation"]) + if __name__ == '__main__': unittest.main(verbosity=2)