[rllib] Add evaluation option to DQN agent (#3835)

* add eval

* interval

* multiagent minor fix

* Update rllib.rst

* Update ddpg.py

* Update qmix.py
This commit is contained in:
Eric Liang
2019-01-29 21:19:53 -08:00
committed by GitHub
parent c45b91dcca
commit 152375aa8a
8 changed files with 99 additions and 9 deletions
+12 -6
View File
@@ -440,7 +440,10 @@ class Agent(Trainable):
self.local_evaluator.set_weights(weights)
@DeveloperAPI
def make_local_evaluator(self, env_creator, policy_graph):
def make_local_evaluator(self,
env_creator,
policy_graph,
extra_config=None):
"""Convenience method to return configured local evaluator."""
return self._make_evaluator(
@@ -448,11 +451,14 @@ class Agent(Trainable):
env_creator,
policy_graph,
0,
# important: allow local tf to use more CPUs for optimization
merge_dicts(self.config, {
"tf_session_args": self.
config["local_evaluator_tf_session_args"]
}))
merge_dicts(
# important: allow local tf to use more CPUs for optimization
merge_dicts(
self.config, {
"tf_session_args": self.
config["local_evaluator_tf_session_args"]
}),
extra_config or {}))
@DeveloperAPI
def make_remote_evaluators(self, env_creator, policy_graph, count):
+9
View File
@@ -27,6 +27,15 @@ DEFAULT_CONFIG = with_common_config({
# target noise limit (bound)
"noise_clip": 0.5,
# === Evaluation ===
# Evaluate with epsilon=0 every `evaluation_interval` training iterations.
# The evaluation stats will be reported under the "evaluation" metric key.
# Note that evaluation is currently not parallelized, and that for Ape-X
# metrics are already only reported for the lowest epsilon workers.
"evaluation_interval": None,
# Number of episodes to run per evaluation period.
"evaluation_num_episodes": 10,
# === Model ===
# Hidden layer sizes of the policy network
"actor_hiddens": [64, 64],
+39
View File
@@ -2,14 +2,18 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import logging
import time
from ray.rllib import optimizers
from ray.rllib.agents.agent import Agent, with_common_config
from ray.rllib.agents.dqn.dqn_policy_graph import DQNPolicyGraph
from ray.rllib.evaluation.metrics import collect_metrics
from ray.rllib.utils.annotations import override
from ray.rllib.utils.schedules import ConstantSchedule, LinearSchedule
logger = logging.getLogger(__name__)
OPTIMIZER_SHARED_CONFIGS = [
"buffer_size", "prioritized_replay", "prioritized_replay_alpha",
"prioritized_replay_beta", "schedule_max_timesteps",
@@ -41,6 +45,15 @@ DEFAULT_CONFIG = with_common_config({
# N-step Q learning
"n_step": 1,
# === Evaluation ===
# Evaluate with epsilon=0 every `evaluation_interval` training iterations.
# The evaluation stats will be reported under the "evaluation" metric key.
# Note that evaluation is currently not parallelized, and that for Ape-X
# metrics are already only reported for the lowest epsilon workers.
"evaluation_interval": None,
# Number of episodes to run per evaluation period.
"evaluation_num_episodes": 10,
# === Exploration ===
# Max num timesteps for annealing schedules. Exploration is annealed from
# 1.0 to exploration_fraction over this number of timesteps scaled by
@@ -145,6 +158,16 @@ class DQNAgent(Agent):
self.local_evaluator = self.make_local_evaluator(
self.env_creator, self._policy_graph)
if self.config["evaluation_interval"]:
self.evaluation_ev = self.make_local_evaluator(
self.env_creator,
self._policy_graph,
extra_config={
"batch_mode": "complete_episodes",
"batch_steps": 1,
})
self.evaluation_metrics = self._evaluate()
def create_remote_evaluators():
return self.make_remote_evaluators(self.env_creator,
self._policy_graph,
@@ -206,6 +229,12 @@ class DQNAgent(Agent):
"max_exploration": max(exp_vals),
"num_target_updates": self.num_target_updates,
}, **self.optimizer.stats()))
if self.config["evaluation_interval"]:
if self.iteration % self.config["evaluation_interval"] == 0:
self.evaluation_metrics = self._evaluate()
result.update(self.evaluation_metrics)
return result
def update_target_if_needed(self):
@@ -220,6 +249,16 @@ class DQNAgent(Agent):
def global_timestep(self):
return self.optimizer.num_steps_sampled
def _evaluate(self):
    """Run evaluation rollouts with exploration disabled.

    Copies the state saved from ``self.local_evaluator`` into
    ``self.evaluation_ev``, sets epsilon to 0 on each of its policies,
    calls ``sample()`` once per configured evaluation episode, and then
    collects metrics from the evaluation evaluator.

    Returns:
        dict: the collected metrics, stored under the "evaluation" key.
    """
    num_episodes = self.config["evaluation_num_episodes"]
    logger.info(
        "Evaluating current policy for {} episodes".format(num_episodes))

    # Bring the evaluation copy up to date with the policy being trained.
    evaluator = self.evaluation_ev
    evaluator.restore(self.local_evaluator.save())

    def disable_exploration(policy, _):
        return policy.set_epsilon(0)

    evaluator.foreach_policy(disable_exploration)

    for _ in range(num_episodes):
        evaluator.sample()

    return {"evaluation": collect_metrics(evaluator)}
def _make_exploration_schedule(self, worker_index):
# Use either a different `eps` per worker, or a linear schedule.
if self.config["per_worker_exploration"]:
+9 -2
View File
@@ -104,9 +104,16 @@ class PPOAgent(Agent):
self.local_evaluator.for_policy(
lambda pi: pi.update_kl(fetches["kl"]))
else:
def update(pi, pi_id):
    """Feed the fetched "kl" value for `pi_id` into `pi`, if present.

    Policies with no entry in `fetches` are left untouched and a debug
    message is logged instead.
    """
    if pi_id not in fetches:
        logger.debug(
            "No data for {}, not updating kl".format(pi_id))
        return
    pi.update_kl(fetches[pi_id]["kl"])
# multi-agent
self.local_evaluator.foreach_trainable_policy(
lambda pi, pi_id: pi.update_kl(fetches[pi_id]["kl"]))
self.local_evaluator.foreach_trainable_policy(update)
res = self.optimizer.collect_metrics(
self.config["collect_metrics_timeout"])
res.update(
+9
View File
@@ -19,6 +19,15 @@ DEFAULT_CONFIG = with_common_config({
# Optimize over complete episodes by default.
"batch_mode": "complete_episodes",
# === Evaluation ===
# Evaluate with epsilon=0 every `evaluation_interval` training iterations.
# The evaluation stats will be reported under the "evaluation" metric key.
# Note that evaluation is currently not parallelized, and that for Ape-X
# metrics are already only reported for the lowest epsilon workers.
"evaluation_interval": None,
# Number of episodes to run per evaluation period.
"evaluation_num_episodes": 10,
# === Exploration ===
# Max num timesteps for annealing schedules. Exploration is annealed from
# 1.0 to exploration_fraction over this number of timesteps scaled by
@@ -256,7 +256,8 @@ class PolicyEvaluator(EvaluatorInterface):
policy_dict = _validate_and_canonicalize(policy_graph, self.env)
self.policies_to_train = policies_to_train or list(policy_dict.keys())
if _has_tensorflow_graph(policy_dict):
if (ray.worker._mode() != ray.worker.LOCAL_MODE
if (ray.is_initialized()
and ray.worker._mode() != ray.worker.LOCAL_MODE
and not ray.get_gpu_ids()):
logger.info("Creating policy evaluation worker {}".format(
worker_index) +
@@ -591,6 +592,7 @@ class PolicyEvaluator(EvaluatorInterface):
preprocessors = {}
for name, (cls, obs_space, act_space,
conf) in sorted(policy_dict.items()):
logger.debug("Creating policy graph for {}".format(name))
merged_conf = merge_dicts(policy_config, conf)
if self.preprocessing_enabled:
preprocessor = ModelCatalog.get_preprocessor_for_space(
+17
View File
@@ -4,6 +4,8 @@ from __future__ import print_function
import unittest
import ray
from ray.rllib.agents.dqn import DQNAgent
from ray.rllib.agents.dqn.dqn_policy_graph import _adjust_nstep
@@ -22,6 +24,21 @@ class DQNTest(unittest.TestCase):
self.assertEqual(rewards,
[91.0, 171.0, 271.0, 271.0, 271.0, 190.0, 100.0])
def testEvaluationOption(self):
    """Check that evaluation metrics are refreshed on the configured interval.

    Trains a DQN agent on CartPole-v0 with ``evaluation_interval=2`` for
    five iterations. The "evaluation" entry is expected to be identical
    within the result pairs (r0, r1) and (r2, r3), and to differ across
    the pair boundaries (r1 -> r2, r3 -> r4).
    """
    ray.init()
    # Always tear Ray down again, even if an assertion below fails, so the
    # runtime started here does not leak into whatever runs next.
    self.addCleanup(ray.shutdown)

    agent = DQNAgent(env="CartPole-v0", config={"evaluation_interval": 2})
    r0, r1, r2, r3, r4 = [agent.train() for _ in range(5)]

    # assertIn reports both operands on failure, unlike assertTrue(x in y).
    self.assertIn("evaluation", r0)
    self.assertIn("episode_reward_mean", r0["evaluation"])
    self.assertEqual(r0["evaluation"], r1["evaluation"])
    self.assertNotEqual(r1["evaluation"], r2["evaluation"])
    self.assertEqual(r2["evaluation"], r3["evaluation"])
    self.assertNotEqual(r3["evaluation"], r4["evaluation"])
if __name__ == '__main__':
unittest.main(verbosity=2)