mirror of
https://github.com/wassname/ray.git
synced 2026-06-29 18:27:03 +08:00
[rllib] Add evaluation option to DQN agent (#3835)
* add eval * interval * multiagent minor fix * Update rllib.rst * Update ddpg.py * Update qmix.py
This commit is contained in:
@@ -440,7 +440,10 @@ class Agent(Trainable):
|
||||
self.local_evaluator.set_weights(weights)
|
||||
|
||||
@DeveloperAPI
|
||||
def make_local_evaluator(self, env_creator, policy_graph):
|
||||
def make_local_evaluator(self,
|
||||
env_creator,
|
||||
policy_graph,
|
||||
extra_config=None):
|
||||
"""Convenience method to return configured local evaluator."""
|
||||
|
||||
return self._make_evaluator(
|
||||
@@ -448,11 +451,14 @@ class Agent(Trainable):
|
||||
env_creator,
|
||||
policy_graph,
|
||||
0,
|
||||
# important: allow local tf to use more CPUs for optimization
|
||||
merge_dicts(self.config, {
|
||||
"tf_session_args": self.
|
||||
config["local_evaluator_tf_session_args"]
|
||||
}))
|
||||
merge_dicts(
|
||||
# important: allow local tf to use more CPUs for optimization
|
||||
merge_dicts(
|
||||
self.config, {
|
||||
"tf_session_args": self.
|
||||
config["local_evaluator_tf_session_args"]
|
||||
}),
|
||||
extra_config or {}))
|
||||
|
||||
@DeveloperAPI
|
||||
def make_remote_evaluators(self, env_creator, policy_graph, count):
|
||||
|
||||
@@ -27,6 +27,15 @@ DEFAULT_CONFIG = with_common_config({
|
||||
# target noise limit (bound)
|
||||
"noise_clip": 0.5,
|
||||
|
||||
# === Evaluation ===
|
||||
# Evaluate with epsilon=0 every `evaluation_interval` training iterations.
|
||||
# The evaluation stats will be reported under the "evaluation" metric key.
|
||||
# Note that evaluation is currently not parallelized, and that for Ape-X
|
||||
# metrics are already only reported for the lowest epsilon workers.
|
||||
"evaluation_interval": None,
|
||||
# Number of episodes to run per evaluation period.
|
||||
"evaluation_num_episodes": 10,
|
||||
|
||||
# === Model ===
|
||||
# Hidden layer sizes of the policy network
|
||||
"actor_hiddens": [64, 64],
|
||||
|
||||
@@ -2,14 +2,18 @@ from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import logging
|
||||
import time
|
||||
|
||||
from ray.rllib import optimizers
|
||||
from ray.rllib.agents.agent import Agent, with_common_config
|
||||
from ray.rllib.agents.dqn.dqn_policy_graph import DQNPolicyGraph
|
||||
from ray.rllib.evaluation.metrics import collect_metrics
|
||||
from ray.rllib.utils.annotations import override
|
||||
from ray.rllib.utils.schedules import ConstantSchedule, LinearSchedule
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
OPTIMIZER_SHARED_CONFIGS = [
|
||||
"buffer_size", "prioritized_replay", "prioritized_replay_alpha",
|
||||
"prioritized_replay_beta", "schedule_max_timesteps",
|
||||
@@ -41,6 +45,15 @@ DEFAULT_CONFIG = with_common_config({
|
||||
# N-step Q learning
|
||||
"n_step": 1,
|
||||
|
||||
# === Evaluation ===
|
||||
# Evaluate with epsilon=0 every `evaluation_interval` training iterations.
|
||||
# The evaluation stats will be reported under the "evaluation" metric key.
|
||||
# Note that evaluation is currently not parallelized, and that for Ape-X
|
||||
# metrics are already only reported for the lowest epsilon workers.
|
||||
"evaluation_interval": None,
|
||||
# Number of episodes to run per evaluation period.
|
||||
"evaluation_num_episodes": 10,
|
||||
|
||||
# === Exploration ===
|
||||
# Max num timesteps for annealing schedules. Exploration is annealed from
|
||||
# 1.0 to exploration_fraction over this number of timesteps scaled by
|
||||
@@ -145,6 +158,16 @@ class DQNAgent(Agent):
|
||||
self.local_evaluator = self.make_local_evaluator(
|
||||
self.env_creator, self._policy_graph)
|
||||
|
||||
if self.config["evaluation_interval"]:
|
||||
self.evaluation_ev = self.make_local_evaluator(
|
||||
self.env_creator,
|
||||
self._policy_graph,
|
||||
extra_config={
|
||||
"batch_mode": "complete_episodes",
|
||||
"batch_steps": 1,
|
||||
})
|
||||
self.evaluation_metrics = self._evaluate()
|
||||
|
||||
def create_remote_evaluators():
|
||||
return self.make_remote_evaluators(self.env_creator,
|
||||
self._policy_graph,
|
||||
@@ -206,6 +229,12 @@ class DQNAgent(Agent):
|
||||
"max_exploration": max(exp_vals),
|
||||
"num_target_updates": self.num_target_updates,
|
||||
}, **self.optimizer.stats()))
|
||||
|
||||
if self.config["evaluation_interval"]:
|
||||
if self.iteration % self.config["evaluation_interval"] == 0:
|
||||
self.evaluation_metrics = self._evaluate()
|
||||
result.update(self.evaluation_metrics)
|
||||
|
||||
return result
|
||||
|
||||
def update_target_if_needed(self):
|
||||
@@ -220,6 +249,16 @@ class DQNAgent(Agent):
|
||||
def global_timestep(self):
|
||||
return self.optimizer.num_steps_sampled
|
||||
|
||||
def _evaluate(self):
    """Roll out evaluation episodes with exploration switched off.

    The saved state of ``self.local_evaluator`` is restored into
    ``self.evaluation_ev``, epsilon is set to 0 on every policy of the
    evaluation evaluator, and ``sample()`` is then called once per
    configured evaluation episode.

    Returns:
        dict: ``{"evaluation": metrics}``, where ``metrics`` is the value
            returned by ``collect_metrics`` for the evaluation evaluator.
    """
    num_episodes = self.config["evaluation_num_episodes"]
    logger.info(
        "Evaluating current policy for {} episodes".format(num_episodes))

    # Bring the dedicated evaluation evaluator in line with the state of
    # the evaluator used for training before measuring anything.
    evaluator = self.evaluation_ev
    evaluator.restore(self.local_evaluator.save())

    def _disable_exploration(policy, _policy_id):
        return policy.set_epsilon(0)

    evaluator.foreach_policy(_disable_exploration)

    # NOTE(review): the evaluation evaluator is built with
    # batch_mode="complete_episodes" and batch_steps=1, so each sample()
    # call presumably rolls out one whole episode -- confirm against
    # PolicyEvaluator.sample().
    for _ in range(num_episodes):
        evaluator.sample()

    return {"evaluation": collect_metrics(evaluator)}
|
||||
|
||||
def _make_exploration_schedule(self, worker_index):
|
||||
# Use either a different `eps` per worker, or a linear schedule.
|
||||
if self.config["per_worker_exploration"]:
|
||||
|
||||
@@ -104,9 +104,16 @@ class PPOAgent(Agent):
|
||||
self.local_evaluator.for_policy(
|
||||
lambda pi: pi.update_kl(fetches["kl"]))
|
||||
else:
|
||||
|
||||
def update(pi, pi_id):
|
||||
if pi_id in fetches:
|
||||
pi.update_kl(fetches[pi_id]["kl"])
|
||||
else:
|
||||
logger.debug(
|
||||
"No data for {}, not updating kl".format(pi_id))
|
||||
|
||||
# multi-agent
|
||||
self.local_evaluator.foreach_trainable_policy(
|
||||
lambda pi, pi_id: pi.update_kl(fetches[pi_id]["kl"]))
|
||||
self.local_evaluator.foreach_trainable_policy(update)
|
||||
res = self.optimizer.collect_metrics(
|
||||
self.config["collect_metrics_timeout"])
|
||||
res.update(
|
||||
|
||||
@@ -19,6 +19,15 @@ DEFAULT_CONFIG = with_common_config({
|
||||
# Optimize over complete episodes by default.
|
||||
"batch_mode": "complete_episodes",
|
||||
|
||||
# === Evaluation ===
|
||||
# Evaluate with epsilon=0 every `evaluation_interval` training iterations.
|
||||
# The evaluation stats will be reported under the "evaluation" metric key.
|
||||
# Note that evaluation is currently not parallelized, and that for Ape-X
|
||||
# metrics are already only reported for the lowest epsilon workers.
|
||||
"evaluation_interval": None,
|
||||
# Number of episodes to run per evaluation period.
|
||||
"evaluation_num_episodes": 10,
|
||||
|
||||
# === Exploration ===
|
||||
# Max num timesteps for annealing schedules. Exploration is annealed from
|
||||
# 1.0 to exploration_fraction over this number of timesteps scaled by
|
||||
|
||||
@@ -256,7 +256,8 @@ class PolicyEvaluator(EvaluatorInterface):
|
||||
policy_dict = _validate_and_canonicalize(policy_graph, self.env)
|
||||
self.policies_to_train = policies_to_train or list(policy_dict.keys())
|
||||
if _has_tensorflow_graph(policy_dict):
|
||||
if (ray.worker._mode() != ray.worker.LOCAL_MODE
|
||||
if (ray.is_initialized()
|
||||
and ray.worker._mode() != ray.worker.LOCAL_MODE
|
||||
and not ray.get_gpu_ids()):
|
||||
logger.info("Creating policy evaluation worker {}".format(
|
||||
worker_index) +
|
||||
@@ -591,6 +592,7 @@ class PolicyEvaluator(EvaluatorInterface):
|
||||
preprocessors = {}
|
||||
for name, (cls, obs_space, act_space,
|
||||
conf) in sorted(policy_dict.items()):
|
||||
logger.debug("Creating policy graph for {}".format(name))
|
||||
merged_conf = merge_dicts(policy_config, conf)
|
||||
if self.preprocessing_enabled:
|
||||
preprocessor = ModelCatalog.get_preprocessor_for_space(
|
||||
|
||||
@@ -4,6 +4,8 @@ from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
|
||||
import ray
|
||||
from ray.rllib.agents.dqn import DQNAgent
|
||||
from ray.rllib.agents.dqn.dqn_policy_graph import _adjust_nstep
|
||||
|
||||
|
||||
@@ -22,6 +24,21 @@ class DQNTest(unittest.TestCase):
|
||||
self.assertEqual(rewards,
|
||||
[91.0, 171.0, 271.0, 271.0, 271.0, 190.0, 100.0])
|
||||
|
||||
def testEvaluationOption(self):
    """Check that evaluation metrics refresh every `evaluation_interval`.

    With ``evaluation_interval=2`` the "evaluation" entry of the training
    result is expected to be recomputed on every second ``train()`` call
    and carried over unchanged on the calls in between.
    """
    ray.init()
    # Tear the Ray runtime down again even when an assertion fails;
    # otherwise it outlives this test and a later ray.init() in the same
    # process would fail.
    self.addCleanup(ray.shutdown)

    agent = DQNAgent(env="CartPole-v0", config={"evaluation_interval": 2})
    r0, r1, r2, r3, r4 = [agent.train() for _ in range(5)]

    # assertIn reports the missing key and the container on failure,
    # unlike assertTrue(key in container).
    self.assertIn("evaluation", r0)
    self.assertIn("episode_reward_mean", r0["evaluation"])

    # Results between two evaluation rounds reuse the previous metrics ...
    self.assertEqual(r0["evaluation"], r1["evaluation"])
    self.assertEqual(r2["evaluation"], r3["evaluation"])
    # ... and each new evaluation round produces fresh ones.
    self.assertNotEqual(r1["evaluation"], r2["evaluation"])
    self.assertNotEqual(r3["evaluation"], r4["evaluation"])
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main(verbosity=2)
|
||||
|
||||
Reference in New Issue
Block a user