mirror of
https://github.com/wassname/ray.git
synced 2026-07-01 16:02:03 +08:00
Enable seeding actors for reproducible experiments (#5197)
* enable graph-level worker-specific seed * lint checked * revised according to eric's suggestions * revised accordingly and added a test case * formatted * Update test_reproducibility.py * Update trainer.py * Update rollout_worker.py * Update run_rllib_tests.sh * Update worker_set.py
This commit is contained in:
@@ -193,6 +193,10 @@ COMMON_CONFIG = {
|
||||
# Minimum env steps to optimize for per train call. This value does
|
||||
# not affect learning, only the length of iterations.
|
||||
"timesteps_per_iteration": 0,
|
||||
# This argument, in conjunction with worker_index, sets the random seed of
|
||||
# each worker, so that identically configured trials will have identical
|
||||
# results. This makes experiments reproducible.
|
||||
"seed": None,
|
||||
|
||||
# === Offline Datasets ===
|
||||
# Specify how to generate experiences:
|
||||
|
||||
@@ -2,6 +2,8 @@ from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import random
|
||||
import numpy as np
|
||||
import gym
|
||||
import logging
|
||||
import pickle
|
||||
@@ -130,6 +132,7 @@ class RolloutWorker(EvaluatorInterface):
|
||||
remote_worker_envs=False,
|
||||
remote_env_batch_wait_ms=0,
|
||||
soft_horizon=False,
|
||||
seed=None,
|
||||
_fake_sampler=False):
|
||||
"""Initialize a rollout worker.
|
||||
|
||||
@@ -215,6 +218,8 @@ class RolloutWorker(EvaluatorInterface):
|
||||
step / reset and model inference perf.
|
||||
soft_horizon (bool): Calculate rewards but don't reset the
|
||||
environment when the horizon is hit.
|
||||
seed (int): Set the seed of both np and tf to this value to
|
||||
ensure each remote worker has unique exploration behavior.
|
||||
_fake_sampler (bool): Use a fake (inf speed) sampler for testing.
|
||||
"""
|
||||
|
||||
@@ -292,6 +297,10 @@ class RolloutWorker(EvaluatorInterface):
|
||||
self.tf_sess = None
|
||||
policy_dict = _validate_and_canonicalize(policy, self.env)
|
||||
self.policies_to_train = policies_to_train or list(policy_dict.keys())
|
||||
# set numpy and python seed
|
||||
if seed is not None:
|
||||
np.random.seed(seed)
|
||||
random.seed(seed)
|
||||
if _has_tensorflow_graph(policy_dict):
|
||||
if (ray.is_initialized()
|
||||
and ray.worker._mode() != ray.worker.LOCAL_MODE
|
||||
@@ -309,6 +318,9 @@ class RolloutWorker(EvaluatorInterface):
|
||||
config=tf.ConfigProto(
|
||||
gpu_options=tf.GPUOptions(allow_growth=True)))
|
||||
with self.tf_sess.as_default():
|
||||
# set graph-level seed
|
||||
if seed is not None:
|
||||
tf.set_random_seed(seed)
|
||||
self.policy_map, self.preprocessors = \
|
||||
self._build_policy_map(policy_dict, policy_config)
|
||||
else:
|
||||
|
||||
@@ -211,4 +211,6 @@ class WorkerSet(object):
|
||||
remote_worker_envs=config["remote_worker_envs"],
|
||||
remote_env_batch_wait_ms=config["remote_env_batch_wait_ms"],
|
||||
soft_horizon=config["soft_horizon"],
|
||||
seed=(config["seed"] + worker_index)
|
||||
if config["seed"] is not None else None,
|
||||
_fake_sampler=config.get("_fake_sampler", False))
|
||||
|
||||
@@ -0,0 +1,68 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import unittest
|
||||
|
||||
import ray
|
||||
from ray.rllib.agents.dqn import DQNTrainer
|
||||
from ray.tune.registry import register_env
|
||||
import numpy as np
|
||||
import gym
|
||||
|
||||
|
||||
class TestReproducibility(unittest.TestCase):
    """Checks that the ``seed`` trainer config makes training repeatable."""

    def testReproducingTrajectory(self):
        """Train DQN three times and compare the recorded reward extremes.

        Trials 0 and 1 share seed 666 and must produce identical
        trajectories; trial 2 uses seed 999 and is expected to differ from
        trial 1 in most entries.
        """

        class PickLargest(gym.Env):
            """Single-step env whose reward is the observed value picked."""

            def __init__(self):
                self.observation_space = gym.spaces.Box(
                    low=float("-inf"), high=float("inf"), shape=(4, ))
                self.action_space = gym.spaces.Discrete(4)

            def reset(self, **kwargs):
                # Drawn from numpy's global RNG, so the observation depends
                # on whatever seed was applied to that RNG beforehand.
                self.obs = np.random.randn(4)
                return self.obs

            def step(self, action):
                reward = self.obs[action]
                # Every episode terminates after one step (done=True).
                return self.obs, reward, True, {}

        def env_creator(env_config):
            return PickLargest()

        trajs = list()
        for trial in range(3):
            ray.init()
            # Always shut Ray down, even when trainer construction or
            # training raises; otherwise the still-running instance makes
            # any later ray.init() in this process fail.
            try:
                register_env("PickLargest", env_creator)
                agent = DQNTrainer(
                    env="PickLargest",
                    config={"seed": 666 if trial in [0, 1] else 999})

                trajectory = list()
                for _ in range(8):
                    r = agent.train()
                    trajectory.append(r["episode_reward_max"])
                    trajectory.append(r["episode_reward_min"])
                trajs.append(trajectory)
            finally:
                ray.shutdown()

        # trial0 and trial1 use same seed and thus
        # expect identical trajectories.
        # Collect (index, value0, value1) for every differing entry so that a
        # failure reports exactly where the two runs diverged.
        mismatches = [(idx, v0, v1)
                      for idx, (v0, v1) in enumerate(zip(trajs[0], trajs[1]))
                      if v0 != v1]
        self.assertEqual(mismatches, [])

        # trial1 and trial2 use different seeds and thus
        # most rewards tend to be different.
        diff_cnt = sum(1 for v1, v2 in zip(trajs[1], trajs[2]) if v1 != v2)
        self.assertGreater(diff_cnt, 8)
|
||||
|
||||
|
||||
# Run the tests with verbose output when this file is executed directly.
if __name__ == "__main__":
    unittest.main(verbosity=2)
|
||||
Reference in New Issue
Block a user