Enable seeding actors for reproducible experiments (#5197)

*  enable graph-level worker-specific seed

*  lint checked

*  revised according to eric's suggestions

*  revised accordingly and added a test case

*  formatted

* Update test_reproducibility.py

* Update trainer.py

* Update rollout_worker.py

* Update run_rllib_tests.sh

* Update worker_set.py
This commit is contained in:
Jones Wong
2019-07-18 14:31:34 +08:00
committed by Eric Liang
parent 63f49f95dd
commit 0af07bd493
5 changed files with 89 additions and 0 deletions
+4
View File
@@ -193,6 +193,10 @@ COMMON_CONFIG = {
# Minimum env steps to optimize for per train call. This value does
# not affect learning, only the length of iterations.
"timesteps_per_iteration": 0,
# This argument, in conjunction with worker_index, sets the random seed of
# each worker, so that identically configured trials will have identical
# results. This makes experiments reproducible.
"seed": None,
# === Offline Datasets ===
# Specify how to generate experiences:
@@ -2,6 +2,8 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import random
import numpy as np
import gym
import logging
import pickle
@@ -130,6 +132,7 @@ class RolloutWorker(EvaluatorInterface):
remote_worker_envs=False,
remote_env_batch_wait_ms=0,
soft_horizon=False,
seed=None,
_fake_sampler=False):
"""Initialize a rollout worker.
@@ -215,6 +218,8 @@ class RolloutWorker(EvaluatorInterface):
step / reset and model inference perf.
soft_horizon (bool): Calculate rewards but don't reset the
environment when the horizon is hit.
seed (int): Set the seed of Python's random, np, and tf to this
value to ensure each remote worker has unique exploration behavior.
_fake_sampler (bool): Use a fake (inf speed) sampler for testing.
"""
@@ -292,6 +297,10 @@ class RolloutWorker(EvaluatorInterface):
self.tf_sess = None
policy_dict = _validate_and_canonicalize(policy, self.env)
self.policies_to_train = policies_to_train or list(policy_dict.keys())
# set numpy and python seed
if seed is not None:
np.random.seed(seed)
random.seed(seed)
if _has_tensorflow_graph(policy_dict):
if (ray.is_initialized()
and ray.worker._mode() != ray.worker.LOCAL_MODE
@@ -309,6 +318,9 @@ class RolloutWorker(EvaluatorInterface):
config=tf.ConfigProto(
gpu_options=tf.GPUOptions(allow_growth=True)))
with self.tf_sess.as_default():
# set graph-level seed
if seed is not None:
tf.set_random_seed(seed)
self.policy_map, self.preprocessors = \
self._build_policy_map(policy_dict, policy_config)
else:
@@ -211,4 +211,6 @@ class WorkerSet(object):
remote_worker_envs=config["remote_worker_envs"],
remote_env_batch_wait_ms=config["remote_env_batch_wait_ms"],
soft_horizon=config["soft_horizon"],
seed=(config["seed"] + worker_index)
if config["seed"] is not None else None,
_fake_sampler=config.get("_fake_sampler", False))
@@ -0,0 +1,68 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import ray
from ray.rllib.agents.dqn import DQNTrainer
from ray.tune.registry import register_env
import numpy as np
import gym
class TestReproducibility(unittest.TestCase):
    """Checks that the ``seed`` trainer config makes DQN runs repeatable."""

    def testReproducingTrajectory(self):
        """Train DQN three times and compare per-iteration reward extremes.

        Trials 0 and 1 share seed 666 and must report identical
        ``episode_reward_max`` / ``episode_reward_min`` sequences; trial 2
        uses seed 999 and is expected to differ on most of them.
        """

        class PickLargest(gym.Env):
            """One-step env: the reward is the observation entry picked."""

            def __init__(self):
                self.observation_space = gym.spaces.Box(
                    low=float("-inf"), high=float("inf"), shape=(4, ))
                self.action_space = gym.spaces.Discrete(4)

            def reset(self, **kwargs):
                # Drawn from numpy's global RNG, so seeding that RNG
                # determines the observations produced here.
                self.obs = np.random.randn(4)
                return self.obs

            def step(self, action):
                reward = self.obs[action]
                # done=True: every episode lasts exactly one step.
                return self.obs, reward, True, {}

        def env_creator(env_config):
            return PickLargest()

        trajs = []
        for trial in range(3):
            ray.init()
            # Always shut Ray down, even when a trial raises; otherwise the
            # next ray.init() in this process would find Ray still running.
            try:
                register_env("PickLargest", env_creator)
                agent = DQNTrainer(
                    env="PickLargest",
                    config={"seed": 666 if trial in [0, 1] else 999})

                trajectory = []
                for _ in range(8):
                    r = agent.train()
                    trajectory.append(r["episode_reward_max"])
                    trajectory.append(r["episode_reward_min"])
                trajs.append(trajectory)
            finally:
                ray.shutdown()

        # trial0 and trial1 use the same seed and thus expect identical
        # trajectories. assertEqual reports the first mismatch on failure.
        self.assertEqual(trajs[0], trajs[1])

        # trial1 and trial2 use different seeds and thus most rewards
        # tend to be different.
        diff_cnt = sum(1 for v1, v2 in zip(trajs[1], trajs[2]) if v1 != v2)
        self.assertGreater(diff_cnt, 8)
# Run the test case with verbose output when this file is executed directly.
if __name__ == "__main__":
    unittest.main(verbosity=2)