diff --git a/ci/jenkins_tests/run_rllib_tests.sh b/ci/jenkins_tests/run_rllib_tests.sh index 273c6fbcd..5f433e2b2 100644 --- a/ci/jenkins_tests/run_rllib_tests.sh +++ b/ci/jenkins_tests/run_rllib_tests.sh @@ -288,6 +288,9 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/ci/suppress_output python /ray/python/ray/rllib/tests/test_local.py + +docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ + /ray/ci/suppress_output python /ray/python/ray/rllib/tests/test_reproducibility.py docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/ci/suppress_output python /ray/python/ray/rllib/tests/test_dependency.py diff --git a/python/ray/rllib/agents/trainer.py b/python/ray/rllib/agents/trainer.py index 965014c32..1d707649d 100644 --- a/python/ray/rllib/agents/trainer.py +++ b/python/ray/rllib/agents/trainer.py @@ -193,6 +193,10 @@ COMMON_CONFIG = { # Minimum env steps to optimize for per train call. This value does # not affect learning, only the length of iterations. "timesteps_per_iteration": 0, + # This argument, in conjunction with worker_index, sets the random seed of + # each worker, so that identically configured trials will have identical + # results. This makes experiments reproducible. 
+ "seed": None, # === Offline Datasets === # Specify how to generate experiences: diff --git a/python/ray/rllib/evaluation/rollout_worker.py b/python/ray/rllib/evaluation/rollout_worker.py index 410718c0e..d3e97ec26 100644 --- a/python/ray/rllib/evaluation/rollout_worker.py +++ b/python/ray/rllib/evaluation/rollout_worker.py @@ -2,6 +2,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import random +import numpy as np import gym import logging import pickle @@ -130,6 +132,7 @@ class RolloutWorker(EvaluatorInterface): remote_worker_envs=False, remote_env_batch_wait_ms=0, soft_horizon=False, + seed=None, _fake_sampler=False): """Initialize a rollout worker. @@ -215,6 +218,8 @@ class RolloutWorker(EvaluatorInterface): step / reset and model inference perf. soft_horizon (bool): Calculate rewards but don't reset the environment when the horizon is hit. + seed (int): Set the seed of both np and tf to this value + to ensure each remote worker has unique exploration behavior. _fake_sampler (bool): Use a fake (inf speed) sampler for testing. 
""" @@ -292,6 +297,10 @@ class RolloutWorker(EvaluatorInterface): self.tf_sess = None policy_dict = _validate_and_canonicalize(policy, self.env) self.policies_to_train = policies_to_train or list(policy_dict.keys()) + # set numpy and python seed + if seed is not None: + np.random.seed(seed) + random.seed(seed) if _has_tensorflow_graph(policy_dict): if (ray.is_initialized() and ray.worker._mode() != ray.worker.LOCAL_MODE @@ -309,6 +318,9 @@ class RolloutWorker(EvaluatorInterface): config=tf.ConfigProto( gpu_options=tf.GPUOptions(allow_growth=True))) with self.tf_sess.as_default(): + # set graph-level seed + if seed is not None: + tf.set_random_seed(seed) self.policy_map, self.preprocessors = \ self._build_policy_map(policy_dict, policy_config) else: diff --git a/python/ray/rllib/evaluation/worker_set.py b/python/ray/rllib/evaluation/worker_set.py index 90d3c13c2..58100d8b5 100644 --- a/python/ray/rllib/evaluation/worker_set.py +++ b/python/ray/rllib/evaluation/worker_set.py @@ -211,4 +211,6 @@ class WorkerSet(object): remote_worker_envs=config["remote_worker_envs"], remote_env_batch_wait_ms=config["remote_env_batch_wait_ms"], soft_horizon=config["soft_horizon"], + seed=(config["seed"] + worker_index) + if config["seed"] is not None else None, _fake_sampler=config.get("_fake_sampler", False)) diff --git a/python/ray/rllib/tests/test_reproducibility.py b/python/ray/rllib/tests/test_reproducibility.py new file mode 100644 index 000000000..1cc034759 --- /dev/null +++ b/python/ray/rllib/tests/test_reproducibility.py @@ -0,0 +1,68 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import unittest + +import ray +from ray.rllib.agents.dqn import DQNTrainer +from ray.tune.registry import register_env +import numpy as np +import gym + + +class TestReproducibility(unittest.TestCase): + def testReproducingTrajectory(self): + class PickLargest(gym.Env): + def __init__(self): + self.observation_space = 
gym.spaces.Box( + low=float("-inf"), high=float("inf"), shape=(4, )) + self.action_space = gym.spaces.Discrete(4) + + def reset(self, **kwargs): + self.obs = np.random.randn(4) + return self.obs + + def step(self, action): + reward = self.obs[action] + return self.obs, reward, True, {} + + def env_creator(env_config): + return PickLargest() + + trajs = list() + for trial in range(3): + ray.init() + register_env("PickLargest", env_creator) + agent = DQNTrainer( + env="PickLargest", + config={"seed": 666 if trial in [0, 1] else 999}) + + trajectory = list() + for _ in range(8): + r = agent.train() + trajectory.append(r["episode_reward_max"]) + trajectory.append(r["episode_reward_min"]) + trajs.append(trajectory) + + ray.shutdown() + + # trial0 and trial1 use same seed and thus + # expect identical trajectories. + all_same = True + for v0, v1 in zip(trajs[0], trajs[1]): + if v0 != v1: + all_same = False + self.assertTrue(all_same) + + # trial1 and trial2 use different seeds and thus + # most rewards tend to be different. + diff_cnt = 0 + for v1, v2 in zip(trajs[1], trajs[2]): + if v1 != v2: + diff_cnt += 1 + self.assertTrue(diff_cnt > 8) + + +if __name__ == "__main__": + unittest.main(verbosity=2)