Enable seeding actors for reproducible experiments (#5197)

* enable graph-level worker-specific seed
* lint checked
* revised according to Eric's suggestions
* revised accordingly and added a test case
* formatted
* Update test_reproducibility.py
* Update trainer.py
* Update rollout_worker.py
* Update run_rllib_tests.sh
* Update worker_set.py
2026-07-01 16:02:03 +08:00 · 2019-07-18 14:31:34 +08:00
parent 63f49f95dd
commit 0af07bd493
5 changed files with 89 additions and 0 deletions
@@ -193,6 +193,10 @@ COMMON_CONFIG = {
    # Minimum env steps to optimize for per train call. This value does
    # not affect learning, only the length of iterations.
    "timesteps_per_iteration": 0,
+    # This argument, in conjunction with worker_index, sets the random seed of
+    # each worker, so that identically configured trials will have identical
+    # results. This makes experiments reproducible.
+    "seed": None,

    # === Offline Datasets ===
    # Specify how to generate experiences:
@@ -2,6 +2,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

+import random
+import numpy as np
 import gym
 import logging
 import pickle
@@ -130,6 +132,7 @@ class RolloutWorker(EvaluatorInterface):
                 remote_worker_envs=False,
                 remote_env_batch_wait_ms=0,
                 soft_horizon=False,
+                 seed=None,
                 _fake_sampler=False):
        """Initialize a rollout worker.

@@ -215,6 +218,8 @@ class RolloutWorker(EvaluatorInterface):
                step / reset and model inference perf.
            soft_horizon (bool): Calculate rewards but don't reset the
                environment when the horizon is hit.
+            seed (int): Set the seed of python, np, and tf to this value to
+                ensure each remote worker has unique exploration behavior.
            _fake_sampler (bool): Use a fake (inf speed) sampler for testing.
        """

@@ -292,6 +297,10 @@ class RolloutWorker(EvaluatorInterface):
        self.tf_sess = None
        policy_dict = _validate_and_canonicalize(policy, self.env)
        self.policies_to_train = policies_to_train or list(policy_dict.keys())
+        # set numpy and python seed
+        if seed is not None:
+            np.random.seed(seed)
+            random.seed(seed)
        if _has_tensorflow_graph(policy_dict):
            if (ray.is_initialized()
                    and ray.worker._mode() != ray.worker.LOCAL_MODE
@@ -309,6 +318,9 @@ class RolloutWorker(EvaluatorInterface):
                        config=tf.ConfigProto(
                            gpu_options=tf.GPUOptions(allow_growth=True)))
                with self.tf_sess.as_default():
+                    # set graph-level seed
+                    if seed is not None:
+                        tf.set_random_seed(seed)
                    self.policy_map, self.preprocessors = \
                        self._build_policy_map(policy_dict, policy_config)
        else:
@@ -211,4 +211,6 @@ class WorkerSet(object):
            remote_worker_envs=config["remote_worker_envs"],
            remote_env_batch_wait_ms=config["remote_env_batch_wait_ms"],
            soft_horizon=config["soft_horizon"],
+            seed=(config["seed"] + worker_index)
+            if config["seed"] is not None else None,
            _fake_sampler=config.get("_fake_sampler", False))
@@ -0,0 +1,68 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import unittest
+
+import ray
+from ray.rllib.agents.dqn import DQNTrainer
+from ray.tune.registry import register_env
+import numpy as np
+import gym
+
+
+class TestReproducibility(unittest.TestCase):
+    def testReproducingTrajectory(self):
+        class PickLargest(gym.Env):
+            def __init__(self):
+                self.observation_space = gym.spaces.Box(
+                    low=float("-inf"), high=float("inf"), shape=(4, ))
+                self.action_space = gym.spaces.Discrete(4)
+
+            def reset(self, **kwargs):
+                self.obs = np.random.randn(4)
+                return self.obs
+
+            def step(self, action):
+                reward = self.obs[action]
+                return self.obs, reward, True, {}
+
+        def env_creator(env_config):
+            return PickLargest()
+
+        trajs = list()
+        for trial in range(3):
+            ray.init()
+            register_env("PickLargest", env_creator)
+            agent = DQNTrainer(
+                env="PickLargest",
+                config={"seed": 666 if trial in [0, 1] else 999})
+
+            trajectory = list()
+            for _ in range(8):
+                r = agent.train()
+                trajectory.append(r["episode_reward_max"])
+                trajectory.append(r["episode_reward_min"])
+            trajs.append(trajectory)
+
+            ray.shutdown()
+
+        # trial0 and trial1 use same seed and thus
+        # expect identical trajectories.
+        all_same = True
+        for v0, v1 in zip(trajs[0], trajs[1]):
+            if v0 != v1:
+                all_same = False
+        self.assertTrue(all_same)
+
+        # trial1 and trial2 use different seeds and thus
+        # most rewards tend to be different.
+        diff_cnt = 0
+        for v1, v2 in zip(trajs[1], trajs[2]):
+            if v1 != v2:
+                diff_cnt += 1
+        self.assertTrue(diff_cnt > 8)
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)