diff --git a/ci/jenkins_tests/run_rllib_tests.sh b/ci/jenkins_tests/run_rllib_tests.sh index 78fbf6a3a..a97bf5517 100644 --- a/ci/jenkins_tests/run_rllib_tests.sh +++ b/ci/jenkins_tests/run_rllib_tests.sh @@ -302,7 +302,7 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/ci/suppress_output python /ray/python/ray/rllib/tests/test_checkpoint_restore.py docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ - /ray/ci/suppress_output python /ray/python/ray/rllib/tests/test_policy_evaluator.py + /ray/ci/suppress_output python /ray/python/ray/rllib/tests/test_rollout_worker.py docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/ci/suppress_output python /ray/python/ray/rllib/tests/test_nested_spaces.py @@ -389,6 +389,9 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/ci/suppress_output python /ray/python/ray/rllib/examples/custom_loss.py --iters=2 +docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ + /ray/ci/suppress_output python /ray/python/ray/rllib/examples/rollout_worker_custom_workflow.py + docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/ci/suppress_output python /ray/python/ray/rllib/examples/custom_tf_policy.py --iters=2 @@ -396,7 +399,7 @@ docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/ci/suppress_output python /ray/python/ray/rllib/examples/custom_torch_policy.py --iters=2 docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ - /ray/ci/suppress_output python /ray/python/ray/rllib/examples/policy_evaluator_custom_workflow.py + /ray/ci/suppress_output python /ray/python/ray/rllib/examples/rollout_worker_custom_workflow.py docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} $DOCKER_SHA \ /ray/ci/suppress_output python 
/ray/python/ray/rllib/examples/custom_metrics_and_callbacks.py --num-iters=2 diff --git a/doc/source/rllib-concepts.rst b/doc/source/rllib-concepts.rst index 2f9603b69..b7b3ff823 100644 --- a/doc/source/rllib-concepts.rst +++ b/doc/source/rllib-concepts.rst @@ -453,7 +453,7 @@ Policy Evaluation Given an environment and policy, policy evaluation produces `batches `__ of experiences. This is your classic "environment interaction loop". Efficient policy evaluation can be burdensome to get right, especially when leveraging vectorization, RNNs, or when operating in a multi-agent environment. RLlib provides a `RolloutWorker `__ class that manages all of this, and this class is used in most RLlib algorithms. -You can use rollout workers standalone to produce batches of experiences. This can be done by calling ``worker.sample()`` on a worker instance, or ``worker.sample.remote()`` in parallel on worker instances created as Ray actors (see ``RolloutWorkers.create_remote``). +You can use rollout workers standalone to produce batches of experiences. This can be done by calling ``worker.sample()`` on a worker instance, or ``worker.sample.remote()`` in parallel on worker instances created as Ray actors (see `WorkerSet `__). Here is an example of creating a set of rollout workers and using them gather experiences in parallel. 
The trajectories are concatenated, the policy learns on the trajectory batch, and then we broadcast the policy weights to the workers for the next round of rollouts: diff --git a/doc/source/rllib-config.svg b/doc/source/rllib-config.svg index 04331f5f3..b3a011eee 100644 --- a/doc/source/rllib-config.svg +++ b/doc/source/rllib-config.svg @@ -1 +1 @@ - \ No newline at end of file + \ No newline at end of file diff --git a/doc/source/rllib-examples.rst b/doc/source/rllib-examples.rst index f26e078ea..13bfdc68b 100644 --- a/doc/source/rllib-examples.rst +++ b/doc/source/rllib-examples.rst @@ -22,7 +22,7 @@ Training Workflows Example of how to adjust the configuration of an environment over time. - `Custom metrics `__: Example of how to output custom training metrics to TensorBoard. -- `Using policy evaluators directly for control over the whole training workflow `__: +- `Using rollout workers directly for control over the whole training workflow `__: Example of how to use RLlib's lower-level building blocks to implement a fully customized training workflow. Custom Envs and Models diff --git a/doc/source/rllib-training.rst b/doc/source/rllib-training.rst index ef4f29295..824ef4c3d 100644 --- a/doc/source/rllib-training.rst +++ b/doc/source/rllib-training.rst @@ -178,13 +178,13 @@ Custom Training Workflows In the `basic training example `__, Tune will call ``train()`` on your trainer once per iteration and report the new training results. Sometimes, it is desirable to have full control over training, but still run inside Tune. Tune supports `custom trainable functions `__ that can be used to implement `custom training workflows (example) `__. -For even finer-grained control over training, you can use RLlib's lower-level `building blocks `__ directly to implement `fully customized training workflows `__. +For even finer-grained control over training, you can use RLlib's lower-level `building blocks `__ directly to implement `fully customized training workflows `__. 
Accessing Policy State ~~~~~~~~~~~~~~~~~~~~~~ -It is common to need to access a trainer's internal state, e.g., to set or get internal weights. In RLlib trainer state is replicated across multiple *policy evaluators* (Ray actors) in the cluster. However, you can easily get and update this state between calls to ``train()`` via ``trainer.optimizer.foreach_evaluator()`` or ``trainer.optimizer.foreach_evaluator_with_index()``. These functions take a lambda function that is applied with the evaluator as an arg. You can also return values from these functions and those will be returned as a list. +It is common to need to access a trainer's internal state, e.g., to set or get internal weights. In RLlib trainer state is replicated across multiple *rollout workers* (Ray actors) in the cluster. However, you can easily get and update this state between calls to ``train()`` via ``trainer.workers.foreach_worker()`` or ``trainer.workers.foreach_worker_with_index()``. These functions take a lambda function that is applied with the worker as an arg. You can also return values from these functions and those will be returned as a list. -You can also access just the "master" copy of the trainer state through ``trainer.get_policy()`` or ``trainer.local_evaluator``, but note that updates here may not be immediately reflected in remote replicas if you have configured ``num_workers > 0``. For example, to access the weights of a local TF policy, you can run ``trainer.get_policy().get_weights()``. This is also equivalent to ``trainer.local_evaluator.policy_map["default_policy"].get_weights()``: +You can also access just the "master" copy of the trainer state through ``trainer.get_policy()`` or ``trainer.workers.local_worker()``, but note that updates here may not be immediately reflected in remote replicas if you have configured ``num_workers > 0``. For example, to access the weights of a local TF policy, you can run ``trainer.get_policy().get_weights()``. 
This is also equivalent to ``trainer.workers.local_worker().policy_map["default_policy"].get_weights()``: .. code-block:: python @@ -192,13 +192,13 @@ You can also access just the "master" copy of the trainer state through ``traine trainer.get_policy().get_weights() # Same as above - trainer.local_evaluator.policy_map["default_policy"].get_weights() + trainer.workers.local_worker().policy_map["default_policy"].get_weights() - # Get list of weights of each evaluator, including remote replicas - trainer.optimizer.foreach_evaluator(lambda ev: ev.get_policy().get_weights()) + # Get list of weights of each worker, including remote replicas + trainer.workers.foreach_worker(lambda ev: ev.get_policy().get_weights()) # Same as above - trainer.optimizer.foreach_evaluator_with_index(lambda ev, i: ev.get_policy().get_weights()) + trainer.workers.foreach_worker_with_index(lambda ev, i: ev.get_policy().get_weights()) Global Coordination ~~~~~~~~~~~~~~~~~~~ @@ -299,7 +299,7 @@ Approach 1: Use the Trainer API and update the environment between calls to ``tr phase = 1 else: phase = 0 - trainer.optimizer.foreach_evaluator( + trainer.workers.foreach_worker( lambda ev: ev.foreach_env( lambda env: env.set_phase(phase))) @@ -333,7 +333,7 @@ Approach 2: Use the callbacks API to update the environment on new training resu else: phase = 0 trainer = info["trainer"] - trainer.optimizer.foreach_evaluator( + trainer.workers.foreach_worker( lambda ev: ev.foreach_env( lambda env: env.set_phase(phase))) diff --git a/python/ray/rllib/__init__.py b/python/ray/rllib/__init__.py index 92844e485..0824e9995 100644 --- a/python/ray/rllib/__init__.py +++ b/python/ray/rllib/__init__.py @@ -11,7 +11,7 @@ from ray.tune.registry import register_trainable from ray.rllib.evaluation.policy_graph import PolicyGraph from ray.rllib.evaluation.tf_policy_graph import TFPolicyGraph -from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator +from ray.rllib.evaluation.rollout_worker import RolloutWorker from 
ray.rllib.env.base_env import BaseEnv from ray.rllib.env.multi_agent_env import MultiAgentEnv from ray.rllib.env.vector_env import VectorEnv @@ -55,6 +55,7 @@ __all__ = [ "PolicyGraph", "TFPolicy", "TFPolicyGraph", + "RolloutWorker", "PolicyEvaluator", "SampleBatch", "BaseEnv", diff --git a/python/ray/rllib/agents/a3c/a2c.py b/python/ray/rllib/agents/a3c/a2c.py index e18345030..0b6592e74 100644 --- a/python/ray/rllib/agents/a3c/a2c.py +++ b/python/ray/rllib/agents/a3c/a2c.py @@ -2,9 +2,10 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from ray.rllib.agents.a3c.a3c import A3CTrainer, DEFAULT_CONFIG as A3C_CONFIG -from ray.rllib.optimizers import SyncSamplesOptimizer -from ray.rllib.utils.annotations import override +from ray.rllib.agents.a3c.a3c import DEFAULT_CONFIG as A3C_CONFIG, \ + validate_config, get_policy_class +from ray.rllib.agents.a3c.a3c_tf_policy import A3CTFPolicy +from ray.rllib.agents.trainer_template import build_trainer from ray.rllib.utils import merge_dicts A2C_DEFAULT_CONFIG = merge_dicts( @@ -16,16 +17,9 @@ A2C_DEFAULT_CONFIG = merge_dicts( }, ) - -class A2CTrainer(A3CTrainer): - """Synchronous variant of the A3CTrainer.""" - - _name = "A2C" - _default_config = A2C_DEFAULT_CONFIG - - @override(A3CTrainer) - def _make_optimizer(self): - return SyncSamplesOptimizer( - self.local_evaluator, - self.remote_evaluators, - train_batch_size=self.config["train_batch_size"]) +A2CTrainer = build_trainer( + name="A2C", + default_config=A2C_DEFAULT_CONFIG, + default_policy=A3CTFPolicy, + get_policy_class=get_policy_class, + validate_config=validate_config) diff --git a/python/ray/rllib/agents/a3c/a3c.py b/python/ray/rllib/agents/a3c/a3c.py index 56d7a09da..c269df2fc 100644 --- a/python/ray/rllib/agents/a3c/a3c.py +++ b/python/ray/rllib/agents/a3c/a3c.py @@ -2,12 +2,10 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import time - 
from ray.rllib.agents.a3c.a3c_tf_policy import A3CTFPolicy -from ray.rllib.agents.trainer import Trainer, with_common_config +from ray.rllib.agents.trainer import with_common_config +from ray.rllib.agents.trainer_template import build_trainer from ray.rllib.optimizers import AsyncGradientsOptimizer -from ray.rllib.utils.annotations import override # yapf: disable # __sphinx_doc_begin__ @@ -38,43 +36,28 @@ DEFAULT_CONFIG = with_common_config({ # yapf: enable -class A3CTrainer(Trainer): - """A3C implementations in TensorFlow and PyTorch.""" +def get_policy_class(config): + if config["use_pytorch"]: + from ray.rllib.agents.a3c.a3c_torch_policy import \ + A3CTorchPolicy + return A3CTorchPolicy + else: + return A3CTFPolicy - _name = "A3C" - _default_config = DEFAULT_CONFIG - _policy = A3CTFPolicy - @override(Trainer) - def _init(self, config, env_creator): - if config["use_pytorch"]: - from ray.rllib.agents.a3c.a3c_torch_policy import \ - A3CTorchPolicy - policy_cls = A3CTorchPolicy - else: - policy_cls = self._policy +def validate_config(config): + if config["entropy_coeff"] < 0: + raise DeprecationWarning("entropy_coeff must be >= 0") - if config["entropy_coeff"] < 0: - raise DeprecationWarning("entropy_coeff must be >= 0") - self.local_evaluator = self.make_local_evaluator( - env_creator, policy_cls) - self.remote_evaluators = self.make_remote_evaluators( - env_creator, policy_cls, config["num_workers"]) - self.optimizer = self._make_optimizer() +def make_async_optimizer(workers, config): + return AsyncGradientsOptimizer(workers, **config["optimizer"]) - @override(Trainer) - def _train(self): - prev_steps = self.optimizer.num_steps_sampled - start = time.time() - while time.time() - start < self.config["min_iter_time_s"]: - self.optimizer.step() - result = self.collect_metrics() - result.update(timesteps_this_iter=self.optimizer.num_steps_sampled - - prev_steps) - return result - def _make_optimizer(self): - return AsyncGradientsOptimizer(self.local_evaluator, - 
self.remote_evaluators, - **self.config["optimizer"]) +A3CTrainer = build_trainer( + name="A3C", + default_config=DEFAULT_CONFIG, + default_policy=A3CTFPolicy, + get_policy_class=get_policy_class, + validate_config=validate_config, + make_policy_optimizer=make_async_optimizer) diff --git a/python/ray/rllib/agents/ddpg/apex.py b/python/ray/rllib/agents/ddpg/apex.py index 24edbb226..5ea732f17 100644 --- a/python/ray/rllib/agents/ddpg/apex.py +++ b/python/ray/rllib/agents/ddpg/apex.py @@ -48,7 +48,7 @@ class ApexDDPGTrainer(DDPGTrainer): # Ape-X updates based on num steps trained, not sampled if self.optimizer.num_steps_trained - self.last_target_update_ts > \ self.config["target_network_update_freq"]: - self.local_evaluator.foreach_trainable_policy( + self.workers.local_worker().foreach_trainable_policy( lambda p, _: p.update_target()) self.last_target_update_ts = self.optimizer.num_steps_trained self.num_target_updates += 1 diff --git a/python/ray/rllib/agents/ddpg/ddpg.py b/python/ray/rllib/agents/ddpg/ddpg.py index 66d3810e5..a9676335e 100644 --- a/python/ray/rllib/agents/ddpg/ddpg.py +++ b/python/ray/rllib/agents/ddpg/ddpg.py @@ -171,9 +171,9 @@ class DDPGTrainer(DQNTrainer): if pure_expl_steps: # tell workers whether they should do pure exploration only_explore = self.global_timestep < pure_expl_steps - self.local_evaluator.foreach_trainable_policy( + self.workers.local_worker().foreach_trainable_policy( lambda p, _: p.set_pure_exploration_phase(only_explore)) - for e in self.remote_evaluators: + for e in self.workers.remote_workers(): e.foreach_trainable_policy.remote( lambda p, _: p.set_pure_exploration_phase(only_explore)) return super(DDPGTrainer, self)._train() diff --git a/python/ray/rllib/agents/ddpg/ddpg_policy.py b/python/ray/rllib/agents/ddpg/ddpg_policy.py index b80cfce4c..bb5fc25ef 100644 --- a/python/ray/rllib/agents/ddpg/ddpg_policy.py +++ b/python/ray/rllib/agents/ddpg/ddpg_policy.py @@ -515,7 +515,7 @@ class DDPGTFPolicy(DDPGPostprocessing, 
TFPolicy): stochastic_actions = tf.cond( # need to condition on noise_scale > 0 because zeroing - # noise_scale is how evaluator signals no noise should be used + # noise_scale is how a worker signals no noise should be used # (this is ugly and should be fixed by adding an "eval_mode" # config flag or something) tf.logical_and(enable_pure_exploration, noise_scale > 0), diff --git a/python/ray/rllib/agents/dqn/apex.py b/python/ray/rllib/agents/dqn/apex.py index 27bde322a..129839a27 100644 --- a/python/ray/rllib/agents/dqn/apex.py +++ b/python/ray/rllib/agents/dqn/apex.py @@ -51,7 +51,7 @@ class ApexTrainer(DQNTrainer): # Ape-X updates based on num steps trained, not sampled if self.optimizer.num_steps_trained - self.last_target_update_ts > \ self.config["target_network_update_freq"]: - self.local_evaluator.foreach_trainable_policy( + self.workers.local_worker().foreach_trainable_policy( lambda p, _: p.update_target()) self.last_target_update_ts = self.optimizer.num_steps_trained self.num_target_updates += 1 diff --git a/python/ray/rllib/agents/dqn/dqn.py b/python/ray/rllib/agents/dqn/dqn.py index 7fdb6f66b..15379e3fb 100644 --- a/python/ray/rllib/agents/dqn/dqn.py +++ b/python/ray/rllib/agents/dqn/dqn.py @@ -196,26 +196,26 @@ class DQNTrainer(Trainer): config["callbacks"]["on_episode_end"] = tune.function( on_episode_end) - self.local_evaluator = self.make_local_evaluator( - env_creator, self._policy) - - def create_remote_evaluators(): - return self.make_remote_evaluators(env_creator, self._policy, - config["num_workers"]) - if config["optimizer_class"] != "AsyncReplayOptimizer": - self.remote_evaluators = create_remote_evaluators() + self.workers = self._make_workers( + env_creator, + self._policy, + config, + num_workers=self.config["num_workers"]) + workers_needed = 0 else: # Hack to workaround https://github.com/ray-project/ray/issues/2541 - self.remote_evaluators = None + self.workers = self._make_workers( + env_creator, self._policy, config, num_workers=0) + 
workers_needed = self.config["num_workers"] self.optimizer = getattr(optimizers, config["optimizer_class"])( - self.local_evaluator, self.remote_evaluators, - **config["optimizer"]) - # Create the remote evaluators *after* the replay actors - if self.remote_evaluators is None: - self.remote_evaluators = create_remote_evaluators() - self.optimizer._set_evaluators(self.remote_evaluators) + self.workers, **config["optimizer"]) + + # Create the remote workers *after* the replay actors + if workers_needed > 0: + self.workers.add_workers(workers_needed) + self.optimizer._set_workers(self.workers.remote_workers()) self.last_target_update_ts = 0 self.num_target_updates = 0 @@ -226,9 +226,9 @@ class DQNTrainer(Trainer): # Update worker explorations exp_vals = [self.exploration0.value(self.global_timestep)] - self.local_evaluator.foreach_trainable_policy( + self.workers.local_worker().foreach_trainable_policy( lambda p, _: p.set_epsilon(exp_vals[0])) - for i, e in enumerate(self.remote_evaluators): + for i, e in enumerate(self.workers.remote_workers()): exp_val = self.explorations[i].value(self.global_timestep) e.foreach_trainable_policy.remote( lambda p, _: p.set_epsilon(exp_val)) @@ -245,8 +245,8 @@ class DQNTrainer(Trainer): if self.config["per_worker_exploration"]: # Only collect metrics from the third of workers with lowest eps result = self.collect_metrics( - selected_evaluators=self.remote_evaluators[ - -len(self.remote_evaluators) // 3:]) + selected_workers=self.workers.remote_workers()[ + -len(self.workers.remote_workers()) // 3:]) else: result = self.collect_metrics() @@ -263,7 +263,7 @@ class DQNTrainer(Trainer): def update_target_if_needed(self): if self.global_timestep - self.last_target_update_ts > \ self.config["target_network_update_freq"]: - self.local_evaluator.foreach_trainable_policy( + self.workers.local_worker().foreach_trainable_policy( lambda p, _: p.update_target()) self.last_target_update_ts = self.global_timestep self.num_target_updates += 1 @@ 
-275,11 +275,13 @@ class DQNTrainer(Trainer): def _evaluate(self): logger.info("Evaluating current policy for {} episodes".format( self.config["evaluation_num_episodes"])) - self.evaluation_ev.restore(self.local_evaluator.save()) - self.evaluation_ev.foreach_policy(lambda p, _: p.set_epsilon(0)) + self.evaluation_workers.local_worker().restore( + self.workers.local_worker().save()) + self.evaluation_workers.local_worker().foreach_policy( + lambda p, _: p.set_epsilon(0)) for _ in range(self.config["evaluation_num_episodes"]): - self.evaluation_ev.sample() - metrics = collect_metrics(self.evaluation_ev) + self.evaluation_workers.local_worker().sample() + metrics = collect_metrics(self.evaluation_workers.local_worker()) return {"evaluation": metrics} def _make_exploration_schedule(self, worker_index): diff --git a/python/ray/rllib/agents/es/es.py b/python/ray/rllib/agents/es/es.py index e167129c6..f5338a632 100644 --- a/python/ray/rllib/agents/es/es.py +++ b/python/ray/rllib/agents/es/es.py @@ -192,7 +192,7 @@ class ESTrainer(Trainer): # Create the actors. 
logger.info("Creating actors.") - self.workers = [ + self._workers = [ Worker.remote(config, policy_params, env_creator, noise_id) for _ in range(config["num_workers"]) ] @@ -270,7 +270,7 @@ class ESTrainer(Trainer): # Now sync the filters FilterManager.synchronize({ DEFAULT_POLICY_ID: self.policy.get_filter() - }, self.workers) + }, self._workers) info = { "weights_norm": np.square(theta).sum(), @@ -296,7 +296,7 @@ class ESTrainer(Trainer): @override(Trainer) def _stop(self): # workaround for https://github.com/ray-project/ray/issues/1516 - for w in self.workers: + for w in self._workers: w.__ray_terminate__.remote() def _collect_results(self, theta_id, min_episodes, min_timesteps): @@ -307,7 +307,7 @@ class ESTrainer(Trainer): "Collected {} episodes {} timesteps so far this iter".format( num_episodes, num_timesteps)) rollout_ids = [ - worker.do_rollouts.remote(theta_id) for worker in self.workers + worker.do_rollouts.remote(theta_id) for worker in self._workers ] # Get the results of the rollouts. 
for result in ray_get_and_free(rollout_ids): @@ -334,4 +334,4 @@ class ESTrainer(Trainer): self.policy.set_filter(state["filter"]) FilterManager.synchronize({ DEFAULT_POLICY_ID: self.policy.get_filter() - }, self.workers) + }, self._workers) diff --git a/python/ray/rllib/agents/impala/impala.py b/python/ray/rllib/agents/impala/impala.py index 838f2975c..e025a4817 100644 --- a/python/ray/rllib/agents/impala/impala.py +++ b/python/ray/rllib/agents/impala/impala.py @@ -113,18 +113,16 @@ class ImpalaTrainer(Trainer): if k not in config["optimizer"]: config["optimizer"][k] = config[k] policy_cls = self._get_policy() - self.local_evaluator = self.make_local_evaluator( - self.env_creator, policy_cls) + self.workers = self._make_workers( + self.env_creator, policy_cls, self.config, num_workers=0) if self.config["num_aggregation_workers"] > 0: # Create co-located aggregator actors first for placement pref aggregators = TreeAggregator.precreate_aggregators( self.config["num_aggregation_workers"]) - self.remote_evaluators = self.make_remote_evaluators( - env_creator, policy_cls, config["num_workers"]) - self.optimizer = AsyncSamplesOptimizer(self.local_evaluator, - self.remote_evaluators, + self.workers.add_workers(config["num_workers"]) + self.optimizer = AsyncSamplesOptimizer(self.workers, **config["optimizer"]) if config["entropy_coeff"] < 0: raise DeprecationWarning("entropy_coeff must be >= 0") diff --git a/python/ray/rllib/agents/marwil/marwil.py b/python/ray/rllib/agents/marwil/marwil.py index d6c6eadea..b8e01806c 100644 --- a/python/ray/rllib/agents/marwil/marwil.py +++ b/python/ray/rllib/agents/marwil/marwil.py @@ -48,13 +48,10 @@ class MARWILTrainer(Trainer): @override(Trainer) def _init(self, config, env_creator): - self.local_evaluator = self.make_local_evaluator( - env_creator, self._policy) - self.remote_evaluators = self.make_remote_evaluators( - env_creator, self._policy, config["num_workers"]) + self.workers = self._make_workers(env_creator, self._policy, 
config, + config["num_workers"]) self.optimizer = SyncBatchReplayOptimizer( - self.local_evaluator, - self.remote_evaluators, + self.workers, learning_starts=config["learning_starts"], buffer_size=config["replay_buffer_size"], train_batch_size=config["train_batch_size"], diff --git a/python/ray/rllib/agents/pg/pg.py b/python/ray/rllib/agents/pg/pg.py index 299cdcac3..71e2ab3fb 100644 --- a/python/ray/rllib/agents/pg/pg.py +++ b/python/ray/rllib/agents/pg/pg.py @@ -29,7 +29,7 @@ def get_policy_class(config): PGTrainer = build_trainer( - name="PGTrainer", + name="PG", default_config=DEFAULT_CONFIG, default_policy=PGTFPolicy, get_policy_class=get_policy_class) diff --git a/python/ray/rllib/agents/ppo/ppo.py b/python/ray/rllib/agents/ppo/ppo.py index daf43d148..a21c3d28f 100644 --- a/python/ray/rllib/agents/ppo/ppo.py +++ b/python/ray/rllib/agents/ppo/ppo.py @@ -63,17 +63,15 @@ DEFAULT_CONFIG = with_common_config({ # yapf: enable -def choose_policy_optimizer(local_evaluator, remote_evaluators, config): +def choose_policy_optimizer(workers, config): if config["simple_optimizer"]: return SyncSamplesOptimizer( - local_evaluator, - remote_evaluators, + workers, num_sgd_iter=config["num_sgd_iter"], train_batch_size=config["train_batch_size"]) return LocalMultiGPUOptimizer( - local_evaluator, - remote_evaluators, + workers, sgd_batch_size=config["sgd_minibatch_size"], num_sgd_iter=config["num_sgd_iter"], num_gpus=config["num_gpus"], @@ -87,7 +85,7 @@ def choose_policy_optimizer(local_evaluator, remote_evaluators, config): def update_kl(trainer, fetches): if "kl" in fetches: # single-agent - trainer.local_evaluator.for_policy( + trainer.workers.local_worker().for_policy( lambda pi: pi.update_kl(fetches["kl"])) else: @@ -98,7 +96,7 @@ def update_kl(trainer, fetches): logger.debug("No data for {}, not updating kl".format(pi_id)) # multi-agent - trainer.local_evaluator.foreach_trainable_policy(update) + trainer.workers.local_worker().foreach_trainable_policy(update) def 
warn_about_obs_filter(trainer): @@ -155,7 +153,7 @@ def validate_config(config): PPOTrainer = build_trainer( - name="PPOTrainer", + name="PPO", default_config=DEFAULT_CONFIG, default_policy=PPOTFPolicy, make_policy_optimizer=choose_policy_optimizer, diff --git a/python/ray/rllib/agents/qmix/apex.py b/python/ray/rllib/agents/qmix/apex.py index f43a5ac12..65c91d655 100644 --- a/python/ray/rllib/agents/qmix/apex.py +++ b/python/ray/rllib/agents/qmix/apex.py @@ -50,7 +50,7 @@ class ApexQMixTrainer(QMixTrainer): # Ape-X updates based on num steps trained, not sampled if self.optimizer.num_steps_trained - self.last_target_update_ts > \ self.config["target_network_update_freq"]: - self.local_evaluator.foreach_trainable_policy( + self.workers.local_worker().foreach_trainable_policy( lambda p, _: p.update_target()) self.last_target_update_ts = self.optimizer.num_steps_trained self.num_target_updates += 1 diff --git a/python/ray/rllib/agents/trainer.py b/python/ray/rllib/agents/trainer.py index fb20f56ba..f08b23e93 100644 --- a/python/ray/rllib/agents/trainer.py +++ b/python/ray/rllib/agents/trainer.py @@ -10,18 +10,14 @@ import pickle import six import time import tempfile -from types import FunctionType import ray from ray.exceptions import RayError -from ray.rllib.offline import NoopOutput, JsonReader, MixedInput, JsonWriter, \ - ShuffledInput from ray.rllib.models import MODEL_DEFAULTS -from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator, \ - _validate_multiagent_config from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID from ray.rllib.evaluation.metrics import collect_metrics from ray.rllib.optimizers.policy_optimizer import PolicyOptimizer +from ray.rllib.evaluation.worker_set import WorkerSet from ray.rllib.utils.annotations import override, PublicAPI, DeveloperAPI from ray.rllib.utils import FilterManager, deep_update, merge_dicts from ray.rllib.utils.memory import ray_get_and_free @@ -46,7 +42,7 @@ COMMON_CONFIG = { # === Debugging === # 
Whether to write episode stats and videos to the agent log dir "monitor": False, - # Set the ray.rllib.* log level for the agent process and its evaluators. + # Set the ray.rllib.* log level for the agent process and its workers. # Should be one of DEBUG, INFO, WARN, or ERROR. The DEBUG level will also # periodically print out summaries of relevant internal dataflow (this is # also printed out once at startup at the INFO level). @@ -60,7 +56,7 @@ COMMON_CONFIG = { "on_episode_start": None, # arg: {"env": .., "episode": ...} "on_episode_step": None, # arg: {"env": .., "episode": ...} "on_episode_end": None, # arg: {"env": .., "episode": ...} - "on_sample_end": None, # arg: {"samples": .., "evaluator": ...} + "on_sample_end": None, # arg: {"samples": .., "worker": ...} "on_train_result": None, # arg: {"trainer": ..., "result": ...} "on_postprocess_traj": None, # arg: { # "agent_id": ..., "episode": ..., @@ -153,7 +149,7 @@ COMMON_CONFIG = { "synchronize_filters": True, # Configure TF for single-process operation by default "tf_session_args": { - # note: overriden by `local_evaluator_tf_session_args` + # note: overridden by `local_tf_session_args` "intra_op_parallelism_threads": 2, "inter_op_parallelism_threads": 2, "gpu_options": { @@ -165,8 +161,8 @@ COMMON_CONFIG = { }, "allow_soft_placement": True, # required by PPO multi-gpu }, - # Override the following tf session args on the local evaluator - "local_evaluator_tf_session_args": { + # Override the following tf session args on the local worker + "local_tf_session_args": { # Allow a higher level of parallelism by default, but not unlimited # since that can cause crashes with many concurrent drivers. "intra_op_parallelism_threads": 8, @@ -188,6 +184,8 @@ COMMON_CONFIG = { # but optimal value could be obtained by measuring your environment # step / reset and model inference perf. 
"remote_env_batch_wait_ms": 0, + # Minimum time per iteration + "min_iter_time_s": 0, # === Offline Datasets === # Specify how to generate experiences: @@ -229,7 +227,7 @@ COMMON_CONFIG = { # === Multiagent === "multiagent": { # Map from policy ids to tuples of (policy_cls, obs_space, - # act_space, config). See policy_evaluator.py for more info. + # act_space, config). See rollout_worker.py for more info. "policies": {}, # Function mapping agent ids to policy ids. "policy_mapping_fn": None, @@ -292,7 +290,7 @@ class Trainer(Trainable): config = config or {} - # Vars to synchronize to evaluators on each train call + # Vars to synchronize to workers on each train call self.global_vars = {"timestep": 0} # Trainers allow env ids to be passed directly to the constructor. @@ -337,9 +335,10 @@ class Trainer(Trainable): if self._has_policy_optimizer(): self.global_vars["timestep"] = self.optimizer.num_steps_sampled - self.optimizer.local_evaluator.set_global_vars(self.global_vars) - for ev in self.optimizer.remote_evaluators: - ev.set_global_vars.remote(self.global_vars) + self.optimizer.workers.local_worker().set_global_vars( + self.global_vars) + for w in self.optimizer.workers.remote_workers(): + w.set_global_vars.remote(self.global_vars) logger.debug("updated global vars: {}".format(self.global_vars)) result = None @@ -366,17 +365,18 @@ class Trainer(Trainable): raise RuntimeError("Failed to recover from worker crash") if (self.config.get("observation_filter", "NoFilter") != "NoFilter" - and hasattr(self, "local_evaluator")): + and hasattr(self, "workers") + and isinstance(self.workers, WorkerSet)): FilterManager.synchronize( - self.local_evaluator.filters, - self.remote_evaluators, + self.workers.local_worker().filters, + self.workers.remote_workers(), update_remote=self.config["synchronize_filters"]) logger.debug("synchronized filters: {}".format( - self.local_evaluator.filters)) + self.workers.local_worker().filters)) if self._has_policy_optimizer(): 
result["num_healthy_workers"] = len( - self.optimizer.remote_evaluators) + self.optimizer.workers.remote_workers()) if self.config["evaluation_interval"]: if self._iteration % self.config["evaluation_interval"] == 0: @@ -441,25 +441,17 @@ class Trainer(Trainable): }) logger.debug( "using evaluation_config: {}".format(extra_config)) - # Make local evaluation evaluators - self.evaluation_ev = self.make_local_evaluator( - self.env_creator, self._policy, extra_config=extra_config) + self.evaluation_workers = self._make_workers( + self.env_creator, + self._policy, + merge_dicts(self.config, extra_config), + num_workers=0) self.evaluation_metrics = self._evaluate() @override(Trainable) def _stop(self): - # Call stop on all evaluators to release resources - if hasattr(self, "local_evaluator"): - self.local_evaluator.stop() - if hasattr(self, "remote_evaluators"): - for ev in self.remote_evaluators: - ev.stop.remote() - - # workaround for https://github.com/ray-project/ray/issues/1516 - if hasattr(self, "remote_evaluators"): - for ev in self.remote_evaluators: - ev.__ray_terminate__.remote() - + if hasattr(self, "workers"): + self.workers.stop() if hasattr(self, "optimizer"): self.optimizer.stop() @@ -475,6 +467,15 @@ class Trainer(Trainable): extra_data = pickle.load(open(checkpoint_path, "rb")) self.__setstate__(extra_data) + @DeveloperAPI + def _make_workers(self, env_creator, policy, config, num_workers): + return WorkerSet( + env_creator, + policy, + config, + num_workers=num_workers, + logdir=self.logdir) + @DeveloperAPI def _init(self, config, env_creator): """Subclasses should override this for custom initialization.""" @@ -498,11 +499,12 @@ class Trainer(Trainable): logger.info("Evaluating current policy for {} episodes".format( self.config["evaluation_num_episodes"])) - self.evaluation_ev.restore(self.local_evaluator.save()) + self.evaluation_workers.local_worker().restore( + self.workers.local_worker().save()) for _ in 
range(self.config["evaluation_num_episodes"]): - self.evaluation_ev.sample() + self.evaluation_workers.local_worker().sample() - metrics = collect_metrics(self.evaluation_ev) + metrics = collect_metrics(self.evaluation_workers.local_worker()) return {"evaluation": metrics} @PublicAPI @@ -540,9 +542,9 @@ class Trainer(Trainable): if state is None: state = [] - preprocessed = self.local_evaluator.preprocessors[policy_id].transform( - observation) - filtered_obs = self.local_evaluator.filters[policy_id]( + preprocessed = self.workers.local_worker().preprocessors[ + policy_id].transform(observation) + filtered_obs = self.workers.local_worker().filters[policy_id]( preprocessed, update=False) if state: return self.get_policy(policy_id).compute_single_action( @@ -590,7 +592,7 @@ class Trainer(Trainable): policy_id (str): id of policy to return. """ - return self.local_evaluator.get_policy(policy_id) + return self.workers.local_worker().get_policy(policy_id) @PublicAPI def get_weights(self, policies=None): @@ -600,7 +602,7 @@ class Trainer(Trainable): policies (list): Optional list of policies to return weights for, or None for all policies. """ - return self.local_evaluator.get_weights(policies) + return self.workers.local_worker().get_weights(policies) @PublicAPI def set_weights(self, weights): @@ -609,42 +611,7 @@ class Trainer(Trainable): Arguments: weights (dict): Map of policy ids to weights to set. """ - self.local_evaluator.set_weights(weights) - - @DeveloperAPI - def make_local_evaluator(self, env_creator, policy, extra_config=None): - """Convenience method to return configured local evaluator.""" - - return self._make_evaluator( - PolicyEvaluator, - env_creator, - policy, - 0, - merge_dicts( - # important: allow local tf to use more CPUs for optimization - merge_dicts( - self.config, { - "tf_session_args": self. 
- config["local_evaluator_tf_session_args"] - }), - extra_config or {})) - - @DeveloperAPI - def make_remote_evaluators(self, env_creator, policy, count): - """Convenience method to return a number of remote evaluators.""" - - remote_args = { - "num_cpus": self.config["num_cpus_per_worker"], - "num_gpus": self.config["num_gpus_per_worker"], - "resources": self.config["custom_resources_per_worker"], - } - - cls = PolicyEvaluator.as_remote(**remote_args).remote - - return [ - self._make_evaluator(cls, env_creator, policy, i + 1, self.config) - for i in range(count) - ] + self.workers.local_worker().set_weights(weights) @DeveloperAPI def export_policy_model(self, export_dir, policy_id=DEFAULT_POLICY_ID): @@ -660,7 +627,7 @@ class Trainer(Trainable): >>> trainer.train() >>> trainer.export_policy_model("/tmp/export_dir") """ - self.local_evaluator.export_policy_model(export_dir, policy_id) + self.workers.local_worker().export_policy_model(export_dir, policy_id) @DeveloperAPI def export_policy_checkpoint(self, @@ -680,19 +647,19 @@ class Trainer(Trainable): >>> trainer.train() >>> trainer.export_policy_checkpoint("/tmp/export_dir") """ - self.local_evaluator.export_policy_checkpoint( + self.workers.local_worker().export_policy_checkpoint( export_dir, filename_prefix, policy_id) @DeveloperAPI - def collect_metrics(self, selected_evaluators=None): - """Collects metrics from the remote evaluators of this agent. + def collect_metrics(self, selected_workers=None): + """Collects metrics from the remote workers of this agent. This is the same data as returned by a call to train(). 
""" return self.optimizer.collect_metrics( self.config["collect_metrics_timeout"], min_history=self.config["metrics_smoothing_episodes"], - selected_evaluators=selected_evaluators) + selected_workers=selected_workers) @classmethod def resource_help(cls, config): @@ -742,118 +709,34 @@ class Trainer(Trainable): logger.info("Health checking all workers...") checks = [] - for ev in self.optimizer.remote_evaluators: + for ev in self.optimizer.workers.remote_workers(): _, obj_id = ev.sample_with_count.remote() checks.append(obj_id) - healthy_evaluators = [] + healthy_workers = [] for i, obj_id in enumerate(checks): - ev = self.optimizer.remote_evaluators[i] + w = self.optimizer.workers.remote_workers()[i] try: ray_get_and_free(obj_id) - healthy_evaluators.append(ev) + healthy_workers.append(w) logger.info("Worker {} looks healthy".format(i + 1)) except RayError: logger.exception("Blacklisting worker {}".format(i + 1)) try: - ev.__ray_terminate__.remote() + w.__ray_terminate__.remote() except Exception: logger.exception("Error terminating unhealthy worker") - if len(healthy_evaluators) < 1: + if len(healthy_workers) < 1: raise RuntimeError( "Not enough healthy workers remain to continue.") - self.optimizer.reset(healthy_evaluators) + self.optimizer.reset(healthy_workers) def _has_policy_optimizer(self): return hasattr(self, "optimizer") and isinstance( self.optimizer, PolicyOptimizer) - def _make_evaluator(self, cls, env_creator, policy, worker_index, config): - def session_creator(): - logger.debug("Creating TF session {}".format( - config["tf_session_args"])) - return tf.Session( - config=tf.ConfigProto(**config["tf_session_args"])) - - if isinstance(config["input"], FunctionType): - input_creator = config["input"] - elif config["input"] == "sampler": - input_creator = (lambda ioctx: ioctx.default_sampler_input()) - elif isinstance(config["input"], dict): - input_creator = (lambda ioctx: ShuffledInput( - MixedInput(config["input"], ioctx), config[ - 
"shuffle_buffer_size"])) - else: - input_creator = (lambda ioctx: ShuffledInput( - JsonReader(config["input"], ioctx), config[ - "shuffle_buffer_size"])) - - if isinstance(config["output"], FunctionType): - output_creator = config["output"] - elif config["output"] is None: - output_creator = (lambda ioctx: NoopOutput()) - elif config["output"] == "logdir": - output_creator = (lambda ioctx: JsonWriter( - ioctx.log_dir, - ioctx, - max_file_size=config["output_max_file_size"], - compress_columns=config["output_compress_columns"])) - else: - output_creator = (lambda ioctx: JsonWriter( - config["output"], - ioctx, - max_file_size=config["output_max_file_size"], - compress_columns=config["output_compress_columns"])) - - if config["input"] == "sampler": - input_evaluation = [] - else: - input_evaluation = config["input_evaluation"] - - # Fill in the default policy if 'None' is specified in multiagent - if self.config["multiagent"]["policies"]: - tmp = self.config["multiagent"]["policies"] - _validate_multiagent_config(tmp, allow_none_graph=True) - for k, v in tmp.items(): - if v[0] is None: - tmp[k] = (policy, v[1], v[2], v[3]) - policy = tmp - - return cls( - env_creator, - policy, - policy_mapping_fn=self.config["multiagent"]["policy_mapping_fn"], - policies_to_train=self.config["multiagent"]["policies_to_train"], - tf_session_creator=(session_creator - if config["tf_session_args"] else None), - batch_steps=config["sample_batch_size"], - batch_mode=config["batch_mode"], - episode_horizon=config["horizon"], - preprocessor_pref=config["preprocessor_pref"], - sample_async=config["sample_async"], - compress_observations=config["compress_observations"], - num_envs=config["num_envs_per_worker"], - observation_filter=config["observation_filter"], - clip_rewards=config["clip_rewards"], - clip_actions=config["clip_actions"], - env_config=config["env_config"], - model_config=config["model"], - policy_config=config, - worker_index=worker_index, - monitor_path=self.logdir if 
config["monitor"] else None, - log_dir=self.logdir, - log_level=config["log_level"], - callbacks=config["callbacks"], - input_creator=input_creator, - input_evaluation=input_evaluation, - output_creator=output_creator, - remote_worker_envs=config["remote_worker_envs"], - remote_env_batch_wait_ms=config["remote_env_batch_wait_ms"], - soft_horizon=config["soft_horizon"], - _fake_sampler=config.get("_fake_sampler", False)) - @override(Trainable) def _export_model(self, export_formats, export_dir): ExportFormat.validate(export_formats) @@ -870,17 +753,17 @@ class Trainer(Trainable): def __getstate__(self): state = {} - if hasattr(self, "local_evaluator"): - state["evaluator"] = self.local_evaluator.save() + if hasattr(self, "workers"): + state["worker"] = self.workers.local_worker().save() if hasattr(self, "optimizer") and hasattr(self.optimizer, "save"): state["optimizer"] = self.optimizer.save() return state def __setstate__(self, state): - if "evaluator" in state: - self.local_evaluator.restore(state["evaluator"]) - remote_state = ray.put(state["evaluator"]) - for r in self.remote_evaluators: + if "worker" in state: + self.workers.local_worker().restore(state["worker"]) + remote_state = ray.put(state["worker"]) + for r in self.workers.remote_workers(): r.restore.remote(remote_state) if "optimizer" in state: self.optimizer.restore(state["optimizer"]) diff --git a/python/ray/rllib/agents/trainer_template.py b/python/ray/rllib/agents/trainer_template.py index aae8e35f6..6af9e1c78 100644 --- a/python/ray/rllib/agents/trainer_template.py +++ b/python/ray/rllib/agents/trainer_template.py @@ -2,6 +2,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import time + from ray.rllib.agents.trainer import Trainer, COMMON_CONFIG from ray.rllib.optimizers import SyncSamplesOptimizer from ray.rllib.utils.annotations import override, DeveloperAPI @@ -25,8 +27,7 @@ def build_trainer(name, default_config (dict): the 
default config dict of the algorithm, otherwises uses the Trainer default config make_policy_optimizer (func): optional function that returns a - PolicyOptimizer instance given - (local_evaluator, remote_evaluators, config) + PolicyOptimizer instance given (WorkerSet, config) validate_config (func): optional callback that checks a given config for correctness. It may mutate the config as needed. get_policy_class (func): optional callback that takes a config and @@ -44,8 +45,7 @@ def build_trainer(name, a Trainer instance that uses the specified args. """ - if not name.endswith("Trainer"): - raise ValueError("Algorithm name should have *Trainer suffix", name) + original_kwargs = locals().copy() class trainer_cls(Trainer): _name = name @@ -59,19 +59,15 @@ def build_trainer(name, policy = default_policy else: policy = get_policy_class(config) - self.local_evaluator = self.make_local_evaluator( - env_creator, policy) - self.remote_evaluators = self.make_remote_evaluators( - env_creator, policy, config["num_workers"]) + self.workers = self._make_workers(env_creator, policy, config, + self.config["num_workers"]) if make_policy_optimizer: - self.optimizer = make_policy_optimizer( - self.local_evaluator, self.remote_evaluators, config) + self.optimizer = make_policy_optimizer(self.workers, config) else: optimizer_config = dict( config["optimizer"], **{"train_batch_size": config["train_batch_size"]}) - self.optimizer = SyncSamplesOptimizer(self.local_evaluator, - self.remote_evaluators, + self.optimizer = SyncSamplesOptimizer(self.workers, **optimizer_config) @override(Trainer) @@ -79,9 +75,15 @@ def build_trainer(name, if before_train_step: before_train_step(self) prev_steps = self.optimizer.num_steps_sampled - fetches = self.optimizer.step() - if after_optimizer_step: - after_optimizer_step(self, fetches) + + start = time.time() + while True: + fetches = self.optimizer.step() + if after_optimizer_step: + after_optimizer_step(self, fetches) + if time.time() - start > 
self.config["min_iter_time_s"]: + break + res = self.collect_metrics() res.update( timesteps_this_iter=self.optimizer.num_steps_sampled - @@ -91,6 +93,11 @@ def build_trainer(name, after_train_result(self, res) return res + @staticmethod + def with_updates(**overrides): + return build_trainer(**dict(original_kwargs, **overrides)) + + trainer_cls.with_updates = with_updates trainer_cls.__name__ = name trainer_cls.__qualname__ = name return trainer_cls diff --git a/python/ray/rllib/env/base_env.py b/python/ray/rllib/env/base_env.py index 5db799c32..a36c3e228 100644 --- a/python/ray/rllib/env/base_env.py +++ b/python/ray/rllib/env/base_env.py @@ -21,7 +21,7 @@ class BaseEnv(object): can be sent back via send_actions(). All other env types can be adapted to BaseEnv. RLlib handles these - conversions internally in PolicyEvaluator, for example: + conversions internally in RolloutWorker, for example: gym.Env => rllib.VectorEnv => rllib.BaseEnv rllib.MultiAgentEnv => rllib.BaseEnv diff --git a/python/ray/rllib/evaluation/__init__.py b/python/ray/rllib/evaluation/__init__.py index 7e56bb747..f743cca64 100644 --- a/python/ray/rllib/evaluation/__init__.py +++ b/python/ray/rllib/evaluation/__init__.py @@ -1,4 +1,5 @@ from ray.rllib.evaluation.episode import MultiAgentEpisode +from ray.rllib.evaluation.rollout_worker import RolloutWorker from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator from ray.rllib.evaluation.interface import EvaluatorInterface from ray.rllib.evaluation.policy_graph import PolicyGraph @@ -12,8 +13,19 @@ from ray.rllib.evaluation.postprocessing import compute_advantages from ray.rllib.evaluation.metrics import collect_metrics __all__ = [ - "EvaluatorInterface", "PolicyEvaluator", "PolicyGraph", "TFPolicyGraph", - "TorchPolicyGraph", "SampleBatch", "MultiAgentBatch", "SampleBatchBuilder", - "MultiAgentSampleBatchBuilder", "SyncSampler", "AsyncSampler", - "compute_advantages", "collect_metrics", "MultiAgentEpisode" + "EvaluatorInterface", + 
"RolloutWorker", + "PolicyGraph", + "TFPolicyGraph", + "TorchPolicyGraph", + "SampleBatch", + "MultiAgentBatch", + "SampleBatchBuilder", + "MultiAgentSampleBatchBuilder", + "SyncSampler", + "AsyncSampler", + "compute_advantages", + "collect_metrics", + "MultiAgentEpisode", + "PolicyEvaluator", ] diff --git a/python/ray/rllib/evaluation/interface.py b/python/ray/rllib/evaluation/interface.py index 6bc626da1..06fa9f94e 100644 --- a/python/ray/rllib/evaluation/interface.py +++ b/python/ray/rllib/evaluation/interface.py @@ -11,7 +11,7 @@ from ray.rllib.utils.annotations import DeveloperAPI class EvaluatorInterface(object): """This is the interface between policy optimizers and policy evaluation. - See also: PolicyEvaluator + See also: RolloutWorker """ @DeveloperAPI diff --git a/python/ray/rllib/evaluation/metrics.py b/python/ray/rllib/evaluation/metrics.py index d8b3122fe..341327608 100644 --- a/python/ray/rllib/evaluation/metrics.py +++ b/python/ray/rllib/evaluation/metrics.py @@ -39,27 +39,23 @@ def get_learner_stats(grad_info): @DeveloperAPI -def collect_metrics(local_evaluator=None, - remote_evaluators=[], - timeout_seconds=180): - """Gathers episode metrics from PolicyEvaluator instances.""" +def collect_metrics(local_worker=None, remote_workers=[], timeout_seconds=180): + """Gathers episode metrics from RolloutWorker instances.""" episodes, num_dropped = collect_episodes( - local_evaluator, remote_evaluators, timeout_seconds=timeout_seconds) + local_worker, remote_workers, timeout_seconds=timeout_seconds) metrics = summarize_episodes(episodes, episodes, num_dropped) return metrics @DeveloperAPI -def collect_episodes(local_evaluator=None, - remote_evaluators=[], +def collect_episodes(local_worker=None, remote_workers=[], timeout_seconds=180): """Gathers new episodes metrics tuples from the given evaluators.""" - if remote_evaluators: + if remote_workers: pending = [ - a.apply.remote(lambda ev: ev.get_metrics()) - for a in remote_evaluators + a.apply.remote(lambda 
ev: ev.get_metrics()) for a in remote_workers ] collected, _ = ray.wait( pending, num_returns=len(pending), timeout=timeout_seconds * 1.0) @@ -73,8 +69,8 @@ def collect_episodes(local_evaluator=None, metric_lists = [] num_metric_batches_dropped = 0 - if local_evaluator: - metric_lists.append(local_evaluator.get_metrics()) + if local_worker: + metric_lists.append(local_worker.get_metrics()) episodes = [] for metrics in metric_lists: episodes.extend(metrics) diff --git a/python/ray/rllib/evaluation/policy_evaluator.py b/python/ray/rllib/evaluation/policy_evaluator.py index 40df71006..18dec8abc 100644 --- a/python/ray/rllib/evaluation/policy_evaluator.py +++ b/python/ray/rllib/evaluation/policy_evaluator.py @@ -2,805 +2,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import gym -import logging -import pickle +from ray.rllib.utils import renamed_class +from ray.rllib.evaluation import RolloutWorker -import ray -from ray.rllib.env.atari_wrappers import wrap_deepmind, is_atari -from ray.rllib.env.base_env import BaseEnv -from ray.rllib.env.env_context import EnvContext -from ray.rllib.env.external_env import ExternalEnv -from ray.rllib.env.multi_agent_env import MultiAgentEnv -from ray.rllib.env.external_multi_agent_env import ExternalMultiAgentEnv -from ray.rllib.env.vector_env import VectorEnv -from ray.rllib.evaluation.interface import EvaluatorInterface -from ray.rllib.evaluation.sampler import AsyncSampler, SyncSampler -from ray.rllib.policy.sample_batch import MultiAgentBatch, DEFAULT_POLICY_ID -from ray.rllib.policy.policy import Policy -from ray.rllib.policy.tf_policy import TFPolicy -from ray.rllib.offline import NoopOutput, IOContext, OutputWriter, InputReader -from ray.rllib.offline.is_estimator import ImportanceSamplingEstimator -from ray.rllib.offline.wis_estimator import WeightedImportanceSamplingEstimator -from ray.rllib.models import ModelCatalog -from ray.rllib.models.preprocessors import 
NoPreprocessor -from ray.rllib.utils import merge_dicts -from ray.rllib.utils.annotations import override, DeveloperAPI -from ray.rllib.utils.debug import disable_log_once_globally, log_once, \ - summarize, enable_periodic_logging -from ray.rllib.utils.filter import get_filter -from ray.rllib.utils.tf_run_builder import TFRunBuilder -from ray.rllib.utils import try_import_tf - -tf = try_import_tf() -logger = logging.getLogger(__name__) - -# Handle to the current evaluator, which will be set to the most recently -# created PolicyEvaluator in this process. This can be helpful to access in -# custom env or policy classes for debugging or advanced use cases. -_global_evaluator = None - - -@DeveloperAPI -def get_global_evaluator(): - """Returns a handle to the active policy evaluator in this process.""" - - global _global_evaluator - return _global_evaluator - - -@DeveloperAPI -class PolicyEvaluator(EvaluatorInterface): - """Common ``PolicyEvaluator`` implementation that wraps a ``Policy``. - - This class wraps a policy instance and an environment class to - collect experiences from the environment. You can create many replicas of - this class as Ray actors to scale RL training. - - This class supports vectorized and multi-agent policy evaluation (e.g., - VectorEnv, MultiAgentEnv, etc.) - - Examples: - >>> # Create a policy evaluator and using it to collect experiences. - >>> evaluator = PolicyEvaluator( - ... env_creator=lambda _: gym.make("CartPole-v0"), - ... policy=PGTFPolicy) - >>> print(evaluator.sample()) - SampleBatch({ - "obs": [[...]], "actions": [[...]], "rewards": [[...]], - "dones": [[...]], "new_obs": [[...]]}) - - >>> # Creating policy evaluators using optimizer_cls.make(). - >>> optimizer = SyncSamplesOptimizer.make( - ... evaluator_cls=PolicyEvaluator, - ... evaluator_args={ - ... "env_creator": lambda _: gym.make("CartPole-v0"), - ... "policy": PGTFPolicy, - ... }, - ... 
num_workers=10) - >>> for _ in range(10): optimizer.step() - - >>> # Creating a multi-agent policy evaluator - >>> evaluator = PolicyEvaluator( - ... env_creator=lambda _: MultiAgentTrafficGrid(num_cars=25), - ... policies={ - ... # Use an ensemble of two policies for car agents - ... "car_policy1": - ... (PGTFPolicy, Box(...), Discrete(...), {"gamma": 0.99}), - ... "car_policy2": - ... (PGTFPolicy, Box(...), Discrete(...), {"gamma": 0.95}), - ... # Use a single shared policy for all traffic lights - ... "traffic_light_policy": - ... (PGTFPolicy, Box(...), Discrete(...), {}), - ... }, - ... policy_mapping_fn=lambda agent_id: - ... random.choice(["car_policy1", "car_policy2"]) - ... if agent_id.startswith("car_") else "traffic_light_policy") - >>> print(evaluator.sample()) - MultiAgentBatch({ - "car_policy1": SampleBatch(...), - "car_policy2": SampleBatch(...), - "traffic_light_policy": SampleBatch(...)}) - """ - - @DeveloperAPI - @classmethod - def as_remote(cls, num_cpus=None, num_gpus=None, resources=None): - return ray.remote( - num_cpus=num_cpus, num_gpus=num_gpus, resources=resources)(cls) - - @DeveloperAPI - def __init__(self, - env_creator, - policy, - policy_mapping_fn=None, - policies_to_train=None, - tf_session_creator=None, - batch_steps=100, - batch_mode="truncate_episodes", - episode_horizon=None, - preprocessor_pref="deepmind", - sample_async=False, - compress_observations=False, - num_envs=1, - observation_filter="NoFilter", - clip_rewards=None, - clip_actions=True, - env_config=None, - model_config=None, - policy_config=None, - worker_index=0, - monitor_path=None, - log_dir=None, - log_level=None, - callbacks=None, - input_creator=lambda ioctx: ioctx.default_sampler_input(), - input_evaluation=frozenset([]), - output_creator=lambda ioctx: NoopOutput(), - remote_worker_envs=False, - remote_env_batch_wait_ms=0, - soft_horizon=False, - _fake_sampler=False): - """Initialize a policy evaluator. 
- - Arguments: - env_creator (func): Function that returns a gym.Env given an - EnvContext wrapped configuration. - policy (class|dict): Either a class implementing - Policy, or a dictionary of policy id strings to - (Policy, obs_space, action_space, config) tuples. If a - dict is specified, then we are in multi-agent mode and a - policy_mapping_fn should also be set. - policy_mapping_fn (func): A function that maps agent ids to - policy ids in multi-agent mode. This function will be called - each time a new agent appears in an episode, to bind that agent - to a policy for the duration of the episode. - policies_to_train (list): Optional whitelist of policies to train, - or None for all policies. - tf_session_creator (func): A function that returns a TF session. - This is optional and only useful with TFPolicy. - batch_steps (int): The target number of env transitions to include - in each sample batch returned from this evaluator. - batch_mode (str): One of the following batch modes: - "truncate_episodes": Each call to sample() will return a batch - of at most `batch_steps * num_envs` in size. The batch will - be exactly `batch_steps * num_envs` in size if - postprocessing does not change batch sizes. Episodes may be - truncated in order to meet this size requirement. - "complete_episodes": Each call to sample() will return a batch - of at least `batch_steps * num_envs` in size. Episodes will - not be truncated, but multiple episodes may be packed - within one batch to meet the batch size. Note that when - `num_envs > 1`, episode steps will be buffered until the - episode completes, and hence batches may contain - significant amounts of off-policy data. - episode_horizon (int): Whether to stop episodes at this horizon. - preprocessor_pref (str): Whether to prefer RLlib preprocessors - ("rllib") or deepmind ("deepmind") when applicable. 
- sample_async (bool): Whether to compute samples asynchronously in - the background, which improves throughput but can cause samples - to be slightly off-policy. - compress_observations (bool): If true, compress the observations. - They can be decompressed with rllib/utils/compression. - num_envs (int): If more than one, will create multiple envs - and vectorize the computation of actions. This has no effect if - if the env already implements VectorEnv. - observation_filter (str): Name of observation filter to use. - clip_rewards (bool): Whether to clip rewards to [-1, 1] prior to - experience postprocessing. Setting to None means clip for Atari - only. - clip_actions (bool): Whether to clip action values to the range - specified by the policy action space. - env_config (dict): Config to pass to the env creator. - model_config (dict): Config to use when creating the policy model. - policy_config (dict): Config to pass to the policy. In the - multi-agent case, this config will be merged with the - per-policy configs specified by `policy`. - worker_index (int): For remote evaluators, this should be set to a - non-zero and unique value. This index is passed to created envs - through EnvContext so that envs can be configured per worker. - monitor_path (str): Write out episode stats and videos to this - directory if specified. - log_dir (str): Directory where logs can be placed. - log_level (str): Set the root log level on creation. - callbacks (dict): Dict of custom debug callbacks. - input_creator (func): Function that returns an InputReader object - for loading previous generated experiences. - input_evaluation (list): How to evaluate the policy performance. - This only makes sense to set when the input is reading offline - data. The possible values include: - - "is": the step-wise importance sampling estimator. - - "wis": the weighted step-wise is estimator. 
- - "simulation": run the environment in the background, but - use this data for evaluation only and never for learning. - output_creator (func): Function that returns an OutputWriter object - for saving generated experiences. - remote_worker_envs (bool): If using num_envs > 1, whether to create - those new envs in remote processes instead of in the current - process. This adds overheads, but can make sense if your envs - remote_env_batch_wait_ms (float): Timeout that remote workers - are waiting when polling environments. 0 (continue when at - least one env is ready) is a reasonable default, but optimal - value could be obtained by measuring your environment - step / reset and model inference perf. - soft_horizon (bool): Calculate rewards but don't reset the - environment when the horizon is hit. - _fake_sampler (bool): Use a fake (inf speed) sampler for testing. - """ - - global _global_evaluator - _global_evaluator = self - - if log_level: - logging.getLogger("ray.rllib").setLevel(log_level) - - if worker_index > 1: - disable_log_once_globally() # only need 1 evaluator to log - elif log_level == "DEBUG": - enable_periodic_logging() - - env_context = EnvContext(env_config or {}, worker_index) - policy_config = policy_config or {} - self.policy_config = policy_config - self.callbacks = callbacks or {} - self.worker_index = worker_index - model_config = model_config or {} - policy_mapping_fn = (policy_mapping_fn - or (lambda agent_id: DEFAULT_POLICY_ID)) - if not callable(policy_mapping_fn): - raise ValueError( - "Policy mapping function not callable. 
If you're using Tune, " - "make sure to escape the function with tune.function() " - "to prevent it from being evaluated as an expression.") - self.env_creator = env_creator - self.sample_batch_size = batch_steps * num_envs - self.batch_mode = batch_mode - self.compress_observations = compress_observations - self.preprocessing_enabled = True - self.last_batch = None - self._fake_sampler = _fake_sampler - - self.env = _validate_env(env_creator(env_context)) - if isinstance(self.env, MultiAgentEnv) or \ - isinstance(self.env, BaseEnv): - - def wrap(env): - return env # we can't auto-wrap these env types - elif is_atari(self.env) and \ - not model_config.get("custom_preprocessor") and \ - preprocessor_pref == "deepmind": - - # Deepmind wrappers already handle all preprocessing - self.preprocessing_enabled = False - - if clip_rewards is None: - clip_rewards = True - - def wrap(env): - env = wrap_deepmind( - env, - dim=model_config.get("dim"), - framestack=model_config.get("framestack")) - if monitor_path: - env = _monitor(env, monitor_path) - return env - else: - - def wrap(env): - if monitor_path: - env = _monitor(env, monitor_path) - return env - - self.env = wrap(self.env) - - def make_env(vector_index): - return wrap( - env_creator( - env_context.copy_with_overrides( - vector_index=vector_index, remote=remote_worker_envs))) - - self.tf_sess = None - policy_dict = _validate_and_canonicalize(policy, self.env) - self.policies_to_train = policies_to_train or list(policy_dict.keys()) - if _has_tensorflow_graph(policy_dict): - if (ray.is_initialized() - and ray.worker._mode() != ray.worker.LOCAL_MODE - and not ray.get_gpu_ids()): - logger.info("Creating policy evaluation worker {}".format( - worker_index) + - " on CPU (please ignore any CUDA init errors)") - with tf.Graph().as_default(): - if tf_session_creator: - self.tf_sess = tf_session_creator() - else: - self.tf_sess = tf.Session( - config=tf.ConfigProto( - gpu_options=tf.GPUOptions(allow_growth=True))) - with 
self.tf_sess.as_default(): - self.policy_map, self.preprocessors = \ - self._build_policy_map(policy_dict, policy_config) - else: - self.policy_map, self.preprocessors = self._build_policy_map( - policy_dict, policy_config) - - self.multiagent = set(self.policy_map.keys()) != {DEFAULT_POLICY_ID} - if self.multiagent: - if not ((isinstance(self.env, MultiAgentEnv) - or isinstance(self.env, ExternalMultiAgentEnv)) - or isinstance(self.env, BaseEnv)): - raise ValueError( - "Have multiple policies {}, but the env ".format( - self.policy_map) + - "{} is not a subclass of BaseEnv, MultiAgentEnv or " - "ExternalMultiAgentEnv?".format(self.env)) - - self.filters = { - policy_id: get_filter(observation_filter, - policy.observation_space.shape) - for (policy_id, policy) in self.policy_map.items() - } - if self.worker_index == 0: - logger.info("Built filter map: {}".format(self.filters)) - - # Always use vector env for consistency even if num_envs = 1 - self.async_env = BaseEnv.to_base_env( - self.env, - make_env=make_env, - num_envs=num_envs, - remote_envs=remote_worker_envs, - remote_env_batch_wait_ms=remote_env_batch_wait_ms) - self.num_envs = num_envs - - if self.batch_mode == "truncate_episodes": - unroll_length = batch_steps - pack_episodes = True - elif self.batch_mode == "complete_episodes": - unroll_length = float("inf") # never cut episodes - pack_episodes = False # sampler will return 1 episode per poll - else: - raise ValueError("Unsupported batch mode: {}".format( - self.batch_mode)) - - self.io_context = IOContext(log_dir, policy_config, worker_index, self) - self.reward_estimators = [] - for method in input_evaluation: - if method == "simulation": - logger.warning( - "Requested 'simulation' input evaluation method: " - "will discard all sampler outputs and keep only metrics.") - sample_async = True - elif method == "is": - ise = ImportanceSamplingEstimator.create(self.io_context) - self.reward_estimators.append(ise) - elif method == "wis": - wise = 
WeightedImportanceSamplingEstimator.create( - self.io_context) - self.reward_estimators.append(wise) - else: - raise ValueError( - "Unknown evaluation method: {}".format(method)) - - if sample_async: - self.sampler = AsyncSampler( - self.async_env, - self.policy_map, - policy_mapping_fn, - self.preprocessors, - self.filters, - clip_rewards, - unroll_length, - self.callbacks, - horizon=episode_horizon, - pack=pack_episodes, - tf_sess=self.tf_sess, - clip_actions=clip_actions, - blackhole_outputs="simulation" in input_evaluation, - soft_horizon=soft_horizon) - self.sampler.start() - else: - self.sampler = SyncSampler( - self.async_env, - self.policy_map, - policy_mapping_fn, - self.preprocessors, - self.filters, - clip_rewards, - unroll_length, - self.callbacks, - horizon=episode_horizon, - pack=pack_episodes, - tf_sess=self.tf_sess, - clip_actions=clip_actions, - soft_horizon=soft_horizon) - - self.input_reader = input_creator(self.io_context) - assert isinstance(self.input_reader, InputReader), self.input_reader - self.output_writer = output_creator(self.io_context) - assert isinstance(self.output_writer, OutputWriter), self.output_writer - - logger.debug("Created evaluator with env {} ({}), policies {}".format( - self.async_env, self.env, self.policy_map)) - - @override(EvaluatorInterface) - def sample(self): - """Evaluate the current policies and return a batch of experiences. - - Return: - SampleBatch|MultiAgentBatch from evaluating the current policies. - """ - - if self._fake_sampler and self.last_batch is not None: - return self.last_batch - - if log_once("sample_start"): - logger.info("Generating sample batch of size {}".format( - self.sample_batch_size)) - - batches = [self.input_reader.next()] - steps_so_far = batches[0].count - - # In truncate_episodes mode, never pull more than 1 batch per env. - # This avoids over-running the target batch size. 
- if self.batch_mode == "truncate_episodes": - max_batches = self.num_envs - else: - max_batches = float("inf") - - while steps_so_far < self.sample_batch_size and len( - batches) < max_batches: - batch = self.input_reader.next() - steps_so_far += batch.count - batches.append(batch) - batch = batches[0].concat_samples(batches) - - if self.callbacks.get("on_sample_end"): - self.callbacks["on_sample_end"]({ - "evaluator": self, - "samples": batch - }) - - # Always do writes prior to compression for consistency and to allow - # for better compression inside the writer. - self.output_writer.write(batch) - - # Do off-policy estimation if needed - if self.reward_estimators: - for sub_batch in batch.split_by_episode(): - for estimator in self.reward_estimators: - estimator.process(sub_batch) - - if log_once("sample_end"): - logger.info("Completed sample batch:\n\n{}\n".format( - summarize(batch))) - - if self.compress_observations == "bulk": - batch.compress(bulk=True) - elif self.compress_observations: - batch.compress() - - if self._fake_sampler: - self.last_batch = batch - return batch - - @DeveloperAPI - @ray.method(num_return_vals=2) - def sample_with_count(self): - """Same as sample() but returns the count as a separate future.""" - batch = self.sample() - return batch, batch.count - - @override(EvaluatorInterface) - def get_weights(self, policies=None): - if policies is None: - policies = self.policy_map.keys() - return { - pid: policy.get_weights() - for pid, policy in self.policy_map.items() if pid in policies - } - - @override(EvaluatorInterface) - def set_weights(self, weights): - for pid, w in weights.items(): - self.policy_map[pid].set_weights(w) - - @override(EvaluatorInterface) - def compute_gradients(self, samples): - if log_once("compute_gradients"): - logger.info("Compute gradients on:\n\n{}\n".format( - summarize(samples))) - if isinstance(samples, MultiAgentBatch): - grad_out, info_out = {}, {} - if self.tf_sess is not None: - builder = 
TFRunBuilder(self.tf_sess, "compute_gradients") - for pid, batch in samples.policy_batches.items(): - if pid not in self.policies_to_train: - continue - grad_out[pid], info_out[pid] = ( - self.policy_map[pid]._build_compute_gradients( - builder, batch)) - grad_out = {k: builder.get(v) for k, v in grad_out.items()} - info_out = {k: builder.get(v) for k, v in info_out.items()} - else: - for pid, batch in samples.policy_batches.items(): - if pid not in self.policies_to_train: - continue - grad_out[pid], info_out[pid] = ( - self.policy_map[pid].compute_gradients(batch)) - else: - grad_out, info_out = ( - self.policy_map[DEFAULT_POLICY_ID].compute_gradients(samples)) - info_out["batch_count"] = samples.count - if log_once("grad_out"): - logger.info("Compute grad info:\n\n{}\n".format( - summarize(info_out))) - return grad_out, info_out - - @override(EvaluatorInterface) - def apply_gradients(self, grads): - if log_once("apply_gradients"): - logger.info("Apply gradients:\n\n{}\n".format(summarize(grads))) - if isinstance(grads, dict): - if self.tf_sess is not None: - builder = TFRunBuilder(self.tf_sess, "apply_gradients") - outputs = { - pid: self.policy_map[pid]._build_apply_gradients( - builder, grad) - for pid, grad in grads.items() - } - return {k: builder.get(v) for k, v in outputs.items()} - else: - return { - pid: self.policy_map[pid].apply_gradients(g) - for pid, g in grads.items() - } - else: - return self.policy_map[DEFAULT_POLICY_ID].apply_gradients(grads) - - @override(EvaluatorInterface) - def learn_on_batch(self, samples): - if log_once("learn_on_batch"): - logger.info( - "Training on concatenated sample batches:\n\n{}\n".format( - summarize(samples))) - if isinstance(samples, MultiAgentBatch): - info_out = {} - to_fetch = {} - if self.tf_sess is not None: - builder = TFRunBuilder(self.tf_sess, "learn_on_batch") - else: - builder = None - for pid, batch in samples.policy_batches.items(): - if pid not in self.policies_to_train: - continue - policy = 
self.policy_map[pid] - if builder and hasattr(policy, "_build_learn_on_batch"): - to_fetch[pid] = policy._build_learn_on_batch( - builder, batch) - else: - info_out[pid] = policy.learn_on_batch(batch) - info_out.update({k: builder.get(v) for k, v in to_fetch.items()}) - else: - info_out = self.policy_map[DEFAULT_POLICY_ID].learn_on_batch( - samples) - if log_once("learn_out"): - logger.info("Training output:\n\n{}\n".format(summarize(info_out))) - return info_out - - @DeveloperAPI - def get_metrics(self): - """Returns a list of new RolloutMetric objects from evaluation.""" - - out = self.sampler.get_metrics() - for m in self.reward_estimators: - out.extend(m.get_metrics()) - return out - - @DeveloperAPI - def foreach_env(self, func): - """Apply the given function to each underlying env instance.""" - - envs = self.async_env.get_unwrapped() - if not envs: - return [func(self.async_env)] - else: - return [func(e) for e in envs] - - @DeveloperAPI - def get_policy(self, policy_id=DEFAULT_POLICY_ID): - """Return policy for the specified id, or None. - - Arguments: - policy_id (str): id of policy to return. - """ - - return self.policy_map.get(policy_id) - - @DeveloperAPI - def for_policy(self, func, policy_id=DEFAULT_POLICY_ID): - """Apply the given function to the specified policy.""" - - return func(self.policy_map[policy_id]) - - @DeveloperAPI - def foreach_policy(self, func): - """Apply the given function to each (policy, policy_id) tuple.""" - - return [func(policy, pid) for pid, policy in self.policy_map.items()] - - @DeveloperAPI - def foreach_trainable_policy(self, func): - """Apply the given function to each (policy, policy_id) tuple. - - This only applies func to policies in `self.policies_to_train`.""" - - return [ - func(policy, pid) for pid, policy in self.policy_map.items() - if pid in self.policies_to_train - ] - - @DeveloperAPI - def sync_filters(self, new_filters): - """Changes self's filter to given and rebases any accumulated delta. 
- - Args: - new_filters (dict): Filters with new state to update local copy. - """ - assert all(k in new_filters for k in self.filters) - for k in self.filters: - self.filters[k].sync(new_filters[k]) - - @DeveloperAPI - def get_filters(self, flush_after=False): - """Returns a snapshot of filters. - - Args: - flush_after (bool): Clears the filter buffer state. - - Returns: - return_filters (dict): Dict for serializable filters - """ - return_filters = {} - for k, f in self.filters.items(): - return_filters[k] = f.as_serializable() - if flush_after: - f.clear_buffer() - return return_filters - - @DeveloperAPI - def save(self): - filters = self.get_filters(flush_after=True) - state = { - pid: self.policy_map[pid].get_state() - for pid in self.policy_map - } - return pickle.dumps({"filters": filters, "state": state}) - - @DeveloperAPI - def restore(self, objs): - objs = pickle.loads(objs) - self.sync_filters(objs["filters"]) - for pid, state in objs["state"].items(): - self.policy_map[pid].set_state(state) - - @DeveloperAPI - def set_global_vars(self, global_vars): - self.foreach_policy(lambda p, _: p.on_global_var_update(global_vars)) - - @DeveloperAPI - def export_policy_model(self, export_dir, policy_id=DEFAULT_POLICY_ID): - self.policy_map[policy_id].export_model(export_dir) - - @DeveloperAPI - def export_policy_checkpoint(self, - export_dir, - filename_prefix="model", - policy_id=DEFAULT_POLICY_ID): - self.policy_map[policy_id].export_checkpoint(export_dir, - filename_prefix) - - @DeveloperAPI - def stop(self): - self.async_env.stop() - - def _build_policy_map(self, policy_dict, policy_config): - policy_map = {} - preprocessors = {} - for name, (cls, obs_space, act_space, - conf) in sorted(policy_dict.items()): - logger.debug("Creating policy for {}".format(name)) - merged_conf = merge_dicts(policy_config, conf) - if self.preprocessing_enabled: - preprocessor = ModelCatalog.get_preprocessor_for_space( - obs_space, merged_conf.get("model")) - preprocessors[name] = 
preprocessor - obs_space = preprocessor.observation_space - else: - preprocessors[name] = NoPreprocessor(obs_space) - if isinstance(obs_space, gym.spaces.Dict) or \ - isinstance(obs_space, gym.spaces.Tuple): - raise ValueError( - "Found raw Tuple|Dict space as input to policy. " - "Please preprocess these observations with a " - "Tuple|DictFlatteningPreprocessor.") - if tf: - with tf.variable_scope(name): - policy_map[name] = cls(obs_space, act_space, merged_conf) - else: - policy_map[name] = cls(obs_space, act_space, merged_conf) - if self.worker_index == 0: - logger.info("Built policy map: {}".format(policy_map)) - logger.info("Built preprocessor map: {}".format(preprocessors)) - return policy_map, preprocessors - - def __del__(self): - if hasattr(self, "sampler") and isinstance(self.sampler, AsyncSampler): - self.sampler.shutdown = True - - -def _validate_and_canonicalize(policy, env): - if isinstance(policy, dict): - _validate_multiagent_config(policy) - return policy - elif not issubclass(policy, Policy): - raise ValueError("policy must be a rllib.Policy class") - else: - if (isinstance(env, MultiAgentEnv) - and not hasattr(env, "observation_space")): - raise ValueError( - "MultiAgentEnv must have observation_space defined if run " - "in a single-agent configuration.") - return { - DEFAULT_POLICY_ID: (policy, env.observation_space, - env.action_space, {}) - } - - -def _validate_multiagent_config(policy, allow_none_graph=False): - for k, v in policy.items(): - if not isinstance(k, str): - raise ValueError("policy keys must be strs, got {}".format( - type(k))) - if not isinstance(v, tuple) or len(v) != 4: - raise ValueError( - "policy values must be tuples of " - "(cls, obs_space, action_space, config), got {}".format(v)) - if allow_none_graph and v[0] is None: - pass - elif not issubclass(v[0], Policy): - raise ValueError("policy tuple value 0 must be a rllib.Policy " - "class or None, got {}".format(v[0])) - if not isinstance(v[1], gym.Space): - raise 
ValueError( - "policy tuple value 1 (observation_space) must be a " - "gym.Space, got {}".format(type(v[1]))) - if not isinstance(v[2], gym.Space): - raise ValueError("policy tuple value 2 (action_space) must be a " - "gym.Space, got {}".format(type(v[2]))) - if not isinstance(v[3], dict): - raise ValueError("policy tuple value 3 (config) must be a dict, " - "got {}".format(type(v[3]))) - - -def _validate_env(env): - # allow this as a special case (assumed gym.Env) - if hasattr(env, "observation_space") and hasattr(env, "action_space"): - return env - - allowed_types = [gym.Env, MultiAgentEnv, ExternalEnv, VectorEnv, BaseEnv] - if not any(isinstance(env, tpe) for tpe in allowed_types): - raise ValueError( - "Returned env should be an instance of gym.Env, MultiAgentEnv, " - "ExternalEnv, VectorEnv, or BaseEnv. The provided env creator " - "function returned {} ({}).".format(env, type(env))) - return env - - -def _monitor(env, path): - return gym.wrappers.Monitor(env, path, resume=True) - - -def _has_tensorflow_graph(policy_dict): - for policy, _, _, _ in policy_dict.values(): - if issubclass(policy, TFPolicy): - return True - return False +PolicyEvaluator = renamed_class( + RolloutWorker, old_name="rllib.evaluation.PolicyEvaluator") diff --git a/python/ray/rllib/evaluation/rollout_worker.py b/python/ray/rllib/evaluation/rollout_worker.py new file mode 100644 index 000000000..3be01a429 --- /dev/null +++ b/python/ray/rllib/evaluation/rollout_worker.py @@ -0,0 +1,794 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import gym +import logging +import pickle + +import ray +from ray.rllib.env.atari_wrappers import wrap_deepmind, is_atari +from ray.rllib.env.base_env import BaseEnv +from ray.rllib.env.env_context import EnvContext +from ray.rllib.env.external_env import ExternalEnv +from ray.rllib.env.multi_agent_env import MultiAgentEnv +from ray.rllib.env.external_multi_agent_env import 
ExternalMultiAgentEnv +from ray.rllib.env.vector_env import VectorEnv +from ray.rllib.evaluation.interface import EvaluatorInterface +from ray.rllib.evaluation.sampler import AsyncSampler, SyncSampler +from ray.rllib.policy.sample_batch import MultiAgentBatch, DEFAULT_POLICY_ID +from ray.rllib.policy.policy import Policy +from ray.rllib.policy.tf_policy import TFPolicy +from ray.rllib.offline import NoopOutput, IOContext, OutputWriter, InputReader +from ray.rllib.offline.is_estimator import ImportanceSamplingEstimator +from ray.rllib.offline.wis_estimator import WeightedImportanceSamplingEstimator +from ray.rllib.models import ModelCatalog +from ray.rllib.models.preprocessors import NoPreprocessor +from ray.rllib.utils import merge_dicts +from ray.rllib.utils.annotations import override, DeveloperAPI +from ray.rllib.utils.debug import disable_log_once_globally, log_once, \ + summarize, enable_periodic_logging +from ray.rllib.utils.filter import get_filter +from ray.rllib.utils.tf_run_builder import TFRunBuilder +from ray.rllib.utils import try_import_tf + +tf = try_import_tf() +logger = logging.getLogger(__name__) + +# Handle to the current rollout worker, which will be set to the most recently +# created RolloutWorker in this process. This can be helpful to access in +# custom env or policy classes for debugging or advanced use cases. +_global_worker = None + + +@DeveloperAPI +def get_global_worker(): + """Returns a handle to the active rollout worker in this process.""" + + global _global_worker + return _global_worker + + +@DeveloperAPI +class RolloutWorker(EvaluatorInterface): + """Common experience collection class. + + This class wraps a policy instance and an environment class to + collect experiences from the environment. You can create many replicas of + this class as Ray actors to scale RL training. + + This class supports vectorized and multi-agent policy evaluation (e.g., + VectorEnv, MultiAgentEnv, etc.) 
+ + Examples: + >>> # Create a rollout worker and use it to collect experiences. + >>> worker = RolloutWorker( + ... env_creator=lambda _: gym.make("CartPole-v0"), + ... policy=PGTFPolicy) + >>> print(worker.sample()) + SampleBatch({ + "obs": [[...]], "actions": [[...]], "rewards": [[...]], + "dones": [[...]], "new_obs": [[...]]}) + + >>> # Create a multi-agent rollout worker + >>> worker = RolloutWorker( + ... env_creator=lambda _: MultiAgentTrafficGrid(num_cars=25), + ... policies={ + ... # Use an ensemble of two policies for car agents + ... "car_policy1": + ... (PGTFPolicy, Box(...), Discrete(...), {"gamma": 0.99}), + ... "car_policy2": + ... (PGTFPolicy, Box(...), Discrete(...), {"gamma": 0.95}), + ... # Use a single shared policy for all traffic lights + ... "traffic_light_policy": + ... (PGTFPolicy, Box(...), Discrete(...), {}), + ... }, + ... policy_mapping_fn=lambda agent_id: + ... random.choice(["car_policy1", "car_policy2"]) + ... if agent_id.startswith("car_") else "traffic_light_policy") + >>> print(worker.sample()) + MultiAgentBatch({ + "car_policy1": SampleBatch(...), + "car_policy2": SampleBatch(...), + "traffic_light_policy": SampleBatch(...)}) + """ + + @DeveloperAPI + @classmethod + def as_remote(cls, num_cpus=None, num_gpus=None, resources=None): + return ray.remote( + num_cpus=num_cpus, num_gpus=num_gpus, resources=resources)(cls) + + @DeveloperAPI + def __init__(self, + env_creator, + policy, + policy_mapping_fn=None, + policies_to_train=None, + tf_session_creator=None, + batch_steps=100, + batch_mode="truncate_episodes", + episode_horizon=None, + preprocessor_pref="deepmind", + sample_async=False, + compress_observations=False, + num_envs=1, + observation_filter="NoFilter", + clip_rewards=None, + clip_actions=True, + env_config=None, + model_config=None, + policy_config=None, + worker_index=0, + monitor_path=None, + log_dir=None, + log_level=None, + callbacks=None, + input_creator=lambda ioctx: ioctx.default_sampler_input(), +
input_evaluation=frozenset([]), + output_creator=lambda ioctx: NoopOutput(), + remote_worker_envs=False, + remote_env_batch_wait_ms=0, + soft_horizon=False, + _fake_sampler=False): + """Initialize a rollout worker. + + Arguments: + env_creator (func): Function that returns a gym.Env given an + EnvContext wrapped configuration. + policy (class|dict): Either a class implementing + Policy, or a dictionary of policy id strings to + (Policy, obs_space, action_space, config) tuples. If a + dict is specified, then we are in multi-agent mode and a + policy_mapping_fn should also be set. + policy_mapping_fn (func): A function that maps agent ids to + policy ids in multi-agent mode. This function will be called + each time a new agent appears in an episode, to bind that agent + to a policy for the duration of the episode. + policies_to_train (list): Optional whitelist of policies to train, + or None for all policies. + tf_session_creator (func): A function that returns a TF session. + This is optional and only useful with TFPolicy. + batch_steps (int): The target number of env transitions to include + in each sample batch returned from this worker. + batch_mode (str): One of the following batch modes: + "truncate_episodes": Each call to sample() will return a batch + of at most `batch_steps * num_envs` in size. The batch will + be exactly `batch_steps * num_envs` in size if + postprocessing does not change batch sizes. Episodes may be + truncated in order to meet this size requirement. + "complete_episodes": Each call to sample() will return a batch + of at least `batch_steps * num_envs` in size. Episodes will + not be truncated, but multiple episodes may be packed + within one batch to meet the batch size. Note that when + `num_envs > 1`, episode steps will be buffered until the + episode completes, and hence batches may contain + significant amounts of off-policy data. + episode_horizon (int): Whether to stop episodes at this horizon. 
+ preprocessor_pref (str): Whether to prefer RLlib preprocessors + ("rllib") or deepmind ("deepmind") when applicable. + sample_async (bool): Whether to compute samples asynchronously in + the background, which improves throughput but can cause samples + to be slightly off-policy. + compress_observations (bool): If true, compress the observations. + They can be decompressed with rllib/utils/compression. + num_envs (int): If more than one, will create multiple envs + and vectorize the computation of actions. This has no effect if + the env already implements VectorEnv. + observation_filter (str): Name of observation filter to use. + clip_rewards (bool): Whether to clip rewards to [-1, 1] prior to + experience postprocessing. Setting to None means clip for Atari + only. + clip_actions (bool): Whether to clip action values to the range + specified by the policy action space. + env_config (dict): Config to pass to the env creator. + model_config (dict): Config to use when creating the policy model. + policy_config (dict): Config to pass to the policy. In the + multi-agent case, this config will be merged with the + per-policy configs specified by `policy`. + worker_index (int): For remote workers, this should be set to a + non-zero and unique value. This index is passed to created envs + through EnvContext so that envs can be configured per worker. + monitor_path (str): Write out episode stats and videos to this + directory if specified. + log_dir (str): Directory where logs can be placed. + log_level (str): Set the root log level on creation. + callbacks (dict): Dict of custom debug callbacks. + input_creator (func): Function that returns an InputReader object + for loading previously generated experiences. + input_evaluation (list): How to evaluate the policy performance. + This only makes sense to set when the input is reading offline + data. The possible values include: + - "is": the step-wise importance sampling estimator.
+ - "wis": the weighted step-wise importance sampling estimator. + - "simulation": run the environment in the background, but + use this data for evaluation only and never for learning. + output_creator (func): Function that returns an OutputWriter object + for saving generated experiences. + remote_worker_envs (bool): If using num_envs > 1, whether to create + those new envs in remote processes instead of in the current + process. Adds overhead, but can help if envs are slow to step. + remote_env_batch_wait_ms (float): Timeout that remote workers + are waiting when polling environments. 0 (continue when at + least one env is ready) is a reasonable default, but optimal + value could be obtained by measuring your environment + step / reset and model inference perf. + soft_horizon (bool): Calculate rewards but don't reset the + environment when the horizon is hit. + _fake_sampler (bool): Use a fake (inf speed) sampler for testing. + """ + + global _global_worker + _global_worker = self + + if log_level: + logging.getLogger("ray.rllib").setLevel(log_level) + + if worker_index > 1: + disable_log_once_globally() # only need 1 worker to log + elif log_level == "DEBUG": + enable_periodic_logging() + + env_context = EnvContext(env_config or {}, worker_index) + policy_config = policy_config or {} + self.policy_config = policy_config + self.callbacks = callbacks or {} + self.worker_index = worker_index + model_config = model_config or {} + policy_mapping_fn = (policy_mapping_fn + or (lambda agent_id: DEFAULT_POLICY_ID)) + if not callable(policy_mapping_fn): + raise ValueError( + "Policy mapping function not callable.
If you're using Tune, " + "make sure to escape the function with tune.function() " + "to prevent it from being evaluated as an expression.") + self.env_creator = env_creator + self.sample_batch_size = batch_steps * num_envs + self.batch_mode = batch_mode + self.compress_observations = compress_observations + self.preprocessing_enabled = True + self.last_batch = None + self._fake_sampler = _fake_sampler + + self.env = _validate_env(env_creator(env_context)) + if isinstance(self.env, MultiAgentEnv) or \ + isinstance(self.env, BaseEnv): + + def wrap(env): + return env # we can't auto-wrap these env types + elif is_atari(self.env) and \ + not model_config.get("custom_preprocessor") and \ + preprocessor_pref == "deepmind": + + # Deepmind wrappers already handle all preprocessing + self.preprocessing_enabled = False + + if clip_rewards is None: + clip_rewards = True + + def wrap(env): + env = wrap_deepmind( + env, + dim=model_config.get("dim"), + framestack=model_config.get("framestack")) + if monitor_path: + env = _monitor(env, monitor_path) + return env + else: + + def wrap(env): + if monitor_path: + env = _monitor(env, monitor_path) + return env + + self.env = wrap(self.env) + + def make_env(vector_index): + return wrap( + env_creator( + env_context.copy_with_overrides( + vector_index=vector_index, remote=remote_worker_envs))) + + self.tf_sess = None + policy_dict = _validate_and_canonicalize(policy, self.env) + self.policies_to_train = policies_to_train or list(policy_dict.keys()) + if _has_tensorflow_graph(policy_dict): + if (ray.is_initialized() + and ray.worker._mode() != ray.worker.LOCAL_MODE + and not ray.get_gpu_ids()): + logger.info("Creating policy evaluation worker {}".format( + worker_index) + + " on CPU (please ignore any CUDA init errors)") + with tf.Graph().as_default(): + if tf_session_creator: + self.tf_sess = tf_session_creator() + else: + self.tf_sess = tf.Session( + config=tf.ConfigProto( + gpu_options=tf.GPUOptions(allow_growth=True))) + with 
self.tf_sess.as_default(): + self.policy_map, self.preprocessors = \ + self._build_policy_map(policy_dict, policy_config) + else: + self.policy_map, self.preprocessors = self._build_policy_map( + policy_dict, policy_config) + + self.multiagent = set(self.policy_map.keys()) != {DEFAULT_POLICY_ID} + if self.multiagent: + if not ((isinstance(self.env, MultiAgentEnv) + or isinstance(self.env, ExternalMultiAgentEnv)) + or isinstance(self.env, BaseEnv)): + raise ValueError( + "Have multiple policies {}, but the env ".format( + self.policy_map) + + "{} is not a subclass of BaseEnv, MultiAgentEnv or " + "ExternalMultiAgentEnv?".format(self.env)) + + self.filters = { + policy_id: get_filter(observation_filter, + policy.observation_space.shape) + for (policy_id, policy) in self.policy_map.items() + } + if self.worker_index == 0: + logger.info("Built filter map: {}".format(self.filters)) + + # Always use vector env for consistency even if num_envs = 1 + self.async_env = BaseEnv.to_base_env( + self.env, + make_env=make_env, + num_envs=num_envs, + remote_envs=remote_worker_envs, + remote_env_batch_wait_ms=remote_env_batch_wait_ms) + self.num_envs = num_envs + + if self.batch_mode == "truncate_episodes": + unroll_length = batch_steps + pack_episodes = True + elif self.batch_mode == "complete_episodes": + unroll_length = float("inf") # never cut episodes + pack_episodes = False # sampler will return 1 episode per poll + else: + raise ValueError("Unsupported batch mode: {}".format( + self.batch_mode)) + + self.io_context = IOContext(log_dir, policy_config, worker_index, self) + self.reward_estimators = [] + for method in input_evaluation: + if method == "simulation": + logger.warning( + "Requested 'simulation' input evaluation method: " + "will discard all sampler outputs and keep only metrics.") + sample_async = True + elif method == "is": + ise = ImportanceSamplingEstimator.create(self.io_context) + self.reward_estimators.append(ise) + elif method == "wis": + wise = 
WeightedImportanceSamplingEstimator.create( + self.io_context) + self.reward_estimators.append(wise) + else: + raise ValueError( + "Unknown evaluation method: {}".format(method)) + + if sample_async: + self.sampler = AsyncSampler( + self.async_env, + self.policy_map, + policy_mapping_fn, + self.preprocessors, + self.filters, + clip_rewards, + unroll_length, + self.callbacks, + horizon=episode_horizon, + pack=pack_episodes, + tf_sess=self.tf_sess, + clip_actions=clip_actions, + blackhole_outputs="simulation" in input_evaluation, + soft_horizon=soft_horizon) + self.sampler.start() + else: + self.sampler = SyncSampler( + self.async_env, + self.policy_map, + policy_mapping_fn, + self.preprocessors, + self.filters, + clip_rewards, + unroll_length, + self.callbacks, + horizon=episode_horizon, + pack=pack_episodes, + tf_sess=self.tf_sess, + clip_actions=clip_actions, + soft_horizon=soft_horizon) + + self.input_reader = input_creator(self.io_context) + assert isinstance(self.input_reader, InputReader), self.input_reader + self.output_writer = output_creator(self.io_context) + assert isinstance(self.output_writer, OutputWriter), self.output_writer + + logger.debug( + "Created rollout worker with env {} ({}), policies {}".format( + self.async_env, self.env, self.policy_map)) + + @override(EvaluatorInterface) + def sample(self): + """Evaluate the current policies and return a batch of experiences. + + Return: + SampleBatch|MultiAgentBatch from evaluating the current policies. + """ + + if self._fake_sampler and self.last_batch is not None: + return self.last_batch + + if log_once("sample_start"): + logger.info("Generating sample batch of size {}".format( + self.sample_batch_size)) + + batches = [self.input_reader.next()] + steps_so_far = batches[0].count + + # In truncate_episodes mode, never pull more than 1 batch per env. + # This avoids over-running the target batch size. 
+ if self.batch_mode == "truncate_episodes": + max_batches = self.num_envs + else: + max_batches = float("inf") + + while steps_so_far < self.sample_batch_size and len( + batches) < max_batches: + batch = self.input_reader.next() + steps_so_far += batch.count + batches.append(batch) + batch = batches[0].concat_samples(batches) + + if self.callbacks.get("on_sample_end"): + self.callbacks["on_sample_end"]({"worker": self, "samples": batch}) + + # Always do writes prior to compression for consistency and to allow + # for better compression inside the writer. + self.output_writer.write(batch) + + # Do off-policy estimation if needed + if self.reward_estimators: + for sub_batch in batch.split_by_episode(): + for estimator in self.reward_estimators: + estimator.process(sub_batch) + + if log_once("sample_end"): + logger.info("Completed sample batch:\n\n{}\n".format( + summarize(batch))) + + if self.compress_observations == "bulk": + batch.compress(bulk=True) + elif self.compress_observations: + batch.compress() + + if self._fake_sampler: + self.last_batch = batch + return batch + + @DeveloperAPI + @ray.method(num_return_vals=2) + def sample_with_count(self): + """Same as sample() but returns the count as a separate future.""" + batch = self.sample() + return batch, batch.count + + @override(EvaluatorInterface) + def get_weights(self, policies=None): + if policies is None: + policies = self.policy_map.keys() + return { + pid: policy.get_weights() + for pid, policy in self.policy_map.items() if pid in policies + } + + @override(EvaluatorInterface) + def set_weights(self, weights): + for pid, w in weights.items(): + self.policy_map[pid].set_weights(w) + + @override(EvaluatorInterface) + def compute_gradients(self, samples): + if log_once("compute_gradients"): + logger.info("Compute gradients on:\n\n{}\n".format( + summarize(samples))) + if isinstance(samples, MultiAgentBatch): + grad_out, info_out = {}, {} + if self.tf_sess is not None: + builder = TFRunBuilder(self.tf_sess, 
"compute_gradients") + for pid, batch in samples.policy_batches.items(): + if pid not in self.policies_to_train: + continue + grad_out[pid], info_out[pid] = ( + self.policy_map[pid]._build_compute_gradients( + builder, batch)) + grad_out = {k: builder.get(v) for k, v in grad_out.items()} + info_out = {k: builder.get(v) for k, v in info_out.items()} + else: + for pid, batch in samples.policy_batches.items(): + if pid not in self.policies_to_train: + continue + grad_out[pid], info_out[pid] = ( + self.policy_map[pid].compute_gradients(batch)) + else: + grad_out, info_out = ( + self.policy_map[DEFAULT_POLICY_ID].compute_gradients(samples)) + info_out["batch_count"] = samples.count + if log_once("grad_out"): + logger.info("Compute grad info:\n\n{}\n".format( + summarize(info_out))) + return grad_out, info_out + + @override(EvaluatorInterface) + def apply_gradients(self, grads): + if log_once("apply_gradients"): + logger.info("Apply gradients:\n\n{}\n".format(summarize(grads))) + if isinstance(grads, dict): + if self.tf_sess is not None: + builder = TFRunBuilder(self.tf_sess, "apply_gradients") + outputs = { + pid: self.policy_map[pid]._build_apply_gradients( + builder, grad) + for pid, grad in grads.items() + } + return {k: builder.get(v) for k, v in outputs.items()} + else: + return { + pid: self.policy_map[pid].apply_gradients(g) + for pid, g in grads.items() + } + else: + return self.policy_map[DEFAULT_POLICY_ID].apply_gradients(grads) + + @override(EvaluatorInterface) + def learn_on_batch(self, samples): + if log_once("learn_on_batch"): + logger.info( + "Training on concatenated sample batches:\n\n{}\n".format( + summarize(samples))) + if isinstance(samples, MultiAgentBatch): + info_out = {} + to_fetch = {} + if self.tf_sess is not None: + builder = TFRunBuilder(self.tf_sess, "learn_on_batch") + else: + builder = None + for pid, batch in samples.policy_batches.items(): + if pid not in self.policies_to_train: + continue + policy = self.policy_map[pid] + if builder 
and hasattr(policy, "_build_learn_on_batch"): + to_fetch[pid] = policy._build_learn_on_batch( + builder, batch) + else: + info_out[pid] = policy.learn_on_batch(batch) + info_out.update({k: builder.get(v) for k, v in to_fetch.items()}) + else: + info_out = self.policy_map[DEFAULT_POLICY_ID].learn_on_batch( + samples) + if log_once("learn_out"): + logger.info("Training output:\n\n{}\n".format(summarize(info_out))) + return info_out + + @DeveloperAPI + def get_metrics(self): + """Returns a list of new RolloutMetric objects from evaluation.""" + + out = self.sampler.get_metrics() + for m in self.reward_estimators: + out.extend(m.get_metrics()) + return out + + @DeveloperAPI + def foreach_env(self, func): + """Apply the given function to each underlying env instance.""" + + envs = self.async_env.get_unwrapped() + if not envs: + return [func(self.async_env)] + else: + return [func(e) for e in envs] + + @DeveloperAPI + def get_policy(self, policy_id=DEFAULT_POLICY_ID): + """Return policy for the specified id, or None. + + Arguments: + policy_id (str): id of policy to return. + """ + + return self.policy_map.get(policy_id) + + @DeveloperAPI + def for_policy(self, func, policy_id=DEFAULT_POLICY_ID): + """Apply the given function to the specified policy.""" + + return func(self.policy_map[policy_id]) + + @DeveloperAPI + def foreach_policy(self, func): + """Apply the given function to each (policy, policy_id) tuple.""" + + return [func(policy, pid) for pid, policy in self.policy_map.items()] + + @DeveloperAPI + def foreach_trainable_policy(self, func): + """Apply the given function to each (policy, policy_id) tuple. + + This only applies func to policies in `self.policies_to_train`.""" + + return [ + func(policy, pid) for pid, policy in self.policy_map.items() + if pid in self.policies_to_train + ] + + @DeveloperAPI + def sync_filters(self, new_filters): + """Changes self's filter to given and rebases any accumulated delta. 
+ + Args: + new_filters (dict): Filters with new state to update local copy. + """ + assert all(k in new_filters for k in self.filters) + for k in self.filters: + self.filters[k].sync(new_filters[k]) + + @DeveloperAPI + def get_filters(self, flush_after=False): + """Returns a snapshot of filters. + + Args: + flush_after (bool): Clears the filter buffer state. + + Returns: + return_filters (dict): Dict for serializable filters + """ + return_filters = {} + for k, f in self.filters.items(): + return_filters[k] = f.as_serializable() + if flush_after: + f.clear_buffer() + return return_filters + + @DeveloperAPI + def save(self): + filters = self.get_filters(flush_after=True) + state = { + pid: self.policy_map[pid].get_state() + for pid in self.policy_map + } + return pickle.dumps({"filters": filters, "state": state}) + + @DeveloperAPI + def restore(self, objs): + objs = pickle.loads(objs) + self.sync_filters(objs["filters"]) + for pid, state in objs["state"].items(): + self.policy_map[pid].set_state(state) + + @DeveloperAPI + def set_global_vars(self, global_vars): + self.foreach_policy(lambda p, _: p.on_global_var_update(global_vars)) + + @DeveloperAPI + def export_policy_model(self, export_dir, policy_id=DEFAULT_POLICY_ID): + self.policy_map[policy_id].export_model(export_dir) + + @DeveloperAPI + def export_policy_checkpoint(self, + export_dir, + filename_prefix="model", + policy_id=DEFAULT_POLICY_ID): + self.policy_map[policy_id].export_checkpoint(export_dir, + filename_prefix) + + @DeveloperAPI + def stop(self): + self.async_env.stop() + + def _build_policy_map(self, policy_dict, policy_config): + policy_map = {} + preprocessors = {} + for name, (cls, obs_space, act_space, + conf) in sorted(policy_dict.items()): + logger.debug("Creating policy for {}".format(name)) + merged_conf = merge_dicts(policy_config, conf) + if self.preprocessing_enabled: + preprocessor = ModelCatalog.get_preprocessor_for_space( + obs_space, merged_conf.get("model")) + preprocessors[name] = 
preprocessor + obs_space = preprocessor.observation_space + else: + preprocessors[name] = NoPreprocessor(obs_space) + if isinstance(obs_space, gym.spaces.Dict) or \ + isinstance(obs_space, gym.spaces.Tuple): + raise ValueError( + "Found raw Tuple|Dict space as input to policy. " + "Please preprocess these observations with a " + "Tuple|DictFlatteningPreprocessor.") + if tf: + with tf.variable_scope(name): + policy_map[name] = cls(obs_space, act_space, merged_conf) + else: + policy_map[name] = cls(obs_space, act_space, merged_conf) + if self.worker_index == 0: + logger.info("Built policy map: {}".format(policy_map)) + logger.info("Built preprocessor map: {}".format(preprocessors)) + return policy_map, preprocessors + + def __del__(self): + if hasattr(self, "sampler") and isinstance(self.sampler, AsyncSampler): + self.sampler.shutdown = True + + +def _validate_and_canonicalize(policy, env): + if isinstance(policy, dict): + _validate_multiagent_config(policy) + return policy + elif not issubclass(policy, Policy): + raise ValueError("policy must be a rllib.Policy class") + else: + if (isinstance(env, MultiAgentEnv) + and not hasattr(env, "observation_space")): + raise ValueError( + "MultiAgentEnv must have observation_space defined if run " + "in a single-agent configuration.") + return { + DEFAULT_POLICY_ID: (policy, env.observation_space, + env.action_space, {}) + } + + +def _validate_multiagent_config(policy, allow_none_graph=False): + for k, v in policy.items(): + if not isinstance(k, str): + raise ValueError("policy keys must be strs, got {}".format( + type(k))) + if not isinstance(v, tuple) or len(v) != 4: + raise ValueError( + "policy values must be tuples of " + "(cls, obs_space, action_space, config), got {}".format(v)) + if allow_none_graph and v[0] is None: + pass + elif not issubclass(v[0], Policy): + raise ValueError("policy tuple value 0 must be a rllib.Policy " + "class or None, got {}".format(v[0])) + if not isinstance(v[1], gym.Space): + raise 
ValueError( + "policy tuple value 1 (observation_space) must be a " + "gym.Space, got {}".format(type(v[1]))) + if not isinstance(v[2], gym.Space): + raise ValueError("policy tuple value 2 (action_space) must be a " + "gym.Space, got {}".format(type(v[2]))) + if not isinstance(v[3], dict): + raise ValueError("policy tuple value 3 (config) must be a dict, " + "got {}".format(type(v[3]))) + + +def _validate_env(env): + # allow this as a special case (assumed gym.Env) + if hasattr(env, "observation_space") and hasattr(env, "action_space"): + return env + + allowed_types = [gym.Env, MultiAgentEnv, ExternalEnv, VectorEnv, BaseEnv] + if not any(isinstance(env, tpe) for tpe in allowed_types): + raise ValueError( + "Returned env should be an instance of gym.Env, MultiAgentEnv, " + "ExternalEnv, VectorEnv, or BaseEnv. The provided env creator " + "function returned {} ({}).".format(env, type(env))) + return env + + +def _monitor(env, path): + return gym.wrappers.Monitor(env, path, resume=True) + + +def _has_tensorflow_graph(policy_dict): + for policy, _, _, _ in policy_dict.values(): + if issubclass(policy, TFPolicy): + return True + return False diff --git a/python/ray/rllib/evaluation/worker_set.py b/python/ray/rllib/evaluation/worker_set.py new file mode 100644 index 000000000..90d3c13c2 --- /dev/null +++ b/python/ray/rllib/evaluation/worker_set.py @@ -0,0 +1,214 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import logging +from types import FunctionType + +from ray.rllib.utils.annotations import DeveloperAPI +from ray.rllib.evaluation.rollout_worker import RolloutWorker, \ + _validate_multiagent_config +from ray.rllib.offline import NoopOutput, JsonReader, MixedInput, JsonWriter, \ + ShuffledInput +from ray.rllib.utils import merge_dicts, try_import_tf +from ray.rllib.utils.memory import ray_get_and_free + +tf = try_import_tf() + +logger = logging.getLogger(__name__) + + +@DeveloperAPI +class 
WorkerSet(object): + """Represents a set of RolloutWorkers. + + There must be one local worker copy, and zero or more remote workers. + """ + + def __init__(self, + env_creator, + policy, + trainer_config=None, + num_workers=0, + logdir=None, + _setup=True): + """Create a new WorkerSet and initialize its workers. + + Arguments: + env_creator (func): Function that returns env given env config. + policy (cls): rllib.policy.Policy class. + trainer_config (dict): Optional dict that extends the common + config of the Trainer class. + num_workers (int): Number of remote rollout workers to create. + logdir (str): Optional logging directory for workers. + _setup (bool): Whether to setup workers. This is only for testing. + """ + + if not trainer_config: + from ray.rllib.agents.trainer import COMMON_CONFIG + trainer_config = COMMON_CONFIG + + self._env_creator = env_creator + self._policy = policy + self._remote_config = trainer_config + self._num_workers = num_workers + self._logdir = logdir + + if _setup: + self._local_config = merge_dicts( + trainer_config, + {"tf_session_args": trainer_config["local_tf_session_args"]}) + + # Always create a local worker + self._local_worker = self._make_worker( + RolloutWorker, env_creator, policy, 0, self._local_config) + + # Create a number of remote workers + self._remote_workers = [] + self.add_workers(num_workers) + + def local_worker(self): + """Return the local rollout worker.""" + return self._local_worker + + def remote_workers(self): + """Return a list of remote rollout workers.""" + return self._remote_workers + + def add_workers(self, num_workers): + """Create and add a number of remote workers to this worker set.""" + remote_args = { + "num_cpus": self._remote_config["num_cpus_per_worker"], + "num_gpus": self._remote_config["num_gpus_per_worker"], + "resources": self._remote_config["custom_resources_per_worker"], + } + cls = RolloutWorker.as_remote(**remote_args).remote + self._remote_workers.extend([ + 
self._make_worker(cls, self._env_creator, self._policy, i + 1, + self._remote_config) for i in range(num_workers) + ]) + + def reset(self, new_remote_workers): + """Called to change the set of remote workers.""" + self._remote_workers = new_remote_workers + + def stop(self): + """Stop all rollout workers.""" + self.local_worker().stop() + for w in self.remote_workers(): + w.stop.remote() + w.__ray_terminate__.remote() + + @DeveloperAPI + def foreach_worker(self, func): + """Apply the given function to each worker instance.""" + + local_result = [func(self.local_worker())] + remote_results = ray_get_and_free( + [w.apply.remote(func) for w in self.remote_workers()]) + return local_result + remote_results + + @DeveloperAPI + def foreach_worker_with_index(self, func): + """Apply the given function to each worker instance. + + The index will be passed as the second arg to the given function. + """ + + local_result = [func(self.local_worker(), 0)] + remote_results = ray_get_and_free([ + w.apply.remote(func, i + 1) + for i, w in enumerate(self.remote_workers()) + ]) + return local_result + remote_results + + @staticmethod + def _from_existing(local_worker, remote_workers=None): + workers = WorkerSet(None, None, {}, _setup=False) + workers._local_worker = local_worker + workers._remote_workers = remote_workers or [] + return workers + + def _make_worker(self, cls, env_creator, policy, worker_index, config): + def session_creator(): + logger.debug("Creating TF session {}".format( + config["tf_session_args"])) + return tf.Session( + config=tf.ConfigProto(**config["tf_session_args"])) + + if isinstance(config["input"], FunctionType): + input_creator = config["input"] + elif config["input"] == "sampler": + input_creator = (lambda ioctx: ioctx.default_sampler_input()) + elif isinstance(config["input"], dict): + input_creator = (lambda ioctx: ShuffledInput( + MixedInput(config["input"], ioctx), config[ + "shuffle_buffer_size"])) + else: + input_creator = (lambda ioctx: 
ShuffledInput( + JsonReader(config["input"], ioctx), config[ + "shuffle_buffer_size"])) + + if isinstance(config["output"], FunctionType): + output_creator = config["output"] + elif config["output"] is None: + output_creator = (lambda ioctx: NoopOutput()) + elif config["output"] == "logdir": + output_creator = (lambda ioctx: JsonWriter( + ioctx.log_dir, + ioctx, + max_file_size=config["output_max_file_size"], + compress_columns=config["output_compress_columns"])) + else: + output_creator = (lambda ioctx: JsonWriter( + config["output"], + ioctx, + max_file_size=config["output_max_file_size"], + compress_columns=config["output_compress_columns"])) + + if config["input"] == "sampler": + input_evaluation = [] + else: + input_evaluation = config["input_evaluation"] + + # Fill in the default policy if 'None' is specified in multiagent + if config["multiagent"]["policies"]: + tmp = config["multiagent"]["policies"] + _validate_multiagent_config(tmp, allow_none_graph=True) + for k, v in tmp.items(): + if v[0] is None: + tmp[k] = (policy, v[1], v[2], v[3]) + policy = tmp + + return cls( + env_creator, + policy, + policy_mapping_fn=config["multiagent"]["policy_mapping_fn"], + policies_to_train=config["multiagent"]["policies_to_train"], + tf_session_creator=(session_creator + if config["tf_session_args"] else None), + batch_steps=config["sample_batch_size"], + batch_mode=config["batch_mode"], + episode_horizon=config["horizon"], + preprocessor_pref=config["preprocessor_pref"], + sample_async=config["sample_async"], + compress_observations=config["compress_observations"], + num_envs=config["num_envs_per_worker"], + observation_filter=config["observation_filter"], + clip_rewards=config["clip_rewards"], + clip_actions=config["clip_actions"], + env_config=config["env_config"], + model_config=config["model"], + policy_config=config, + worker_index=worker_index, + monitor_path=self._logdir if config["monitor"] else None, + log_dir=self._logdir, + log_level=config["log_level"], + 
callbacks=config["callbacks"], + input_creator=input_creator, + input_evaluation=input_evaluation, + output_creator=output_creator, + remote_worker_envs=config["remote_worker_envs"], + remote_env_batch_wait_ms=config["remote_env_batch_wait_ms"], + soft_horizon=config["soft_horizon"], + _fake_sampler=config.get("_fake_sampler", False)) diff --git a/python/ray/rllib/examples/multiagent_two_trainers.py b/python/ray/rllib/examples/multiagent_two_trainers.py index 68c0e742e..cdac4a2fd 100644 --- a/python/ray/rllib/examples/multiagent_two_trainers.py +++ b/python/ray/rllib/examples/multiagent_two_trainers.py @@ -75,7 +75,7 @@ if __name__ == "__main__": }) # disable DQN exploration when used by the PPO trainer - ppo_trainer.optimizer.foreach_evaluator( + ppo_trainer.workers.foreach_worker( lambda ev: ev.for_policy( lambda pi: pi.set_epsilon(0.0), policy_id="dqn_policy")) diff --git a/python/ray/rllib/examples/policy_evaluator_custom_workflow.py b/python/ray/rllib/examples/rollout_worker_custom_workflow.py similarity index 90% rename from python/ray/rllib/examples/policy_evaluator_custom_workflow.py rename to python/ray/rllib/examples/rollout_worker_custom_workflow.py index a8d80da99..fd1adc851 100644 --- a/python/ray/rllib/examples/policy_evaluator_custom_workflow.py +++ b/python/ray/rllib/examples/rollout_worker_custom_workflow.py @@ -1,4 +1,4 @@ -"""Example of using policy evaluator classes directly to implement training. +"""Example of using rollout worker classes directly to implement training. 
Instead of using the built-in Trainer classes provided by RLlib, here we define a custom Policy class and manually coordinate distributed sample @@ -15,7 +15,7 @@ import gym import ray from ray import tune from ray.rllib.policy import Policy -from ray.rllib.evaluation import PolicyEvaluator, SampleBatch +from ray.rllib.evaluation import RolloutWorker, SampleBatch from ray.rllib.evaluation.metrics import collect_metrics parser = argparse.ArgumentParser() @@ -67,8 +67,8 @@ def training_workflow(config, reporter): env = gym.make("CartPole-v0") policy = CustomPolicy(env.observation_space, env.action_space, {}) workers = [ - PolicyEvaluator.as_remote().remote(lambda c: gym.make("CartPole-v0"), - CustomPolicy) + RolloutWorker.as_remote().remote(lambda c: gym.make("CartPole-v0"), + CustomPolicy) for _ in range(config["num_workers"]) ] @@ -97,7 +97,7 @@ def training_workflow(config, reporter): # Do some arbitrary updates based on the T2 batch policy.update_some_value(sum(T2["rewards"])) - reporter(**collect_metrics(remote_evaluators=workers)) + reporter(**collect_metrics(remote_workers=workers)) if __name__ == "__main__": diff --git a/python/ray/rllib/offline/io_context.py b/python/ray/rllib/offline/io_context.py index 187c02f9c..58f7f03c5 100644 --- a/python/ray/rllib/offline/io_context.py +++ b/python/ray/rllib/offline/io_context.py @@ -18,20 +18,16 @@ class IOContext(object): config (dict): Configuration of the agent. worker_index (int): When there are multiple workers created, this uniquely identifies the current worker. - evaluator (PolicyEvaluator): policy evaluator object reference. + worker (RolloutWorker): rollout worker object reference. 
""" @PublicAPI - def __init__(self, - log_dir=None, - config=None, - worker_index=0, - evaluator=None): + def __init__(self, log_dir=None, config=None, worker_index=0, worker=None): self.log_dir = log_dir or os.getcwd() self.config = config or {} self.worker_index = worker_index - self.evaluator = evaluator + self.worker = worker @PublicAPI def default_sampler_input(self): - return self.evaluator.sampler + return self.worker.sampler diff --git a/python/ray/rllib/offline/json_reader.py b/python/ray/rllib/offline/json_reader.py index 55a002fb3..35d28669d 100644 --- a/python/ray/rllib/offline/json_reader.py +++ b/python/ray/rllib/offline/json_reader.py @@ -88,7 +88,7 @@ class JsonReader(InputReader): if isinstance(batch, SampleBatch): out = [] for sub_batch in batch.split_by_episode(): - out.append(self.ioctx.evaluator.policy_map[DEFAULT_POLICY_ID] + out.append(self.ioctx.worker.policy_map[DEFAULT_POLICY_ID] .postprocess_trajectory(sub_batch)) return SampleBatch.concat_samples(out) else: diff --git a/python/ray/rllib/offline/off_policy_estimator.py b/python/ray/rllib/offline/off_policy_estimator.py index 7534e667f..9d369f715 100644 --- a/python/ray/rllib/offline/off_policy_estimator.py +++ b/python/ray/rllib/offline/off_policy_estimator.py @@ -33,14 +33,14 @@ class OffPolicyEstimator(object): @classmethod def create(cls, ioctx): """Create an off-policy estimator from a IOContext.""" - gamma = ioctx.evaluator.policy_config["gamma"] + gamma = ioctx.worker.policy_config["gamma"] # Grab a reference to the current model - keys = list(ioctx.evaluator.policy_map.keys()) + keys = list(ioctx.worker.policy_map.keys()) if len(keys) > 1: raise NotImplementedError( "Off-policy estimation is not implemented for multi-agent. 
" "You can set `input_evaluation: []` to resolve this.") - policy = ioctx.evaluator.get_policy(keys[0]) + policy = ioctx.worker.get_policy(keys[0]) return cls(policy, gamma) @DeveloperAPI diff --git a/python/ray/rllib/optimizers/aso_aggregator.py b/python/ray/rllib/optimizers/aso_aggregator.py index c2ecb6ed1..bc7c75bbf 100644 --- a/python/ray/rllib/optimizers/aso_aggregator.py +++ b/python/ray/rllib/optimizers/aso_aggregator.py @@ -14,7 +14,7 @@ from ray.rllib.utils.memory import ray_get_and_free class Aggregator(object): - """An aggregator collects and processes samples from evaluators. + """An aggregator collects and processes samples from workers. This class is used to abstract away the strategy for sample collection. For example, you may want to use a tree of actors to collect samples. The @@ -22,21 +22,21 @@ class Aggregator(object): as concatenating and decompressing sample batches. Attributes: - local_evaluator: local PolicyEvaluator copy + local_worker: local RolloutWorker copy """ def iter_train_batches(self): """Returns a generator over batches ready to learn on. Iterating through this generator will also send out weight updates to - remote evaluators as needed. + remote workers as needed. This call may block until results are available. 
""" raise NotImplementedError def broadcast_new_weights(self): - """Broadcast a new set of weights from the local evaluator.""" + """Broadcast a new set of weights from the local workers.""" raise NotImplementedError def should_broadcast(self): @@ -47,19 +47,19 @@ class Aggregator(object): """Returns runtime statistics for debugging.""" raise NotImplementedError - def reset(self, remote_evaluators): - """Called to change the set of remote evaluators being used.""" + def reset(self, remote_workers): + """Called to change the set of remote workers being used.""" raise NotImplementedError class AggregationWorkerBase(object): """Aggregators should extend from this class.""" - def __init__(self, initial_weights_obj_id, remote_evaluators, + def __init__(self, initial_weights_obj_id, remote_workers, max_sample_requests_in_flight_per_worker, replay_proportion, replay_buffer_num_slots, train_batch_size, sample_batch_size): self.broadcasted_weights = initial_weights_obj_id - self.remote_evaluators = remote_evaluators + self.remote_workers = remote_workers self.sample_batch_size = sample_batch_size self.train_batch_size = train_batch_size @@ -73,7 +73,7 @@ class AggregationWorkerBase(object): # Kick off async background sampling self.sample_tasks = TaskPool() - for ev in self.remote_evaluators: + for ev in self.remote_workers: ev.set_weights.remote(self.broadcasted_weights) for _ in range(max_sample_requests_in_flight_per_worker): self.sample_tasks.add(ev, ev.sample.remote()) @@ -138,8 +138,8 @@ class AggregationWorkerBase(object): } @override(Aggregator) - def reset(self, remote_evaluators): - self.sample_tasks.reset_evaluators(remote_evaluators) + def reset(self, remote_workers): + self.sample_tasks.reset_workers(remote_workers) def _augment_with_replay(self, sample_futures): def can_replay(): @@ -164,25 +164,25 @@ class SimpleAggregator(AggregationWorkerBase, Aggregator): """Simple single-threaded implementation of an Aggregator.""" def __init__(self, - local_evaluator, - 
remote_evaluators, + workers, max_sample_requests_in_flight_per_worker=2, replay_proportion=0.0, replay_buffer_num_slots=0, train_batch_size=500, sample_batch_size=50, broadcast_interval=5): - self.local_evaluator = local_evaluator + self.workers = workers + self.local_worker = workers.local_worker() self.broadcast_interval = broadcast_interval self.broadcast_new_weights() AggregationWorkerBase.__init__( - self, self.broadcasted_weights, remote_evaluators, + self, self.broadcasted_weights, self.workers.remote_workers(), max_sample_requests_in_flight_per_worker, replay_proportion, replay_buffer_num_slots, train_batch_size, sample_batch_size) @override(Aggregator) def broadcast_new_weights(self): - self.broadcasted_weights = ray.put(self.local_evaluator.get_weights()) + self.broadcasted_weights = ray.put(self.local_worker.get_weights()) self.num_sent_since_broadcast = 0 @override(Aggregator) diff --git a/python/ray/rllib/optimizers/aso_learner.py b/python/ray/rllib/optimizers/aso_learner.py index 3bf87f660..74980bdf0 100644 --- a/python/ray/rllib/optimizers/aso_learner.py +++ b/python/ray/rllib/optimizers/aso_learner.py @@ -25,11 +25,11 @@ class LearnerThread(threading.Thread): improves overall throughput. 
""" - def __init__(self, local_evaluator, minibatch_buffer_size, num_sgd_iter, + def __init__(self, local_worker, minibatch_buffer_size, num_sgd_iter, learner_queue_size): threading.Thread.__init__(self) self.learner_queue_size = WindowStat("size", 50) - self.local_evaluator = local_evaluator + self.local_worker = local_worker self.inqueue = queue.Queue(maxsize=learner_queue_size) self.outqueue = queue.Queue() self.minibatch_buffer = MinibatchBuffer( @@ -52,7 +52,7 @@ class LearnerThread(threading.Thread): batch, _ = self.minibatch_buffer.get() with self.grad_timer: - fetches = self.local_evaluator.learn_on_batch(batch) + fetches = self.local_worker.learn_on_batch(batch) self.weights_updated = True self.stats = get_learner_stats(fetches) diff --git a/python/ray/rllib/optimizers/aso_multi_gpu_learner.py b/python/ray/rllib/optimizers/aso_multi_gpu_learner.py index b5040e455..78058da44 100644 --- a/python/ray/rllib/optimizers/aso_multi_gpu_learner.py +++ b/python/ray/rllib/optimizers/aso_multi_gpu_learner.py @@ -31,7 +31,7 @@ class TFMultiGPULearner(LearnerThread): """ def __init__(self, - local_evaluator, + local_worker, num_gpus=1, lr=0.0005, train_batch_size=500, @@ -41,7 +41,7 @@ class TFMultiGPULearner(LearnerThread): learner_queue_size=16, num_data_load_threads=16, _fake_gpus=False): - LearnerThread.__init__(self, local_evaluator, minibatch_buffer_size, + LearnerThread.__init__(self, local_worker, minibatch_buffer_size, num_sgd_iter, learner_queue_size) self.lr = lr self.train_batch_size = train_batch_size @@ -59,16 +59,16 @@ class TFMultiGPULearner(LearnerThread): assert self.train_batch_size % len(self.devices) == 0 assert self.train_batch_size >= len(self.devices), "batch too small" - if set(self.local_evaluator.policy_map.keys()) != {DEFAULT_POLICY_ID}: + if set(self.local_worker.policy_map.keys()) != {DEFAULT_POLICY_ID}: raise NotImplementedError("Multi-gpu mode for multi-agent") - self.policy = self.local_evaluator.policy_map[DEFAULT_POLICY_ID] + 
self.policy = self.local_worker.policy_map[DEFAULT_POLICY_ID] # per-GPU graph copies created below must share vars with the policy # reuse is set to AUTO_REUSE because Adam nodes are created after # all of the device copies are created. self.par_opt = [] - with self.local_evaluator.tf_sess.graph.as_default(): - with self.local_evaluator.tf_sess.as_default(): + with self.local_worker.tf_sess.graph.as_default(): + with self.local_worker.tf_sess.as_default(): with tf.variable_scope(DEFAULT_POLICY_ID, reuse=tf.AUTO_REUSE): if self.policy._state_inputs: rnn_inputs = self.policy._state_inputs + [ @@ -87,7 +87,7 @@ class TFMultiGPULearner(LearnerThread): 999999, # it will get rounded down self.policy.copy)) - self.sess = self.local_evaluator.tf_sess + self.sess = self.local_worker.tf_sess self.sess.run(tf.global_variables_initializer()) self.idle_optimizers = queue.Queue() diff --git a/python/ray/rllib/optimizers/aso_tree_aggregator.py b/python/ray/rllib/optimizers/aso_tree_aggregator.py index cf51bce25..75677e313 100644 --- a/python/ray/rllib/optimizers/aso_tree_aggregator.py +++ b/python/ray/rllib/optimizers/aso_tree_aggregator.py @@ -22,15 +22,14 @@ logger = logging.getLogger(__name__) class TreeAggregator(Aggregator): """A hierarchical experiences aggregator. - The given set of remote evaluators is divided into subsets and assigned to + The given set of remote workers is divided into subsets and assigned to one of several aggregation workers. These aggregation workers collate experiences into batches of size `train_batch_size` and we collect them in this class when `iter_train_batches` is called. 
""" def __init__(self, - local_evaluator, - remote_evaluators, + workers, num_aggregation_workers, max_sample_requests_in_flight_per_worker=2, replay_proportion=0.0, @@ -38,8 +37,7 @@ class TreeAggregator(Aggregator): train_batch_size=500, sample_batch_size=50, broadcast_interval=5): - self.local_evaluator = local_evaluator - self.remote_evaluators = remote_evaluators + self.workers = workers self.num_aggregation_workers = num_aggregation_workers self.max_sample_requests_in_flight_per_worker = \ max_sample_requests_in_flight_per_worker @@ -48,7 +46,8 @@ class TreeAggregator(Aggregator): self.sample_batch_size = sample_batch_size self.train_batch_size = train_batch_size self.broadcast_interval = broadcast_interval - self.broadcasted_weights = ray.put(local_evaluator.get_weights()) + self.broadcasted_weights = ray.put( + workers.local_worker().get_weights()) self.num_batches_processed = 0 self.num_broadcasts = 0 self.num_sent_since_broadcast = 0 @@ -58,26 +57,27 @@ class TreeAggregator(Aggregator): """Deferred init so that we can pass in previously created workers.""" assert len(aggregators) == self.num_aggregation_workers, aggregators - if len(self.remote_evaluators) < self.num_aggregation_workers: + if len(self.workers.remote_workers()) < self.num_aggregation_workers: raise ValueError( "The number of aggregation workers should not exceed the " "number of total evaluation workers ({} vs {})".format( - self.num_aggregation_workers, len(self.remote_evaluators))) + self.num_aggregation_workers, + len(self.workers.remote_workers()))) - assigned_evaluators = collections.defaultdict(list) - for i, ev in enumerate(self.remote_evaluators): - assigned_evaluators[i % self.num_aggregation_workers].append(ev) + assigned_workers = collections.defaultdict(list) + for i, ev in enumerate(self.workers.remote_workers()): + assigned_workers[i % self.num_aggregation_workers].append(ev) - self.workers = aggregators - for i, worker in enumerate(self.workers): - worker.init.remote( - 
self.broadcasted_weights, assigned_evaluators[i], - self.max_sample_requests_in_flight_per_worker, - self.replay_proportion, self.replay_buffer_num_slots, - self.train_batch_size, self.sample_batch_size) + self.aggregators = aggregators + for i, agg in enumerate(self.aggregators): + agg.init.remote(self.broadcasted_weights, assigned_workers[i], + self.max_sample_requests_in_flight_per_worker, + self.replay_proportion, + self.replay_buffer_num_slots, + self.train_batch_size, self.sample_batch_size) self.agg_tasks = TaskPool() - for agg in self.workers: + for agg in self.aggregators: agg.set_weights.remote(self.broadcasted_weights) self.agg_tasks.add(agg, agg.get_train_batches.remote()) @@ -96,7 +96,8 @@ class TreeAggregator(Aggregator): @override(Aggregator) def broadcast_new_weights(self): - self.broadcasted_weights = ray.put(self.local_evaluator.get_weights()) + self.broadcasted_weights = ray.put( + self.workers.local_worker().get_weights()) self.num_sent_since_broadcast = 0 self.num_broadcasts += 1 @@ -112,8 +113,8 @@ class TreeAggregator(Aggregator): } @override(Aggregator) - def reset(self, remote_evaluators): - raise NotImplementedError("changing number of remote evaluators") + def reset(self, remote_workers): + raise NotImplementedError("changing number of remote workers") @staticmethod def precreate_aggregators(n): @@ -125,16 +126,16 @@ class AggregationWorker(AggregationWorkerBase): def __init__(self): self.initialized = False - def init(self, initial_weights_obj_id, remote_evaluators, + def init(self, initial_weights_obj_id, remote_workers, max_sample_requests_in_flight_per_worker, replay_proportion, replay_buffer_num_slots, train_batch_size, sample_batch_size): """Deferred init that assigns sub-workers to this aggregator.""" - logger.info("Assigned evaluators {} to aggregation worker {}".format( - remote_evaluators, self)) - assert remote_evaluators + logger.info("Assigned workers {} to aggregation worker {}".format( + remote_workers, self)) + assert 
remote_workers AggregationWorkerBase.__init__( - self, initial_weights_obj_id, remote_evaluators, + self, initial_weights_obj_id, remote_workers, max_sample_requests_in_flight_per_worker, replay_proportion, replay_buffer_num_slots, train_batch_size, sample_batch_size) self.initialized = True diff --git a/python/ray/rllib/optimizers/async_gradients_optimizer.py b/python/ray/rllib/optimizers/async_gradients_optimizer.py index 2b46e1259..05f266b66 100644 --- a/python/ray/rllib/optimizers/async_gradients_optimizer.py +++ b/python/ray/rllib/optimizers/async_gradients_optimizer.py @@ -14,30 +14,30 @@ class AsyncGradientsOptimizer(PolicyOptimizer): """An asynchronous RL optimizer, e.g. for implementing A3C. This optimizer asynchronously pulls and applies gradients from remote - evaluators, sending updated weights back as needed. This pipelines the + workers, sending updated weights back as needed. This pipelines the gradient computations on the remote workers. """ - def __init__(self, local_evaluator, remote_evaluators, grads_per_step=100): - PolicyOptimizer.__init__(self, local_evaluator, remote_evaluators) + def __init__(self, workers, grads_per_step=100): + PolicyOptimizer.__init__(self, workers) self.apply_timer = TimerStat() self.wait_timer = TimerStat() self.dispatch_timer = TimerStat() self.grads_per_step = grads_per_step self.learner_stats = {} - if not self.remote_evaluators: + if not self.workers.remote_workers(): raise ValueError( - "Async optimizer requires at least 1 remote evaluator") + "Async optimizer requires at least 1 remote workers") @override(PolicyOptimizer) def step(self): - weights = ray.put(self.local_evaluator.get_weights()) + weights = ray.put(self.workers.local_worker().get_weights()) pending_gradients = {} num_gradients = 0 # Kick off the first wave of async tasks - for e in self.remote_evaluators: + for e in self.workers.remote_workers(): e.set_weights.remote(weights) future = e.compute_gradients.remote(e.sample.remote()) 
pending_gradients[future] = e @@ -56,13 +56,14 @@ class AsyncGradientsOptimizer(PolicyOptimizer): if gradient is not None: with self.apply_timer: - self.local_evaluator.apply_gradients(gradient) + self.workers.local_worker().apply_gradients(gradient) self.num_steps_sampled += info["batch_count"] self.num_steps_trained += info["batch_count"] if num_gradients < self.grads_per_step: with self.dispatch_timer: - e.set_weights.remote(self.local_evaluator.get_weights()) + e.set_weights.remote( + self.workers.local_worker().get_weights()) future = e.compute_gradients.remote(e.sample.remote()) pending_gradients[future] = e diff --git a/python/ray/rllib/optimizers/async_replay_optimizer.py b/python/ray/rllib/optimizers/async_replay_optimizer.py index d66f942ae..0b99cef2d 100644 --- a/python/ray/rllib/optimizers/async_replay_optimizer.py +++ b/python/ray/rllib/optimizers/async_replay_optimizer.py @@ -36,20 +36,19 @@ class AsyncReplayOptimizer(PolicyOptimizer): """Main event loop of the Ape-X optimizer (async sampling with replay). This class coordinates the data transfers between the learner thread, - remote evaluators (Ape-X actors), and replay buffer actors. + remote workers (Ape-X actors), and replay buffer actors. This has two modes of operation: - normal replay: replays independent samples. - batch replay: simplified mode where entire sample batches are replayed. This supports RNNs, but not prioritization. - This optimizer requires that policy evaluators return an additional + This optimizer requires that rollout workers return an additional "td_error" array in the info return of compute_gradients(). 
This error term will be used for sample prioritization.""" def __init__(self, - local_evaluator, - remote_evaluators, + workers, learning_starts=1000, buffer_size=10000, prioritized_replay=True, @@ -62,7 +61,7 @@ class AsyncReplayOptimizer(PolicyOptimizer): max_weight_sync_delay=400, debug=False, batch_replay=False): - PolicyOptimizer.__init__(self, local_evaluator, remote_evaluators) + PolicyOptimizer.__init__(self, workers) self.debug = debug self.batch_replay = batch_replay @@ -71,7 +70,7 @@ class AsyncReplayOptimizer(PolicyOptimizer): self.prioritized_replay_eps = prioritized_replay_eps self.max_weight_sync_delay = max_weight_sync_delay - self.learner = LearnerThread(self.local_evaluator) + self.learner = LearnerThread(self.workers.local_worker()) self.learner.start() if self.batch_replay: @@ -111,13 +110,13 @@ class AsyncReplayOptimizer(PolicyOptimizer): # Kick off async background sampling self.sample_tasks = TaskPool() - if self.remote_evaluators: - self._set_evaluators(self.remote_evaluators) + if self.workers.remote_workers(): + self._set_workers(self.workers.remote_workers()) @override(PolicyOptimizer) def step(self): assert self.learner.is_alive() - assert len(self.remote_evaluators) > 0 + assert len(self.workers.remote_workers()) > 0 start = time.time() sample_timesteps, train_timesteps = self._step() time_delta = time.time() - start @@ -138,9 +137,9 @@ class AsyncReplayOptimizer(PolicyOptimizer): self.learner.stopped = True @override(PolicyOptimizer) - def reset(self, remote_evaluators): - self.remote_evaluators = remote_evaluators - self.sample_tasks.reset_evaluators(remote_evaluators) + def reset(self, remote_workers): + self.workers.reset(remote_workers) + self.sample_tasks.reset_workers(remote_workers) @override(PolicyOptimizer) def stats(self): @@ -175,10 +174,10 @@ class AsyncReplayOptimizer(PolicyOptimizer): return dict(PolicyOptimizer.stats(self), **stats) # For https://github.com/ray-project/ray/issues/2541 only - def _set_evaluators(self, 
remote_evaluators): - self.remote_evaluators = remote_evaluators - weights = self.local_evaluator.get_weights() - for ev in self.remote_evaluators: + def _set_workers(self, remote_workers): + self.workers.reset(remote_workers) + weights = self.workers.local_worker().get_weights() + for ev in self.workers.remote_workers(): ev.set_weights.remote(weights) self.steps_since_update[ev] = 0 for _ in range(SAMPLE_QUEUE_DEPTH): @@ -207,7 +206,7 @@ class AsyncReplayOptimizer(PolicyOptimizer): self.learner.weights_updated = False with self.timers["put_weights"]: weights = ray.put( - self.local_evaluator.get_weights()) + self.workers.local_worker().get_weights()) ev.set_weights.remote(weights) self.num_weight_syncs += 1 self.steps_since_update[ev] = 0 @@ -380,10 +379,10 @@ class LearnerThread(threading.Thread): improves overall throughput. """ - def __init__(self, local_evaluator): + def __init__(self, local_worker): threading.Thread.__init__(self) self.learner_queue_size = WindowStat("size", 50) - self.local_evaluator = local_evaluator + self.local_worker = local_worker self.inqueue = queue.Queue(maxsize=LEARNER_QUEUE_MAX_SIZE) self.outqueue = queue.Queue() self.queue_timer = TimerStat() @@ -403,7 +402,7 @@ class LearnerThread(threading.Thread): if replay is not None: prio_dict = {} with self.grad_timer: - grad_out = self.local_evaluator.learn_on_batch(replay) + grad_out = self.local_worker.learn_on_batch(replay) for pid, info in grad_out.items(): prio_dict[pid] = ( replay.policy_batches[pid].data.get("batch_indexes"), diff --git a/python/ray/rllib/optimizers/async_samples_optimizer.py b/python/ray/rllib/optimizers/async_samples_optimizer.py index e2ff320e6..1e3afb8fb 100644 --- a/python/ray/rllib/optimizers/async_samples_optimizer.py +++ b/python/ray/rllib/optimizers/async_samples_optimizer.py @@ -24,12 +24,11 @@ class AsyncSamplesOptimizer(PolicyOptimizer): """Main event loop of the IMPALA architecture. 
This class coordinates the data transfers between the learner thread - and remote evaluators (IMPALA actors). + and remote workers (IMPALA actors). """ def __init__(self, - local_evaluator, - remote_evaluators, + workers, train_batch_size=500, sample_batch_size=50, num_envs_per_worker=1, @@ -45,7 +44,7 @@ class AsyncSamplesOptimizer(PolicyOptimizer): learner_queue_size=16, num_aggregation_workers=0, _fake_gpus=False): - PolicyOptimizer.__init__(self, local_evaluator, remote_evaluators) + PolicyOptimizer.__init__(self, workers) self._stats_start_time = time.time() self._last_stats_time = {} @@ -62,7 +61,7 @@ class AsyncSamplesOptimizer(PolicyOptimizer): "{} vs {}".format(num_data_loader_buffers, minibatch_buffer_size)) self.learner = TFMultiGPULearner( - self.local_evaluator, + self.workers.local_worker(), lr=lr, num_gpus=num_gpus, train_batch_size=train_batch_size, @@ -72,7 +71,7 @@ class AsyncSamplesOptimizer(PolicyOptimizer): learner_queue_size=learner_queue_size, _fake_gpus=_fake_gpus) else: - self.learner = LearnerThread(self.local_evaluator, + self.learner = LearnerThread(self.workers.local_worker(), minibatch_buffer_size, num_sgd_iter, learner_queue_size) self.learner.start() @@ -84,8 +83,7 @@ class AsyncSamplesOptimizer(PolicyOptimizer): if num_aggregation_workers > 0: self.aggregator = TreeAggregator( - self.local_evaluator, - self.remote_evaluators, + workers, num_aggregation_workers, replay_proportion=replay_proportion, max_sample_requests_in_flight_per_worker=( @@ -96,8 +94,7 @@ class AsyncSamplesOptimizer(PolicyOptimizer): broadcast_interval=broadcast_interval) else: self.aggregator = SimpleAggregator( - self.local_evaluator, - self.remote_evaluators, + workers, replay_proportion=replay_proportion, max_sample_requests_in_flight_per_worker=( max_sample_requests_in_flight_per_worker), @@ -127,7 +124,7 @@ class AsyncSamplesOptimizer(PolicyOptimizer): @override(PolicyOptimizer) def step(self): - if len(self.remote_evaluators) == 0: + if 
len(self.workers.remote_workers()) == 0: raise ValueError("Config num_workers=0 means training will hang!") assert self.learner.is_alive() with self._optimizer_step_timer: @@ -146,9 +143,9 @@ class AsyncSamplesOptimizer(PolicyOptimizer): self.learner.stopped = True @override(PolicyOptimizer) - def reset(self, remote_evaluators): - self.remote_evaluators = remote_evaluators - self.aggregator.reset(remote_evaluators) + def reset(self, remote_workers): + self.workers.reset(remote_workers) + self.aggregator.reset(remote_workers) @override(PolicyOptimizer) def stats(self): diff --git a/python/ray/rllib/optimizers/multi_gpu_optimizer.py b/python/ray/rllib/optimizers/multi_gpu_optimizer.py index a25553c40..65d7842d8 100644 --- a/python/ray/rllib/optimizers/multi_gpu_optimizer.py +++ b/python/ray/rllib/optimizers/multi_gpu_optimizer.py @@ -28,7 +28,7 @@ logger = logging.getLogger(__name__) class LocalMultiGPUOptimizer(PolicyOptimizer): """A synchronous optimizer that uses multiple local GPUs. - Samples are pulled synchronously from multiple remote evaluators, + Samples are pulled synchronously from multiple remote workers, concatenated, and then split across the memory of multiple local GPUs. A number of SGD passes are then taken over the in-memory data. For more details, see `multi_gpu_impl.LocalSyncParallelOptimizer`. 
@@ -42,8 +42,7 @@ class LocalMultiGPUOptimizer(PolicyOptimizer): """ def __init__(self, - local_evaluator, - remote_evaluators, + workers, sgd_batch_size=128, num_sgd_iter=10, sample_batch_size=200, @@ -52,7 +51,7 @@ class LocalMultiGPUOptimizer(PolicyOptimizer): num_gpus=0, standardize_fields=[], straggler_mitigation=False): - PolicyOptimizer.__init__(self, local_evaluator, remote_evaluators) + PolicyOptimizer.__init__(self, workers) self.batch_size = sgd_batch_size self.num_sgd_iter = num_sgd_iter @@ -79,8 +78,8 @@ class LocalMultiGPUOptimizer(PolicyOptimizer): logger.info("LocalMultiGPUOptimizer devices {}".format(self.devices)) - self.policies = dict( - self.local_evaluator.foreach_trainable_policy(lambda p, i: (i, p))) + self.policies = dict(self.workers.local_worker() + .foreach_trainable_policy(lambda p, i: (i, p))) logger.debug("Policies to train: {}".format(self.policies)) for policy_id, policy in self.policies.items(): if not isinstance(policy, TFPolicy): @@ -92,8 +91,8 @@ class LocalMultiGPUOptimizer(PolicyOptimizer): # reuse is set to AUTO_REUSE because Adam nodes are created after # all of the device copies are created. 
self.optimizers = {} - with self.local_evaluator.tf_sess.graph.as_default(): - with self.local_evaluator.tf_sess.as_default(): + with self.workers.local_worker().tf_sess.graph.as_default(): + with self.workers.local_worker().tf_sess.as_default(): for policy_id, policy in self.policies.items(): with tf.variable_scope(policy_id, reuse=tf.AUTO_REUSE): if policy._state_inputs: @@ -109,25 +108,25 @@ class LocalMultiGPUOptimizer(PolicyOptimizer): for _, v in policy._loss_inputs], rnn_inputs, self.per_device_batch_size, policy.copy)) - self.sess = self.local_evaluator.tf_sess + self.sess = self.workers.local_worker().tf_sess self.sess.run(tf.global_variables_initializer()) @override(PolicyOptimizer) def step(self): with self.update_weights_timer: - if self.remote_evaluators: - weights = ray.put(self.local_evaluator.get_weights()) - for e in self.remote_evaluators: + if self.workers.remote_workers(): + weights = ray.put(self.workers.local_worker().get_weights()) + for e in self.workers.remote_workers(): e.set_weights.remote(weights) with self.sample_timer: - if self.remote_evaluators: + if self.workers.remote_workers(): if self.straggler_mitigation: samples = collect_samples_straggler_mitigation( - self.remote_evaluators, self.train_batch_size) + self.workers.remote_workers(), self.train_batch_size) else: samples = collect_samples( - self.remote_evaluators, self.sample_batch_size, + self.workers.remote_workers(), self.sample_batch_size, self.num_envs_per_worker, self.train_batch_size) if samples.count > self.train_batch_size * 2: logger.info( @@ -139,7 +138,7 @@ class LocalMultiGPUOptimizer(PolicyOptimizer): else: samples = [] while sum(s.count for s in samples) < self.train_batch_size: - samples.append(self.local_evaluator.sample()) + samples.append(self.workers.local_worker().sample()) samples = SampleBatch.concat_samples(samples) # Handle everything as if multiagent diff --git a/python/ray/rllib/optimizers/policy_optimizer.py 
b/python/ray/rllib/optimizers/policy_optimizer.py index f67ea9cdc..29287e964 100644 --- a/python/ray/rllib/optimizers/policy_optimizer.py +++ b/python/ray/rllib/optimizers/policy_optimizer.py @@ -6,7 +6,6 @@ import logging from ray.rllib.utils.annotations import DeveloperAPI from ray.rllib.evaluation.metrics import collect_episodes, summarize_episodes -from ray.rllib.utils.memory import ray_get_and_free logger = logging.getLogger(__name__) @@ -21,34 +20,21 @@ class PolicyOptimizer(object): used for PPO. These optimizers are all pluggable, and it is possible to mix and match as needed. - In order for an algorithm to use an RLlib optimizer, it must implement - the PolicyEvaluator interface and pass a PolicyEvaluator class or set of - PolicyEvaluators to its PolicyOptimizer of choice. The PolicyOptimizer - uses these Evaluators to sample from the environment and compute model - gradient updates. - Attributes: config (dict): The JSON configuration passed to this optimizer. - local_evaluator (PolicyEvaluator): The embedded evaluator instance. - remote_evaluators (list): List of remote evaluator replicas, or []. + workers (WorkerSet): The set of rollout workers to use. num_steps_trained (int): Number of timesteps trained on so far. num_steps_sampled (int): Number of timesteps sampled so far. - evaluator_resources (dict): Optional resource requests to set for - evaluators created by this optimizer. """ @DeveloperAPI - def __init__(self, local_evaluator, remote_evaluators=None): + def __init__(self, workers): """Create an optimizer instance. Args: - local_evaluator (Evaluator): Local evaluator instance, required. - remote_evaluators (list): A list of Ray actor handles to remote - evaluators instances. If empty, the optimizer should fall back - to using only the local evaluator. + workers (WorkerSet): The set of rollout workers to use. 
""" - self.local_evaluator = local_evaluator - self.remote_evaluators = remote_evaluators or [] + self.workers = workers self.episode_history = [] # Counters that should be updated by sub-classes @@ -100,23 +86,23 @@ class PolicyOptimizer(object): def collect_metrics(self, timeout_seconds, min_history=100, - selected_evaluators=None): - """Returns evaluator and optimizer stats. + selected_workers=None): + """Returns worker and optimizer stats. Arguments: - timeout_seconds (int): Max wait time for a evaluator before - dropping its results. This usually indicates a hung evaluator. + timeout_seconds (int): Max wait time for a worker before + dropping its results. This usually indicates a hung worker. min_history (int): Min history length to smooth results over. - selected_evaluators (list): Override the list of remote evaluators + selected_workers (list): Override the list of remote workers to collect metrics from. Returns: - res (dict): A training result dict from evaluator metrics with + res (dict): A training result dict from worker metrics with `info` replaced with stats from self. 
""" episodes, num_dropped = collect_episodes( - self.local_evaluator, - selected_evaluators or self.remote_evaluators, + self.workers.local_worker(), + selected_workers or self.workers.remote_workers(), timeout_seconds=timeout_seconds) orig_episodes = list(episodes) missing = min_history - len(episodes) @@ -130,30 +116,28 @@ class PolicyOptimizer(object): return res @DeveloperAPI - def reset(self, remote_evaluators): - """Called to change the set of remote evaluators being used.""" - - self.remote_evaluators = remote_evaluators + def reset(self, remote_workers): + """Called to change the set of remote workers being used.""" + self.workers.reset(remote_workers) @DeveloperAPI - def foreach_evaluator(self, func): - """Apply the given function to each evaluator instance.""" - - local_result = [func(self.local_evaluator)] - remote_results = ray_get_and_free( - [ev.apply.remote(func) for ev in self.remote_evaluators]) - return local_result + remote_results + def foreach_worker(self, func): + """Apply the given function to each worker instance.""" + return self.workers.foreach_worker(func) @DeveloperAPI - def foreach_evaluator_with_index(self, func): - """Apply the given function to each evaluator instance. + def foreach_worker_with_index(self, func): + """Apply the given function to each worker instance. The index will be passed as the second arg to the given function. 
""" + return self.workers.foreach_worker_with_index(func) - local_result = [func(self.local_evaluator, 0)] - remote_results = ray_get_and_free([ - ev.apply.remote(func, i + 1) - for i, ev in enumerate(self.remote_evaluators) - ]) - return local_result + remote_results + def foreach_evaluator(self, func): + raise DeprecationWarning( + "foreach_evaluator has been renamed to foreach_worker") + + def foreach_evaluator_with_index(self, func): + raise DeprecationWarning( + "foreach_evaluator_with_index has been renamed to " + "foreach_worker_with_index") diff --git a/python/ray/rllib/optimizers/sync_batch_replay_optimizer.py b/python/ray/rllib/optimizers/sync_batch_replay_optimizer.py index e13d71c6e..e2b4865da 100644 --- a/python/ray/rllib/optimizers/sync_batch_replay_optimizer.py +++ b/python/ray/rllib/optimizers/sync_batch_replay_optimizer.py @@ -20,12 +20,11 @@ class SyncBatchReplayOptimizer(PolicyOptimizer): This enables RNN support. Does not currently support prioritization.""" def __init__(self, - local_evaluator, - remote_evaluators, + workers, learning_starts=1000, buffer_size=10000, train_batch_size=32): - PolicyOptimizer.__init__(self, local_evaluator, remote_evaluators) + PolicyOptimizer.__init__(self, workers) self.replay_starts = learning_starts self.max_buffer_size = buffer_size @@ -45,17 +44,17 @@ class SyncBatchReplayOptimizer(PolicyOptimizer): @override(PolicyOptimizer) def step(self): with self.update_weights_timer: - if self.remote_evaluators: - weights = ray.put(self.local_evaluator.get_weights()) - for e in self.remote_evaluators: + if self.workers.remote_workers(): + weights = ray.put(self.workers.local_worker().get_weights()) + for e in self.workers.remote_workers(): e.set_weights.remote(weights) with self.sample_timer: - if self.remote_evaluators: + if self.workers.remote_workers(): batches = ray_get_and_free( - [e.sample.remote() for e in self.remote_evaluators]) + [e.sample.remote() for e in self.workers.remote_workers()]) else: - batches = 
[self.local_evaluator.sample()] + batches = [self.workers.local_worker().sample()] # Handle everything as if multiagent tmp = [] @@ -105,7 +104,7 @@ class SyncBatchReplayOptimizer(PolicyOptimizer): samples.append(random.choice(self.replay_buffer)) samples = SampleBatch.concat_samples(samples) with self.grad_timer: - info_dict = self.local_evaluator.learn_on_batch(samples) + info_dict = self.workers.local_worker().learn_on_batch(samples) for policy_id, info in info_dict.items(): self.learner_stats[policy_id] = get_learner_stats(info) self.grad_timer.push_units_processed(samples.count) diff --git a/python/ray/rllib/optimizers/sync_replay_optimizer.py b/python/ray/rllib/optimizers/sync_replay_optimizer.py index 27858f352..881e02f90 100644 --- a/python/ray/rllib/optimizers/sync_replay_optimizer.py +++ b/python/ray/rllib/optimizers/sync_replay_optimizer.py @@ -25,13 +25,12 @@ logger = logging.getLogger(__name__) class SyncReplayOptimizer(PolicyOptimizer): """Variant of the local sync optimizer that supports replay (for DQN). - This optimizer requires that policy evaluators return an additional + This optimizer requires that rollout workers return an additional "td_error" array in the info return of compute_gradients(). 
This error term will be used for sample prioritization.""" def __init__(self, - local_evaluator, - remote_evaluators, + workers, learning_starts=1000, buffer_size=10000, prioritized_replay=True, @@ -43,7 +42,7 @@ class SyncReplayOptimizer(PolicyOptimizer): prioritized_replay_eps=1e-6, train_batch_size=32, sample_batch_size=4): - PolicyOptimizer.__init__(self, local_evaluator, remote_evaluators) + PolicyOptimizer.__init__(self, workers) self.replay_starts = learning_starts # linearly annealing beta used in Rainbow paper @@ -82,18 +81,20 @@ class SyncReplayOptimizer(PolicyOptimizer): @override(PolicyOptimizer) def step(self): with self.update_weights_timer: - if self.remote_evaluators: - weights = ray.put(self.local_evaluator.get_weights()) - for e in self.remote_evaluators: + if self.workers.remote_workers(): + weights = ray.put(self.workers.local_worker().get_weights()) + for e in self.workers.remote_workers(): e.set_weights.remote(weights) with self.sample_timer: - if self.remote_evaluators: + if self.workers.remote_workers(): batch = SampleBatch.concat_samples( - ray_get_and_free( - [e.sample.remote() for e in self.remote_evaluators])) + ray_get_and_free([ + e.sample.remote() + for e in self.workers.remote_workers() + ])) else: - batch = self.local_evaluator.sample() + batch = self.workers.local_worker().sample() # Handle everything as if multiagent if isinstance(batch, SampleBatch): @@ -135,7 +136,7 @@ class SyncReplayOptimizer(PolicyOptimizer): samples = self._replay() with self.grad_timer: - info_dict = self.local_evaluator.learn_on_batch(samples) + info_dict = self.workers.local_worker().learn_on_batch(samples) for policy_id, info in info_dict.items(): self.learner_stats[policy_id] = get_learner_stats(info) replay_buffer = self.replay_buffers[policy_id] diff --git a/python/ray/rllib/optimizers/sync_samples_optimizer.py b/python/ray/rllib/optimizers/sync_samples_optimizer.py index a49b290d3..0f79062a3 100644 --- 
a/python/ray/rllib/optimizers/sync_samples_optimizer.py +++ b/python/ray/rllib/optimizers/sync_samples_optimizer.py @@ -19,16 +19,12 @@ class SyncSamplesOptimizer(PolicyOptimizer): """A simple synchronous RL optimizer. In each step, this optimizer pulls samples from a number of remote - evaluators, concatenates them, and then updates a local model. The updated - model weights are then broadcast to all remote evaluators. + workers, concatenates them, and then updates a local model. The updated + model weights are then broadcast to all remote workers. """ - def __init__(self, - local_evaluator, - remote_evaluators, - num_sgd_iter=1, - train_batch_size=1): - PolicyOptimizer.__init__(self, local_evaluator, remote_evaluators) + def __init__(self, workers, num_sgd_iter=1, train_batch_size=1): + PolicyOptimizer.__init__(self, workers) self.update_weights_timer = TimerStat() self.sample_timer = TimerStat() @@ -41,27 +37,28 @@ class SyncSamplesOptimizer(PolicyOptimizer): @override(PolicyOptimizer) def step(self): with self.update_weights_timer: - if self.remote_evaluators: - weights = ray.put(self.local_evaluator.get_weights()) - for e in self.remote_evaluators: + if self.workers.remote_workers(): + weights = ray.put(self.workers.local_worker().get_weights()) + for e in self.workers.remote_workers(): e.set_weights.remote(weights) with self.sample_timer: samples = [] while sum(s.count for s in samples) < self.train_batch_size: - if self.remote_evaluators: + if self.workers.remote_workers(): samples.extend( ray_get_and_free([ - e.sample.remote() for e in self.remote_evaluators + e.sample.remote() + for e in self.workers.remote_workers() ])) else: - samples.append(self.local_evaluator.sample()) + samples.append(self.workers.local_worker().sample()) samples = SampleBatch.concat_samples(samples) self.sample_timer.push_units_processed(samples.count) with self.grad_timer: for i in range(self.num_sgd_iter): - fetches = self.local_evaluator.learn_on_batch(samples) + fetches = 
self.workers.local_worker().learn_on_batch(samples) self.learner_stats = get_learner_stats(fetches) if self.num_sgd_iter > 1: logger.debug("{} {}".format(i, fetches)) diff --git a/python/ray/rllib/policy/dynamic_tf_policy.py b/python/ray/rllib/policy/dynamic_tf_policy.py index afa72a0af..0240f275d 100644 --- a/python/ray/rllib/policy/dynamic_tf_policy.py +++ b/python/ray/rllib/policy/dynamic_tf_policy.py @@ -142,7 +142,7 @@ class DynamicTFPolicy(TFPolicy): action_prob = self.action_dist.sampled_action_prob() # Phase 1 init - sess = tf.get_default_session() + sess = tf.get_default_session() or tf.Session() if get_batch_divisibility_req: batch_divisibility_req = get_batch_divisibility_req(self) else: diff --git a/python/ray/rllib/policy/policy.py b/python/ray/rllib/policy/policy.py index 6f456e608..e12cafef2 100644 --- a/python/ray/rllib/policy/policy.py +++ b/python/ray/rllib/policy/policy.py @@ -36,7 +36,7 @@ class Policy(object): """Initialize the graph. This is the standard constructor for policies. The policy - class you pass into PolicyEvaluator will be constructed with + class you pass into RolloutWorker will be constructed with these arguments. 
Args: diff --git a/python/ray/rllib/policy/tf_policy_template.py b/python/ray/rllib/policy/tf_policy_template.py index 7f10958cd..b7f33fcb0 100644 --- a/python/ray/rllib/policy/tf_policy_template.py +++ b/python/ray/rllib/policy/tf_policy_template.py @@ -88,9 +88,7 @@ def build_tf_policy(name, a DynamicTFPolicy instance that uses the specified args """ - if not name.endswith("TFPolicy"): - raise ValueError("Name should match *TFPolicy", name) - + original_kwargs = locals().copy() base = DynamicTFPolicy while mixins: @@ -191,6 +189,11 @@ def build_tf_policy(name, else: return TFPolicy.extra_compute_grad_feed_dict(self) + @staticmethod + def with_updates(**overrides): + return build_tf_policy(**dict(original_kwargs, **overrides)) + + policy_cls.with_updates = with_updates policy_cls.__name__ = name policy_cls.__qualname__ = name return policy_cls diff --git a/python/ray/rllib/policy/torch_policy_template.py b/python/ray/rllib/policy/torch_policy_template.py index 19e943600..1f4185f9c 100644 --- a/python/ray/rllib/policy/torch_policy_template.py +++ b/python/ray/rllib/policy/torch_policy_template.py @@ -24,7 +24,7 @@ def build_torch_policy(name, """Helper function for creating a torch policy at runtime. 
Arguments: - name (str): name of the policy (e.g., "PPOTFPolicy") + name (str): name of the policy (e.g., "PPOTorchPolicy") loss_fn (func): function that returns a loss tensor the policy, and dict of experience tensor placeholders get_default_config (func): optional function that returns the default @@ -55,9 +55,7 @@ def build_torch_policy(name, a TorchPolicy instance that uses the specified args """ - if not name.endswith("TorchPolicy"): - raise ValueError("Name should match *TorchPolicy", name) - + original_kwargs = locals().copy() base = TorchPolicy while mixins: @@ -66,7 +64,7 @@ def build_torch_policy(name, base = new_base - class graph_cls(base): + class policy_cls(base): def __init__(self, obs_space, action_space, config): if get_default_config: config = dict(get_default_config(), **config) @@ -130,6 +128,11 @@ def build_torch_policy(name, else: return TorchPolicy.extra_grad_info(self, batch_tensors) - graph_cls.__name__ = name - graph_cls.__qualname__ = name - return graph_cls + @staticmethod + def with_updates(**overrides): + return build_torch_policy(**dict(original_kwargs, **overrides)) + + policy_cls.with_updates = with_updates + policy_cls.__name__ = name + policy_cls.__qualname__ = name + return policy_cls diff --git a/python/ray/rllib/rollout.py b/python/ray/rllib/rollout.py index efa5743c0..d8292739f 100755 --- a/python/ray/rllib/rollout.py +++ b/python/ray/rllib/rollout.py @@ -120,14 +120,14 @@ def default_policy_agent_mapping(unused_agent_id): def rollout(agent, env_name, num_steps, out=None, no_render=True): policy_agent_mapping = default_policy_agent_mapping - if hasattr(agent, "local_evaluator"): - env = agent.local_evaluator.env + if hasattr(agent, "workers"): + env = agent.workers.local_worker().env multiagent = isinstance(env, MultiAgentEnv) - if agent.local_evaluator.multiagent: + if agent.workers.local_worker().multiagent: policy_agent_mapping = agent.config["multiagent"][ "policy_mapping_fn"] - policy_map = 
agent.local_evaluator.policy_map + policy_map = agent.workers.local_worker().policy_map state_init = {p: m.get_initial_state() for p, m in policy_map.items()} use_lstm = {p: len(s) > 0 for p, s in state_init.items()} action_init = { diff --git a/python/ray/rllib/tests/mock_evaluator.py b/python/ray/rllib/tests/mock_worker.py similarity index 98% rename from python/ray/rllib/tests/mock_evaluator.py rename to python/ray/rllib/tests/mock_worker.py index e11b097e7..b6b2e9773 100644 --- a/python/ray/rllib/tests/mock_evaluator.py +++ b/python/ray/rllib/tests/mock_worker.py @@ -8,7 +8,7 @@ from ray.rllib.evaluation import SampleBatch from ray.rllib.utils.filter import MeanStdFilter -class _MockEvaluator(object): +class _MockWorker(object): def __init__(self, sample_count=10): self._weights = np.array([-10, -10, -10, -10]) self._grad = np.array([1, 1, 1, 1]) diff --git a/python/ray/rllib/tests/test_external_env.py b/python/ray/rllib/tests/test_external_env.py index 3b2158959..24281e757 100644 --- a/python/ray/rllib/tests/test_external_env.py +++ b/python/ray/rllib/tests/test_external_env.py @@ -11,10 +11,10 @@ import uuid import ray from ray.rllib.agents.dqn import DQNTrainer from ray.rllib.agents.pg import PGTrainer -from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator +from ray.rllib.evaluation.rollout_worker import RolloutWorker from ray.rllib.env.external_env import ExternalEnv -from ray.rllib.tests.test_policy_evaluator import (BadPolicy, MockPolicy, - MockEnv) +from ray.rllib.tests.test_rollout_worker import (BadPolicy, MockPolicy, + MockEnv) from ray.tune.registry import register_env @@ -119,7 +119,7 @@ class MultiServing(ExternalEnv): class TestExternalEnv(unittest.TestCase): def testExternalEnvCompleteEpisodes(self): - ev = PolicyEvaluator( + ev = RolloutWorker( env_creator=lambda _: SimpleServing(MockEnv(25)), policy=MockPolicy, batch_steps=40, @@ -129,7 +129,7 @@ class TestExternalEnv(unittest.TestCase): self.assertEqual(batch.count, 50) def 
testExternalEnvTruncateEpisodes(self): - ev = PolicyEvaluator( + ev = RolloutWorker( env_creator=lambda _: SimpleServing(MockEnv(25)), policy=MockPolicy, batch_steps=40, @@ -139,7 +139,7 @@ class TestExternalEnv(unittest.TestCase): self.assertEqual(batch.count, 40) def testExternalEnvOffPolicy(self): - ev = PolicyEvaluator( + ev = RolloutWorker( env_creator=lambda _: SimpleOffPolicyServing(MockEnv(25), 42), policy=MockPolicy, batch_steps=40, @@ -151,7 +151,7 @@ class TestExternalEnv(unittest.TestCase): self.assertEqual(batch["actions"][-1], 42) def testExternalEnvBadActions(self): - ev = PolicyEvaluator( + ev = RolloutWorker( env_creator=lambda _: SimpleServing(MockEnv(25)), policy=BadPolicy, sample_async=True, @@ -196,7 +196,7 @@ class TestExternalEnv(unittest.TestCase): raise Exception("failed to improve reward") def testExternalEnvHorizonNotSupported(self): - ev = PolicyEvaluator( + ev = RolloutWorker( env_creator=lambda _: SimpleServing(MockEnv(25)), policy=MockPolicy, episode_horizon=20, diff --git a/python/ray/rllib/tests/test_external_multi_agent_env.py b/python/ray/rllib/tests/test_external_multi_agent_env.py index fcb3de634..be232c0bf 100644 --- a/python/ray/rllib/tests/test_external_multi_agent_env.py +++ b/python/ray/rllib/tests/test_external_multi_agent_env.py @@ -10,9 +10,10 @@ import unittest import ray from ray.rllib.agents.pg.pg_policy import PGTFPolicy from ray.rllib.optimizers import SyncSamplesOptimizer -from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator +from ray.rllib.evaluation.rollout_worker import RolloutWorker +from ray.rllib.evaluation.worker_set import WorkerSet from ray.rllib.env.external_multi_agent_env import ExternalMultiAgentEnv -from ray.rllib.tests.test_policy_evaluator import MockPolicy +from ray.rllib.tests.test_rollout_worker import MockPolicy from ray.rllib.tests.test_external_env import make_simple_serving from ray.rllib.tests.test_multi_agent_env import BasicMultiAgent, MultiCartpole from 
ray.rllib.evaluation.metrics import collect_metrics @@ -23,7 +24,7 @@ SimpleMultiServing = make_simple_serving(True, ExternalMultiAgentEnv) class TestExternalMultiAgentEnv(unittest.TestCase): def testExternalMultiAgentEnvCompleteEpisodes(self): agents = 4 - ev = PolicyEvaluator( + ev = RolloutWorker( env_creator=lambda _: SimpleMultiServing(BasicMultiAgent(agents)), policy=MockPolicy, batch_steps=40, @@ -35,7 +36,7 @@ class TestExternalMultiAgentEnv(unittest.TestCase): def testExternalMultiAgentEnvTruncateEpisodes(self): agents = 4 - ev = PolicyEvaluator( + ev = RolloutWorker( env_creator=lambda _: SimpleMultiServing(BasicMultiAgent(agents)), policy=MockPolicy, batch_steps=40, @@ -49,7 +50,7 @@ class TestExternalMultiAgentEnv(unittest.TestCase): agents = 2 act_space = gym.spaces.Discrete(2) obs_space = gym.spaces.Discrete(2) - ev = PolicyEvaluator( + ev = RolloutWorker( env_creator=lambda _: SimpleMultiServing(BasicMultiAgent(agents)), policy={ "p0": (MockPolicy, obs_space, act_space, {}), @@ -70,12 +71,12 @@ class TestExternalMultiAgentEnv(unittest.TestCase): policies["pg_{}".format(i)] = (PGTFPolicy, obs_space, act_space, {}) policy_ids = list(policies.keys()) - ev = PolicyEvaluator( + ev = RolloutWorker( env_creator=lambda _: MultiCartpole(n), policy=policies, policy_mapping_fn=lambda agent_id: random.choice(policy_ids), batch_steps=100) - optimizer = SyncSamplesOptimizer(ev, []) + optimizer = SyncSamplesOptimizer(WorkerSet._from_existing(ev)) for i in range(100): optimizer.step() result = collect_metrics(ev) diff --git a/python/ray/rllib/tests/test_filters.py b/python/ray/rllib/tests/test_filters.py index f039c6c09..1446809eb 100644 --- a/python/ray/rllib/tests/test_filters.py +++ b/python/ray/rllib/tests/test_filters.py @@ -8,7 +8,7 @@ import numpy as np import ray from ray.rllib.utils.filter import RunningStat, MeanStdFilter from ray.rllib.utils import FilterManager -from ray.rllib.tests.mock_evaluator import _MockEvaluator +from ray.rllib.tests.mock_worker 
import _MockWorker class RunningStatTest(unittest.TestCase): @@ -89,8 +89,8 @@ class FilterManagerTest(unittest.TestCase): filt1.clear_buffer() self.assertEqual(filt1.buffer.n, 0) - RemoteEvaluator = ray.remote(_MockEvaluator) - remote_e = RemoteEvaluator.remote(sample_count=10) + RemoteWorker = ray.remote(_MockWorker) + remote_e = RemoteWorker.remote(sample_count=10) remote_e.sample.remote() FilterManager.synchronize({ diff --git a/python/ray/rllib/tests/test_multi_agent_env.py b/python/ray/rllib/tests/test_multi_agent_env.py index be4bfcd34..e69ba6b1f 100644 --- a/python/ray/rllib/tests/test_multi_agent_env.py +++ b/python/ray/rllib/tests/test_multi_agent_env.py @@ -12,11 +12,11 @@ from ray.rllib.agents.pg.pg_policy import PGTFPolicy from ray.rllib.agents.dqn.dqn_policy import DQNTFPolicy from ray.rllib.optimizers import (SyncSamplesOptimizer, SyncReplayOptimizer, AsyncGradientsOptimizer) -from ray.rllib.tests.test_policy_evaluator import (MockEnv, MockEnv2, - MockPolicy) -from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator +from ray.rllib.tests.test_rollout_worker import (MockEnv, MockEnv2, MockPolicy) +from ray.rllib.evaluation.rollout_worker import RolloutWorker from ray.rllib.policy.policy import Policy from ray.rllib.evaluation.metrics import collect_metrics +from ray.rllib.evaluation.worker_set import WorkerSet from ray.rllib.env.base_env import _MultiAgentEnvToBaseEnv from ray.rllib.env.multi_agent_env import MultiAgentEnv from ray.tune.registry import register_env @@ -327,7 +327,7 @@ class TestMultiAgentEnv(unittest.TestCase): def testMultiAgentSample(self): act_space = gym.spaces.Discrete(2) obs_space = gym.spaces.Discrete(2) - ev = PolicyEvaluator( + ev = RolloutWorker( env_creator=lambda _: BasicMultiAgent(5), policy={ "p0": (MockPolicy, obs_space, act_space, {}), @@ -345,7 +345,7 @@ class TestMultiAgentEnv(unittest.TestCase): def testMultiAgentSampleSyncRemote(self): act_space = gym.spaces.Discrete(2) obs_space = gym.spaces.Discrete(2) - 
ev = PolicyEvaluator( + ev = RolloutWorker( env_creator=lambda _: BasicMultiAgent(5), policy={ "p0": (MockPolicy, obs_space, act_space, {}), @@ -362,7 +362,7 @@ class TestMultiAgentEnv(unittest.TestCase): def testMultiAgentSampleAsyncRemote(self): act_space = gym.spaces.Discrete(2) obs_space = gym.spaces.Discrete(2) - ev = PolicyEvaluator( + ev = RolloutWorker( env_creator=lambda _: BasicMultiAgent(5), policy={ "p0": (MockPolicy, obs_space, act_space, {}), @@ -378,7 +378,7 @@ class TestMultiAgentEnv(unittest.TestCase): def testMultiAgentSampleWithHorizon(self): act_space = gym.spaces.Discrete(2) obs_space = gym.spaces.Discrete(2) - ev = PolicyEvaluator( + ev = RolloutWorker( env_creator=lambda _: BasicMultiAgent(5), policy={ "p0": (MockPolicy, obs_space, act_space, {}), @@ -393,7 +393,7 @@ class TestMultiAgentEnv(unittest.TestCase): def testSampleFromEarlyDoneEnv(self): act_space = gym.spaces.Discrete(2) obs_space = gym.spaces.Discrete(2) - ev = PolicyEvaluator( + ev = RolloutWorker( env_creator=lambda _: EarlyDoneMultiAgent(), policy={ "p0": (MockPolicy, obs_space, act_space, {}), @@ -409,7 +409,7 @@ class TestMultiAgentEnv(unittest.TestCase): def testMultiAgentSampleRoundRobin(self): act_space = gym.spaces.Discrete(2) obs_space = gym.spaces.Discrete(10) - ev = PolicyEvaluator( + ev = RolloutWorker( env_creator=lambda _: RoundRobinMultiAgent(5, increment_obs=True), policy={ "p0": (MockPolicy, obs_space, act_space, {}), @@ -458,7 +458,7 @@ class TestMultiAgentEnv(unittest.TestCase): def get_initial_state(self): return [{}] # empty dict - ev = PolicyEvaluator( + ev = RolloutWorker( env_creator=lambda _: gym.make("CartPole-v0"), policy=StatefulPolicy, batch_steps=5) @@ -503,7 +503,7 @@ class TestMultiAgentEnv(unittest.TestCase): single_env = gym.make("CartPole-v0") obs_space = single_env.observation_space act_space = single_env.action_space - ev = PolicyEvaluator( + ev = RolloutWorker( env_creator=lambda _: MultiCartpole(2), policy={ "p0": (ModelBasedPolicy, 
obs_space, act_space, {}), @@ -587,7 +587,7 @@ class TestMultiAgentEnv(unittest.TestCase): "p1": (PGTFPolicy, obs_space, act_space, {}), "p2": (DQNTFPolicy, obs_space, act_space, dqn_config), } - ev = PolicyEvaluator( + worker = RolloutWorker( env_creator=lambda _: MultiCartpole(n), policy=policies, policy_mapping_fn=lambda agent_id: ["p1", "p2"][agent_id % 2], @@ -597,29 +597,30 @@ class TestMultiAgentEnv(unittest.TestCase): def policy_mapper(agent_id): return ["p1", "p2"][agent_id % 2] - remote_evs = [ - PolicyEvaluator.as_remote().remote( + remote_workers = [ + RolloutWorker.as_remote().remote( env_creator=lambda _: MultiCartpole(n), policy=policies, policy_mapping_fn=policy_mapper, batch_steps=50) ] else: - remote_evs = [] - optimizer = optimizer_cls(ev, remote_evs) + remote_workers = [] + workers = WorkerSet._from_existing(worker, remote_workers) + optimizer = optimizer_cls(workers) for i in range(200): - ev.foreach_policy(lambda p, _: p.set_epsilon( + worker.foreach_policy(lambda p, _: p.set_epsilon( max(0.02, 1 - i * .02)) if isinstance(p, DQNTFPolicy) else None) optimizer.step() - result = collect_metrics(ev, remote_evs) + result = collect_metrics(worker, remote_workers) if i % 20 == 0: def do_update(p): if isinstance(p, DQNTFPolicy): p.update_target() - ev.foreach_policy(lambda p, _: do_update(p)) + worker.foreach_policy(lambda p, _: do_update(p)) print("Iter {}, rew {}".format(i, result["policy_reward_mean"])) print("Total reward", result["episode_reward_mean"]) @@ -647,15 +648,16 @@ class TestMultiAgentEnv(unittest.TestCase): policies["pg_{}".format(i)] = (PGTFPolicy, obs_space, act_space, {}) policy_ids = list(policies.keys()) - ev = PolicyEvaluator( + worker = RolloutWorker( env_creator=lambda _: MultiCartpole(n), policy=policies, policy_mapping_fn=lambda agent_id: random.choice(policy_ids), batch_steps=100) - optimizer = SyncSamplesOptimizer(ev, []) + workers = WorkerSet._from_existing(worker, []) + optimizer = SyncSamplesOptimizer(workers) for i in 
range(100): optimizer.step() - result = collect_metrics(ev) + result = collect_metrics(worker) print("Iteration {}, rew {}".format(i, result["policy_reward_mean"])) print("Total reward", result["episode_reward_mean"]) diff --git a/python/ray/rllib/tests/test_optimizers.py b/python/ray/rllib/tests/test_optimizers.py index f851cfc33..a87a295cc 100644 --- a/python/ray/rllib/tests/test_optimizers.py +++ b/python/ray/rllib/tests/test_optimizers.py @@ -11,10 +11,11 @@ import ray from ray.rllib.agents.ppo import PPOTrainer from ray.rllib.agents.ppo.ppo_policy import PPOTFPolicy from ray.rllib.evaluation import SampleBatch -from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator +from ray.rllib.evaluation.rollout_worker import RolloutWorker +from ray.rllib.evaluation.worker_set import WorkerSet from ray.rllib.optimizers import AsyncGradientsOptimizer, AsyncSamplesOptimizer from ray.rllib.optimizers.aso_tree_aggregator import TreeAggregator -from ray.rllib.tests.mock_evaluator import _MockEvaluator +from ray.rllib.tests.mock_worker import _MockWorker from ray.rllib.utils import try_import_tf tf = try_import_tf() @@ -26,11 +27,11 @@ class AsyncOptimizerTest(unittest.TestCase): def testBasic(self): ray.init(num_cpus=4) - local = _MockEvaluator() - remotes = ray.remote(_MockEvaluator) - remote_evaluators = [remotes.remote() for i in range(5)] - test_optimizer = AsyncGradientsOptimizer( - local, remote_evaluators, grads_per_step=10) + local = _MockWorker() + remotes = ray.remote(_MockWorker) + remote_workers = [remotes.remote() for i in range(5)] + workers = WorkerSet._from_existing(local, remote_workers) + test_optimizer = AsyncGradientsOptimizer(workers, grads_per_step=10) test_optimizer.step() self.assertTrue(all(local.get_weights() == 0)) @@ -117,30 +118,28 @@ class AsyncSamplesOptimizerTest(unittest.TestCase): def testSimple(self): local, remotes = self._make_evs() - optimizer = AsyncSamplesOptimizer(local, remotes) + workers = WorkerSet._from_existing(local, 
remotes) + optimizer = AsyncSamplesOptimizer(workers) self._wait_for(optimizer, 1000, 1000) def testMultiGPU(self): local, remotes = self._make_evs() - optimizer = AsyncSamplesOptimizer( - local, remotes, num_gpus=2, _fake_gpus=True) + workers = WorkerSet._from_existing(local, remotes) + optimizer = AsyncSamplesOptimizer(workers, num_gpus=2, _fake_gpus=True) self._wait_for(optimizer, 1000, 1000) def testMultiGPUParallelLoad(self): local, remotes = self._make_evs() + workers = WorkerSet._from_existing(local, remotes) optimizer = AsyncSamplesOptimizer( - local, - remotes, - num_gpus=2, - num_data_loader_buffers=2, - _fake_gpus=True) + workers, num_gpus=2, num_data_loader_buffers=2, _fake_gpus=True) self._wait_for(optimizer, 1000, 1000) def testMultiplePasses(self): local, remotes = self._make_evs() + workers = WorkerSet._from_existing(local, remotes) optimizer = AsyncSamplesOptimizer( - local, - remotes, + workers, minibatch_buffer_size=10, num_sgd_iter=10, sample_batch_size=10, @@ -151,9 +150,9 @@ class AsyncSamplesOptimizerTest(unittest.TestCase): def testReplay(self): local, remotes = self._make_evs() + workers = WorkerSet._from_existing(local, remotes) optimizer = AsyncSamplesOptimizer( - local, - remotes, + workers, replay_buffer_num_slots=100, replay_proportion=10, sample_batch_size=10, @@ -168,9 +167,9 @@ class AsyncSamplesOptimizerTest(unittest.TestCase): def testReplayAndMultiplePasses(self): local, remotes = self._make_evs() + workers = WorkerSet._from_existing(local, remotes) optimizer = AsyncSamplesOptimizer( - local, - remotes, + workers, minibatch_buffer_size=10, num_sgd_iter=10, replay_buffer_num_slots=100, @@ -189,45 +188,43 @@ class AsyncSamplesOptimizerTest(unittest.TestCase): def testMultiTierAggregationBadConf(self): local, remotes = self._make_evs() + workers = WorkerSet._from_existing(local, remotes) aggregators = TreeAggregator.precreate_aggregators(4) - optimizer = AsyncSamplesOptimizer( - local, remotes, num_aggregation_workers=4) + optimizer 
= AsyncSamplesOptimizer(workers, num_aggregation_workers=4) self.assertRaises(ValueError, lambda: optimizer.aggregator.init(aggregators)) def testMultiTierAggregation(self): local, remotes = self._make_evs() + workers = WorkerSet._from_existing(local, remotes) aggregators = TreeAggregator.precreate_aggregators(1) - optimizer = AsyncSamplesOptimizer( - local, remotes, num_aggregation_workers=1) + optimizer = AsyncSamplesOptimizer(workers, num_aggregation_workers=1) optimizer.aggregator.init(aggregators) self._wait_for(optimizer, 1000, 1000) def testRejectBadConfigs(self): local, remotes = self._make_evs() + workers = WorkerSet._from_existing(local, remotes) self.assertRaises( ValueError, lambda: AsyncSamplesOptimizer( local, remotes, num_data_loader_buffers=2, minibatch_buffer_size=4)) optimizer = AsyncSamplesOptimizer( - local, - remotes, + workers, num_gpus=2, train_batch_size=100, sample_batch_size=50, _fake_gpus=True) self._wait_for(optimizer, 1000, 1000) optimizer = AsyncSamplesOptimizer( - local, - remotes, + workers, num_gpus=2, train_batch_size=100, sample_batch_size=25, _fake_gpus=True) self._wait_for(optimizer, 1000, 1000) optimizer = AsyncSamplesOptimizer( - local, - remotes, + workers, num_gpus=2, train_batch_size=100, sample_batch_size=74, @@ -238,12 +235,12 @@ class AsyncSamplesOptimizerTest(unittest.TestCase): def make_sess(): return tf.Session(config=tf.ConfigProto(device_count={"CPU": 2})) - local = PolicyEvaluator( + local = RolloutWorker( env_creator=lambda _: gym.make("CartPole-v0"), policy=PPOTFPolicy, tf_session_creator=make_sess) remotes = [ - PolicyEvaluator.as_remote().remote( + RolloutWorker.as_remote().remote( env_creator=lambda _: gym.make("CartPole-v0"), policy=PPOTFPolicy, tf_session_creator=make_sess) diff --git a/python/ray/rllib/tests/test_perf.py b/python/ray/rllib/tests/test_perf.py index e31530f44..6ed02a0ff 100644 --- a/python/ray/rllib/tests/test_perf.py +++ b/python/ray/rllib/tests/test_perf.py @@ -7,8 +7,8 @@ import time 
import unittest import ray -from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator -from ray.rllib.tests.test_policy_evaluator import MockPolicy +from ray.rllib.evaluation.rollout_worker import RolloutWorker +from ray.rllib.tests.test_rollout_worker import MockPolicy class TestPerf(unittest.TestCase): @@ -17,7 +17,7 @@ class TestPerf(unittest.TestCase): # 03/01/19: Samples per second 8610.164353268685 def testBaselinePerformance(self): for _ in range(20): - ev = PolicyEvaluator( + ev = RolloutWorker( env_creator=lambda _: gym.make("CartPole-v0"), policy=MockPolicy, batch_steps=100) diff --git a/python/ray/rllib/tests/test_policy_evaluator.py b/python/ray/rllib/tests/test_rollout_worker.py similarity index 94% rename from python/ray/rllib/tests/test_policy_evaluator.py rename to python/ray/rllib/tests/test_rollout_worker.py index dc0dcaff6..45b2fa015 100644 --- a/python/ray/rllib/tests/test_policy_evaluator.py +++ b/python/ray/rllib/tests/test_rollout_worker.py @@ -12,7 +12,7 @@ from collections import Counter import ray from ray.rllib.agents.pg import PGTrainer from ray.rllib.agents.a3c import A2CTrainer -from ray.rllib.evaluation.policy_evaluator import PolicyEvaluator +from ray.rllib.evaluation.rollout_worker import RolloutWorker from ray.rllib.evaluation.metrics import collect_metrics from ray.rllib.policy.policy import Policy from ray.rllib.evaluation.postprocessing import compute_advantages @@ -129,9 +129,9 @@ class MockVectorEnv(VectorEnv): return self.envs -class TestPolicyEvaluator(unittest.TestCase): +class TestRolloutWorker(unittest.TestCase): def testBasic(self): - ev = PolicyEvaluator( + ev = RolloutWorker( env_creator=lambda _: gym.make("CartPole-v0"), policy=MockPolicy) batch = ev.sample() for key in [ @@ -155,7 +155,7 @@ class TestPolicyEvaluator(unittest.TestCase): self.assertGreater(batch["advantages"][0], 1) def testBatchIds(self): - ev = PolicyEvaluator( + ev = RolloutWorker( env_creator=lambda _: gym.make("CartPole-v0"), 
policy=MockPolicy) batch1 = ev.sample() batch2 = ev.sample() @@ -213,11 +213,10 @@ class TestPolicyEvaluator(unittest.TestCase): "sample_batch_size": 5, "num_envs_per_worker": 2, }) - results = pg.optimizer.foreach_evaluator( - lambda ev: ev.sample_batch_size) - results2 = pg.optimizer.foreach_evaluator_with_index( + results = pg.workers.foreach_worker(lambda ev: ev.sample_batch_size) + results2 = pg.workers.foreach_worker_with_index( lambda ev, i: (i, ev.sample_batch_size)) - results3 = pg.optimizer.foreach_evaluator( + results3 = pg.workers.foreach_worker( lambda ev: ev.foreach_env(lambda env: 1)) self.assertEqual(results, [10, 10, 10]) self.assertEqual(results2, [(0, 10), (1, 10), (2, 10)]) @@ -225,7 +224,7 @@ class TestPolicyEvaluator(unittest.TestCase): def testRewardClipping(self): # clipping on - ev = PolicyEvaluator( + ev = RolloutWorker( env_creator=lambda _: MockEnv2(episode_length=10), policy=MockPolicy, clip_rewards=True, @@ -235,7 +234,7 @@ class TestPolicyEvaluator(unittest.TestCase): self.assertEqual(result["episode_reward_mean"], 1000) # clipping off - ev2 = PolicyEvaluator( + ev2 = RolloutWorker( env_creator=lambda _: MockEnv2(episode_length=10), policy=MockPolicy, clip_rewards=False, @@ -245,7 +244,7 @@ class TestPolicyEvaluator(unittest.TestCase): self.assertEqual(result2["episode_reward_mean"], 1000) def testHardHorizon(self): - ev = PolicyEvaluator( + ev = RolloutWorker( env_creator=lambda _: MockEnv(episode_length=10), policy=MockPolicy, batch_mode="complete_episodes", @@ -259,7 +258,7 @@ class TestPolicyEvaluator(unittest.TestCase): self.assertEqual(sum(samples["dones"]), 3) def testSoftHorizon(self): - ev = PolicyEvaluator( + ev = RolloutWorker( env_creator=lambda _: MockEnv(episode_length=10), policy=MockPolicy, batch_mode="complete_episodes", @@ -273,11 +272,11 @@ class TestPolicyEvaluator(unittest.TestCase): self.assertEqual(sum(samples["dones"]), 1) def testMetrics(self): - ev = PolicyEvaluator( + ev = RolloutWorker( env_creator=lambda 
_: MockEnv(episode_length=10), policy=MockPolicy, batch_mode="complete_episodes") - remote_ev = PolicyEvaluator.as_remote().remote( + remote_ev = RolloutWorker.as_remote().remote( env_creator=lambda _: MockEnv(episode_length=10), policy=MockPolicy, batch_mode="complete_episodes") @@ -288,7 +287,7 @@ class TestPolicyEvaluator(unittest.TestCase): self.assertEqual(result["episode_reward_mean"], 10) def testAsync(self): - ev = PolicyEvaluator( + ev = RolloutWorker( env_creator=lambda _: gym.make("CartPole-v0"), sample_async=True, policy=MockPolicy) @@ -298,7 +297,7 @@ class TestPolicyEvaluator(unittest.TestCase): self.assertGreater(batch["advantages"][0], 1) def testAutoVectorization(self): - ev = PolicyEvaluator( + ev = RolloutWorker( env_creator=lambda cfg: MockEnv(episode_length=20, config=cfg), policy=MockPolicy, batch_mode="truncate_episodes", @@ -321,7 +320,7 @@ class TestPolicyEvaluator(unittest.TestCase): self.assertEqual(indices, [0, 1, 2, 3, 4, 5, 6, 7]) def testBatchesLargerWhenVectorized(self): - ev = PolicyEvaluator( + ev = RolloutWorker( env_creator=lambda _: MockEnv(episode_length=8), policy=MockPolicy, batch_mode="truncate_episodes", @@ -336,7 +335,7 @@ class TestPolicyEvaluator(unittest.TestCase): self.assertEqual(result["episodes_this_iter"], 4) def testVectorEnvSupport(self): - ev = PolicyEvaluator( + ev = RolloutWorker( env_creator=lambda _: MockVectorEnv(episode_length=20, num_envs=8), policy=MockPolicy, batch_mode="truncate_episodes", @@ -353,7 +352,7 @@ class TestPolicyEvaluator(unittest.TestCase): self.assertEqual(result["episodes_this_iter"], 8) def testTruncateEpisodes(self): - ev = PolicyEvaluator( + ev = RolloutWorker( env_creator=lambda _: MockEnv(10), policy=MockPolicy, batch_steps=15, @@ -362,7 +361,7 @@ class TestPolicyEvaluator(unittest.TestCase): self.assertEqual(batch.count, 15) def testCompleteEpisodes(self): - ev = PolicyEvaluator( + ev = RolloutWorker( env_creator=lambda _: MockEnv(10), policy=MockPolicy, batch_steps=5, @@ -371,7 
+370,7 @@ class TestPolicyEvaluator(unittest.TestCase): self.assertEqual(batch.count, 10) def testCompleteEpisodesPacking(self): - ev = PolicyEvaluator( + ev = RolloutWorker( env_creator=lambda _: MockEnv(10), policy=MockPolicy, batch_steps=15, @@ -383,7 +382,7 @@ class TestPolicyEvaluator(unittest.TestCase): [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]) def testFilterSync(self): - ev = PolicyEvaluator( + ev = RolloutWorker( env_creator=lambda _: gym.make("CartPole-v0"), policy=MockPolicy, sample_async=True, @@ -396,7 +395,7 @@ class TestPolicyEvaluator(unittest.TestCase): self.assertNotEqual(obs_f.buffer.n, 0) def testGetFilters(self): - ev = PolicyEvaluator( + ev = RolloutWorker( env_creator=lambda _: gym.make("CartPole-v0"), policy=MockPolicy, sample_async=True, @@ -411,7 +410,7 @@ class TestPolicyEvaluator(unittest.TestCase): self.assertGreaterEqual(obs_f2.buffer.n, obs_f.buffer.n) def testSyncFilter(self): - ev = PolicyEvaluator( + ev = RolloutWorker( env_creator=lambda _: gym.make("CartPole-v0"), policy=MockPolicy, sample_async=True, diff --git a/python/ray/rllib/utils/actors.py b/python/ray/rllib/utils/actors.py index b0e712f69..8907aa5c9 100644 --- a/python/ray/rllib/utils/actors.py +++ b/python/ray/rllib/utils/actors.py @@ -58,15 +58,15 @@ class TaskPool(object): remaining.append((worker, obj_id)) self._fetching = remaining - def reset_evaluators(self, evaluators): - """Notify that some evaluators may be removed.""" + def reset_workers(self, workers): + """Notify that some workers may be removed.""" for obj_id, ev in self._tasks.copy().items(): - if ev not in evaluators: + if ev not in workers: del self._tasks[obj_id] del self._objects[obj_id] ok = [] for ev, obj_id in self._fetching: - if ev in evaluators: + if ev in workers: ok.append((ev, obj_id)) self._fetching = ok