diff --git a/python/ray/rllib/a3c/a3c.py b/python/ray/rllib/a3c/a3c.py index b04351be3..c3813ac2d 100644 --- a/python/ray/rllib/a3c/a3c.py +++ b/python/ray/rllib/a3c/a3c.py @@ -7,7 +7,7 @@ import os import ray from ray.rllib.agent import Agent -from ray.rllib.optimizers import AsyncOptimizer +from ray.rllib.optimizers import AsyncGradientsOptimizer from ray.rllib.utils import FilterManager from ray.rllib.utils.common_policy_evaluator import CommonPolicyEvaluator, \ collect_metrics @@ -131,7 +131,7 @@ class A3CAgent(Agent): worker_index=i+1) for i in range(self.config["num_workers"])] - self.optimizer = AsyncOptimizer( + self.optimizer = AsyncGradientsOptimizer( self.config["optimizer"], self.local_evaluator, self.remote_evaluators) diff --git a/python/ray/rllib/bc/bc.py b/python/ray/rllib/bc/bc.py index 501f53521..1cc05e599 100644 --- a/python/ray/rllib/bc/bc.py +++ b/python/ray/rllib/bc/bc.py @@ -6,7 +6,7 @@ import ray from ray.rllib.agent import Agent from ray.rllib.bc.bc_evaluator import BCEvaluator, GPURemoteBCEvaluator, \ RemoteBCEvaluator -from ray.rllib.optimizers import AsyncOptimizer +from ray.rllib.optimizers import AsyncGradientsOptimizer from ray.tune.result import TrainingResult from ray.tune.trial import Resources @@ -71,7 +71,7 @@ class BCAgent(Agent): self.remote_evaluators = [ remote_cls.remote(self.env_creator, self.config, self.logdir) for _ in range(self.config["num_workers"])] - self.optimizer = AsyncOptimizer( + self.optimizer = AsyncGradientsOptimizer( self.config["optimizer"], self.local_evaluator, self.remote_evaluators) diff --git a/python/ray/rllib/ddpg/apex.py b/python/ray/rllib/ddpg/apex.py index 7a3b5c598..8ede5109f 100644 --- a/python/ray/rllib/ddpg/apex.py +++ b/python/ray/rllib/ddpg/apex.py @@ -8,7 +8,7 @@ from ray.utils import merge_dicts APEX_DDPG_DEFAULT_CONFIG = merge_dicts( DDPG_CONFIG, { - "optimizer_class": "ApexOptimizer", + "optimizer_class": "AsyncSamplesOptimizer", "optimizer_config": merge_dicts( DDPG_CONFIG["optimizer_config"], { diff --git a/python/ray/rllib/ddpg/ddpg.py b/python/ray/rllib/ddpg/ddpg.py index adb323843..abd9f3d81 100644 --- a/python/ray/rllib/ddpg/ddpg.py +++ b/python/ray/rllib/ddpg/ddpg.py @@ -102,7 +102,7 @@ DEFAULT_CONFIG = { # Whether to allocate CPUs for workers (if > 0). "num_cpus_per_worker": 1, # Optimizer class to use. - "optimizer_class": "LocalSyncReplayOptimizer", + "optimizer_class": "SyncReplayOptimizer", # Config to pass to the optimizer. "optimizer_config": {}, # Whether to use a distribution of epsilons across workers for exploration. diff --git a/python/ray/rllib/dqn/apex.py b/python/ray/rllib/dqn/apex.py index bfb58d02a..d12754b89 100644 --- a/python/ray/rllib/dqn/apex.py +++ b/python/ray/rllib/dqn/apex.py @@ -9,7 +9,7 @@ from ray.utils import merge_dicts APEX_DEFAULT_CONFIG = merge_dicts( DQN_CONFIG, { - "optimizer_class": "ApexOptimizer", + "optimizer_class": "AsyncSamplesOptimizer", "optimizer_config": merge_dicts( DQN_CONFIG["optimizer_config"], { diff --git a/python/ray/rllib/dqn/dqn.py b/python/ray/rllib/dqn/dqn.py index 960b30185..8c0e55391 100644 --- a/python/ray/rllib/dqn/dqn.py +++ b/python/ray/rllib/dqn/dqn.py @@ -96,7 +96,7 @@ DEFAULT_CONFIG = { # Whether to allocate CPUs for workers (if > 0). "num_cpus_per_worker": 1, # Optimizer class to use. - "optimizer_class": "LocalSyncReplayOptimizer", + "optimizer_class": "SyncReplayOptimizer", # Config to pass to the optimizer. "optimizer_config": {}, # Whether to use a distribution of epsilons across workers for exploration. diff --git a/python/ray/rllib/optimizers/__init__.py b/python/ray/rllib/optimizers/__init__.py index 9bcd38899..70d70a197 100644 --- a/python/ray/rllib/optimizers/__init__.py +++ b/python/ray/rllib/optimizers/__init__.py @@ -1,14 +1,15 @@ -from ray.rllib.optimizers.apex_optimizer import ApexOptimizer -from ray.rllib.optimizers.async_optimizer import AsyncOptimizer -from ray.rllib.optimizers.local_sync import LocalSyncOptimizer -from ray.rllib.optimizers.local_sync_replay import LocalSyncReplayOptimizer -from ray.rllib.optimizers.multi_gpu import LocalMultiGPUOptimizer +from ray.rllib.optimizers.async_samples_optimizer import AsyncSamplesOptimizer +from ray.rllib.optimizers.async_gradients_optimizer import \ + AsyncGradientsOptimizer +from ray.rllib.optimizers.sync_samples_optimizer import SyncSamplesOptimizer +from ray.rllib.optimizers.sync_replay_optimizer import SyncReplayOptimizer +from ray.rllib.optimizers.multi_gpu_optimizer import LocalMultiGPUOptimizer from ray.rllib.optimizers.sample_batch import SampleBatch, MultiAgentBatch from ray.rllib.optimizers.policy_evaluator import PolicyEvaluator, \ TFMultiGPUSupport __all__ = [ - "ApexOptimizer", "AsyncOptimizer", "LocalSyncOptimizer", - "LocalSyncReplayOptimizer", "LocalMultiGPUOptimizer", "SampleBatch", + "AsyncSamplesOptimizer", "AsyncGradientsOptimizer", "SyncSamplesOptimizer", + "SyncReplayOptimizer", "LocalMultiGPUOptimizer", "SampleBatch", "PolicyEvaluator", "TFMultiGPUSupport", "MultiAgentBatch"] diff --git a/python/ray/rllib/optimizers/async_optimizer.py b/python/ray/rllib/optimizers/async_gradients_optimizer.py similarity index 98% rename from python/ray/rllib/optimizers/async_optimizer.py rename to python/ray/rllib/optimizers/async_gradients_optimizer.py index 2fd253e95..7ec11e2a6 100644 --- a/python/ray/rllib/optimizers/async_optimizer.py +++ b/python/ray/rllib/optimizers/async_gradients_optimizer.py @@ -7,7 +7,7 @@ from ray.rllib.optimizers.policy_optimizer import PolicyOptimizer from ray.rllib.utils.timer import TimerStat -class AsyncOptimizer(PolicyOptimizer): +class AsyncGradientsOptimizer(PolicyOptimizer): """An asynchronous RL optimizer, e.g. for implementing A3C. This optimizer asynchronously pulls and applies gradients from remote diff --git a/python/ray/rllib/optimizers/apex_optimizer.py b/python/ray/rllib/optimizers/async_samples_optimizer.py similarity index 98% rename from python/ray/rllib/optimizers/apex_optimizer.py rename to python/ray/rllib/optimizers/async_samples_optimizer.py index cb07342f3..8e4772909 100644 --- a/python/ray/rllib/optimizers/apex_optimizer.py +++ b/python/ray/rllib/optimizers/async_samples_optimizer.py @@ -135,8 +135,8 @@ class LearnerThread(threading.Thread): self.weights_updated = True -class ApexOptimizer(PolicyOptimizer): - """Main event loop of the Ape-X optimizer. +class AsyncSamplesOptimizer(PolicyOptimizer): + """Main event loop of the Ape-X optimizer (async sampling with replay). This class coordinates the data transfers between the learner thread, remote evaluators (Ape-X actors), and replay buffer actors. diff --git a/python/ray/rllib/optimizers/multi_gpu.py b/python/ray/rllib/optimizers/multi_gpu_optimizer.py similarity index 100% rename from python/ray/rllib/optimizers/multi_gpu.py rename to python/ray/rllib/optimizers/multi_gpu_optimizer.py diff --git a/python/ray/rllib/optimizers/local_sync_replay.py b/python/ray/rllib/optimizers/sync_replay_optimizer.py similarity index 99% rename from python/ray/rllib/optimizers/local_sync_replay.py rename to python/ray/rllib/optimizers/sync_replay_optimizer.py index b6545cb85..771695472 100644 --- a/python/ray/rllib/optimizers/local_sync_replay.py +++ b/python/ray/rllib/optimizers/sync_replay_optimizer.py @@ -16,7 +16,7 @@ from ray.rllib.utils.filter import RunningStat from ray.rllib.utils.timer import TimerStat -class LocalSyncReplayOptimizer(PolicyOptimizer): +class SyncReplayOptimizer(PolicyOptimizer): """Variant of the local sync optimizer that supports replay (for DQN). This optimizer requires that policy evaluators return an additional diff --git a/python/ray/rllib/optimizers/local_sync.py b/python/ray/rllib/optimizers/sync_samples_optimizer.py similarity index 97% rename from python/ray/rllib/optimizers/local_sync.py rename to python/ray/rllib/optimizers/sync_samples_optimizer.py index 3f71cce4e..2995a1034 100644 --- a/python/ray/rllib/optimizers/local_sync.py +++ b/python/ray/rllib/optimizers/sync_samples_optimizer.py @@ -9,7 +9,7 @@ from ray.rllib.utils.filter import RunningStat from ray.rllib.utils.timer import TimerStat -class LocalSyncOptimizer(PolicyOptimizer): +class SyncSamplesOptimizer(PolicyOptimizer): """A simple synchronous RL optimizer. In each step, this optimizer pulls samples from a number of remote diff --git a/python/ray/rllib/pg/pg.py b/python/ray/rllib/pg/pg.py index 7d78c3c38..1926d4c1c 100644 --- a/python/ray/rllib/pg/pg.py +++ b/python/ray/rllib/pg/pg.py @@ -3,7 +3,7 @@ from __future__ import division from __future__ import print_function from ray.rllib.agent import Agent -from ray.rllib.optimizers import LocalSyncOptimizer +from ray.rllib.optimizers import SyncSamplesOptimizer from ray.rllib.pg.pg_policy_graph import PGPolicyGraph from ray.rllib.utils.common_policy_evaluator import CommonPolicyEvaluator, \ collect_metrics @@ -54,7 +54,7 @@ class PGAgent(Agent): return Resources(cpu=1, gpu=0, extra_cpu=cf["num_workers"]) def _init(self): - self.optimizer = LocalSyncOptimizer.make( + self.optimizer = SyncSamplesOptimizer.make( evaluator_cls=CommonPolicyEvaluator, evaluator_args={ "env_creator": self.env_creator, diff --git a/python/ray/rllib/ppo/ppo.py b/python/ray/rllib/ppo/ppo.py index a609b2429..5ae3e8dbf 100644 --- a/python/ray/rllib/ppo/ppo.py +++ b/python/ray/rllib/ppo/ppo.py @@ -13,7 +13,7 @@ from ray.tune.trial import Resources from ray.rllib.agent import Agent from ray.rllib.utils import FilterManager from ray.rllib.ppo.ppo_evaluator import PPOEvaluator -from ray.rllib.optimizers.multi_gpu import LocalMultiGPUOptimizer +from ray.rllib.optimizers.multi_gpu_optimizer import LocalMultiGPUOptimizer DEFAULT_CONFIG = { # Discount factor of the MDP diff --git a/python/ray/rllib/test/test_multi_agent_env.py b/python/ray/rllib/test/test_multi_agent_env.py index a37810568..7424d0eba 100644 --- a/python/ray/rllib/test/test_multi_agent_env.py +++ b/python/ray/rllib/test/test_multi_agent_env.py @@ -10,8 +10,8 @@ import ray from ray.rllib.pg import PGAgent from ray.rllib.pg.pg_policy_graph import PGPolicyGraph from ray.rllib.dqn.dqn_policy_graph import DQNPolicyGraph -from ray.rllib.optimizers import LocalSyncOptimizer, \ - LocalSyncReplayOptimizer, AsyncOptimizer +from ray.rllib.optimizers import SyncSamplesOptimizer, \ + SyncReplayOptimizer, AsyncGradientsOptimizer from ray.rllib.test.test_common_policy_evaluator import MockEnv, MockEnv2, \ MockPolicyGraph from ray.rllib.utils.common_policy_evaluator import CommonPolicyEvaluator, \ @@ -270,7 +270,7 @@ class TestMultiAgentEnv(unittest.TestCase): act_space = env.action_space obs_space = env.observation_space dqn_config = {"gamma": 0.95, "n_step": 3} - if optimizer_cls == LocalSyncReplayOptimizer: + if optimizer_cls == SyncReplayOptimizer: # TODO: support replay with non-DQN graphs. Currently this can't # happen since the replay buffer doesn't encode extra fields like # "advantages" that PG uses. @@ -288,7 +288,7 @@ class TestMultiAgentEnv(unittest.TestCase): policy_graph=policies, policy_mapping_fn=lambda agent_id: ["p1", "p2"][agent_id % 2], batch_steps=50) - if optimizer_cls == AsyncOptimizer: + if optimizer_cls == AsyncGradientsOptimizer: remote_evs = [CommonPolicyEvaluator.as_remote().remote( env_creator=lambda _: MultiCartpole(n), policy_graph=policies, @@ -315,13 +315,13 @@ class TestMultiAgentEnv(unittest.TestCase): raise Exception("failed to improve reward") def testMultiAgentSyncOptimizer(self): - self._testWithOptimizer(LocalSyncOptimizer) + self._testWithOptimizer(SyncSamplesOptimizer) - def testMultiAgentAsyncOptimizer(self): - self._testWithOptimizer(AsyncOptimizer) + def testMultiAgentAsyncGradientsOptimizer(self): + self._testWithOptimizer(AsyncGradientsOptimizer) def testMultiAgentReplayOptimizer(self): - self._testWithOptimizer(LocalSyncReplayOptimizer) + self._testWithOptimizer(SyncReplayOptimizer) def testTrainMultiCartpoleManyPolicies(self): n = 20 @@ -338,7 +338,7 @@ class TestMultiAgentEnv(unittest.TestCase): policy_graph=policies, policy_mapping_fn=lambda agent_id: random.choice(policy_ids), batch_steps=100) - optimizer = LocalSyncOptimizer({}, ev, []) + optimizer = SyncSamplesOptimizer({}, ev, []) for i in range(100): optimizer.step() result = collect_metrics(ev) diff --git a/python/ray/rllib/test/test_optimizers.py b/python/ray/rllib/test/test_optimizers.py index 3118f7956..a9a109aa3 100644 --- a/python/ray/rllib/test/test_optimizers.py +++ b/python/ray/rllib/test/test_optimizers.py @@ -8,7 +8,7 @@ import numpy as np import ray from ray.rllib.test.mock_evaluator import _MockEvaluator -from ray.rllib.optimizers import AsyncOptimizer, SampleBatch +from ray.rllib.optimizers import AsyncGradientsOptimizer, SampleBatch class AsyncOptimizerTest(unittest.TestCase): @@ -20,7 +20,7 @@ class AsyncOptimizerTest(unittest.TestCase): local = _MockEvaluator() remotes = ray.remote(_MockEvaluator) remote_evaluators = [remotes.remote() for i in range(5)] - test_optimizer = AsyncOptimizer({ + test_optimizer = AsyncGradientsOptimizer({ "grads_per_step": 10 }, local, remote_evaluators) test_optimizer.step() diff --git a/python/ray/rllib/tuned_examples/halfcheetah-ddpg.yaml b/python/ray/rllib/tuned_examples/halfcheetah-ddpg.yaml index 08e0c05af..62b702551 100644 --- a/python/ray/rllib/tuned_examples/halfcheetah-ddpg.yaml +++ b/python/ray/rllib/tuned_examples/halfcheetah-ddpg.yaml @@ -46,7 +46,7 @@ halfcheetah-ddpg: # === Parallelism === num_workers: 0 num_gpus_per_worker: 0 - optimizer_class: "LocalSyncReplayOptimizer" + optimizer_class: "SyncReplayOptimizer" optimizer_config: {} per_worker_exploration: False worker_side_prioritization: False diff --git a/python/ray/rllib/tuned_examples/mountaincarcontinuous-ddpg.yaml b/python/ray/rllib/tuned_examples/mountaincarcontinuous-ddpg.yaml index 34d588a91..e626139c2 100644 --- a/python/ray/rllib/tuned_examples/mountaincarcontinuous-ddpg.yaml +++ b/python/ray/rllib/tuned_examples/mountaincarcontinuous-ddpg.yaml @@ -46,7 +46,7 @@ mountaincarcontinuous-ddpg: # === Parallelism === num_workers: 0 num_gpus_per_worker: 0 - optimizer_class: "LocalSyncReplayOptimizer" + optimizer_class: "SyncReplayOptimizer" optimizer_config: {} per_worker_exploration: False worker_side_prioritization: False diff --git a/python/ray/rllib/tuned_examples/pendulum-ddpg.yaml b/python/ray/rllib/tuned_examples/pendulum-ddpg.yaml index 01efc1466..9764f33cb 100644 --- a/python/ray/rllib/tuned_examples/pendulum-ddpg.yaml +++ b/python/ray/rllib/tuned_examples/pendulum-ddpg.yaml @@ -46,7 +46,7 @@ pendulum-ddpg: # === Parallelism === num_workers: 0 num_gpus_per_worker: 0 - optimizer_class: "LocalSyncReplayOptimizer" + optimizer_class: "SyncReplayOptimizer" optimizer_config: {} per_worker_exploration: False worker_side_prioritization: False diff --git a/python/ray/rllib/utils/common_policy_evaluator.py b/python/ray/rllib/utils/common_policy_evaluator.py index bb1891629..14d374d1a 100644 --- a/python/ray/rllib/utils/common_policy_evaluator.py +++ b/python/ray/rllib/utils/common_policy_evaluator.py @@ -83,7 +83,7 @@ class CommonPolicyEvaluator(PolicyEvaluator): "dones": [[...]], "new_obs": [[...]]}) # Creating policy evaluators using optimizer_cls.make(). - >>> optimizer = LocalSyncOptimizer.make( + >>> optimizer = SyncSamplesOptimizer.make( evaluator_cls=CommonPolicyEvaluator, evaluator_args={ "env_creator": lambda _: gym.make("CartPole-v0"),