mirror of
https://github.com/wassname/ray.git
synced 2026-06-28 19:01:10 +08:00
[rllib] Update RLlib to work with new actor scheduling behavior (#1754)
* Mon Mar 19 21:23:01 PDT 2018
* Mon Mar 19 21:23:07 PDT 2018
* Mon Mar 19 21:30:49 PDT 2018
* Mon Mar 19 21:32:05 PDT 2018
* Mon Mar 19 21:35:43 PDT 2018
* fix cpu limits
* Mon Mar 19 22:25:07 PDT 2018
This commit is contained in:
@@ -22,7 +22,6 @@ APEX_DEFAULT_CONFIG = dict(DQN_CONFIG, **dict(
|
||||
timesteps_per_iteration=25000,
|
||||
per_worker_exploration=True,
|
||||
worker_side_prioritization=True,
|
||||
force_evaluators_remote=False, # consider enabling for large clusters
|
||||
))
|
||||
|
||||
|
||||
@@ -30,8 +29,7 @@ class ApexAgent(DQNAgent):
|
||||
"""DQN variant that uses the Ape-X distributed policy optimizer.
|
||||
|
||||
By default, this is configured for a large single node (32 cores). For
|
||||
running in a large cluster, increase `num_workers` and consider setting
|
||||
`force_evaluators_remote` to move workers off of the head node.
|
||||
running in a large cluster, increase the `num_workers` config var.
|
||||
"""
|
||||
|
||||
_agent_name = "APEX"
|
||||
|
||||
@@ -11,7 +11,6 @@ import tensorflow as tf
|
||||
import ray
|
||||
from ray.rllib import optimizers
|
||||
from ray.rllib.dqn.dqn_evaluator import DQNEvaluator
|
||||
from ray.rllib.utils.actors import drop_colocated
|
||||
from ray.rllib.agent import Agent
|
||||
from ray.tune.result import TrainingResult
|
||||
|
||||
@@ -114,9 +113,7 @@ DEFAULT_CONFIG = dict(
|
||||
# Whether to use a distribution of epsilons across workers for exploration.
|
||||
per_worker_exploration=False,
|
||||
# Whether to compute priorities on workers.
|
||||
worker_side_prioritization=False,
|
||||
# Whether to force evaluator actors to be placed on remote machines.
|
||||
force_evaluators_remote=False)
|
||||
worker_side_prioritization=False)
|
||||
|
||||
|
||||
class DQNAgent(Agent):
|
||||
@@ -137,9 +134,6 @@ class DQNAgent(Agent):
|
||||
i)
|
||||
for i in range(self.config["num_workers"])]
|
||||
|
||||
if self.config["force_evaluators_remote"]:
|
||||
self.remote_evaluators = drop_colocated(self.remote_evaluators)
|
||||
|
||||
for k in OPTIMIZER_SHARED_CONFIGS:
|
||||
if k not in self.config["optimizer_config"]:
|
||||
self.config["optimizer_config"][k] = self.config[k]
|
||||
|
||||
@@ -18,7 +18,7 @@ import ray
|
||||
from ray.rllib.optimizers.policy_optimizer import PolicyOptimizer
|
||||
from ray.rllib.optimizers.replay_buffer import PrioritizedReplayBuffer
|
||||
from ray.rllib.optimizers.sample_batch import SampleBatch
|
||||
from ray.rllib.utils.actors import TaskPool, create_colocated
|
||||
from ray.rllib.utils.actors import TaskPool
|
||||
from ray.rllib.utils.timer import TimerStat
|
||||
from ray.rllib.utils.window_stat import WindowStat
|
||||
|
||||
@@ -163,12 +163,15 @@ class ApexOptimizer(PolicyOptimizer):
|
||||
self.learner = LearnerThread(self.local_evaluator)
|
||||
self.learner.start()
|
||||
|
||||
self.replay_actors = create_colocated(
|
||||
ReplayActor,
|
||||
[num_replay_buffer_shards, learning_starts, buffer_size,
|
||||
train_batch_size, prioritized_replay_alpha,
|
||||
prioritized_replay_beta, prioritized_replay_eps, clip_rewards],
|
||||
num_replay_buffer_shards)
|
||||
# TODO(ekl) use create_colocated() for these actors once
|
||||
# https://github.com/ray-project/ray/issues/1734 is fixed
|
||||
self.replay_actors = [
|
||||
ReplayActor.remote(
|
||||
num_replay_buffer_shards, learning_starts, buffer_size,
|
||||
train_batch_size, prioritized_replay_alpha,
|
||||
prioritized_replay_beta, prioritized_replay_eps, clip_rewards)
|
||||
for _ in range(num_replay_buffer_shards)
|
||||
]
|
||||
assert len(self.remote_evaluators) > 0
|
||||
|
||||
# Stats
|
||||
|
||||
@@ -5,7 +5,7 @@ cartpole-ppo:
|
||||
episode_reward_mean: 200
|
||||
time_total_s: 180
|
||||
resources:
|
||||
cpu: 2
|
||||
cpu: 3
|
||||
driver_cpu_limit: 1
|
||||
config:
|
||||
num_workers: 2
|
||||
|
||||
@@ -2,8 +2,8 @@ hopper-ppo:
|
||||
env: Hopper-v1
|
||||
run: PPO
|
||||
resources:
|
||||
cpu: 64
|
||||
cpu: 65
|
||||
gpu: 4
|
||||
driver_cpu_limit: 4
|
||||
driver_cpu_limit: 1
|
||||
driver_gpu_limit: 4
|
||||
config: {"gamma": 0.995, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": .0001, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 160000, "num_workers": 64}
|
||||
|
||||
@@ -2,8 +2,8 @@ humanoid-es:
|
||||
env: Humanoid-v1
|
||||
run: ES
|
||||
resources:
|
||||
cpu: 100
|
||||
driver_cpu_limit: 4
|
||||
cpu: 101
|
||||
driver_cpu_limit: 1
|
||||
stop:
|
||||
episode_reward_mean: 6000
|
||||
config:
|
||||
|
||||
@@ -4,8 +4,8 @@ humanoid-ppo-gae:
|
||||
stop:
|
||||
episode_reward_mean: 6000
|
||||
resources:
|
||||
cpu: 64
|
||||
cpu: 65
|
||||
gpu: 4
|
||||
driver_cpu_limit: 4
|
||||
driver_cpu_limit: 1
|
||||
config: {"lambda": 0.95, "clip_param": 0.2, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": .0001, "sgd_batchsize": 32768, "horizon": 5000, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64, "model": {"free_log_std": true}, "write_logs": false}
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@ humanoid-ppo:
|
||||
stop:
|
||||
episode_reward_mean: 6000
|
||||
resources:
|
||||
cpu: 64
|
||||
cpu: 65
|
||||
gpu: 4
|
||||
driver_cpu_limit: 4
|
||||
driver_cpu_limit: 1
|
||||
config: {"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": .0001, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64, "model": {"free_log_std": true}, "use_gae": false}
|
||||
|
||||
@@ -3,9 +3,11 @@ pendulum-ppo:
|
||||
env: Pendulum-v0
|
||||
run: PPO
|
||||
resources:
|
||||
cpu: 4
|
||||
cpu: 5
|
||||
driver_cpu_limit: 1
|
||||
config:
|
||||
timesteps_per_batch: 2048
|
||||
num_workers: 4
|
||||
lambda: 0.1
|
||||
gamma: 0.95
|
||||
sgd_stepsize: 0.0003
|
||||
|
||||
@@ -6,10 +6,10 @@ pong-apex:
|
||||
run: APEX
|
||||
resources:
|
||||
cpu:
|
||||
eval: spec.config.num_workers
|
||||
eval: 1 + spec.config.num_workers
|
||||
driver_cpu_limit: 1
|
||||
gpu: 1
|
||||
config:
|
||||
force_evaluators_remote: True # set to False if you're running on a single node
|
||||
target_network_update_freq: 50000
|
||||
num_workers: 32
|
||||
lr: .0001
|
||||
|
||||
@@ -9,11 +9,13 @@ pong-deterministic-ppo:
|
||||
env: PongDeterministic-v4
|
||||
run: PPO
|
||||
resources:
|
||||
cpu: 6
|
||||
cpu: 5
|
||||
gpu: 1
|
||||
driver_cpu_limit: 1
|
||||
stop:
|
||||
episode_reward_mean: 21
|
||||
config:
|
||||
gamma: 0.99
|
||||
num_workers: 4
|
||||
num_sgd_iter: 20
|
||||
devices: ["/gpu:0"]
|
||||
|
||||
@@ -2,7 +2,7 @@ walker2d-v1-ppo:
|
||||
env: Walker2d-v1
|
||||
run: PPO
|
||||
resources:
|
||||
cpu: 64
|
||||
cpu: 65
|
||||
gpu: 4
|
||||
driver_cpu_limit: 4
|
||||
driver_cpu_limit: 1
|
||||
config: {"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": .0001, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64}
|
||||
|
||||
Reference in New Issue
Block a user