[rllib] Update RLlib to work with new actor scheduling behavior (#1754)

* Mon Mar 19 21:23:01 PDT 2018

* Mon Mar 19 21:23:07 PDT 2018

* Mon Mar 19 21:30:49 PDT 2018

* Mon Mar 19 21:32:05 PDT 2018

* Mon Mar 19 21:35:43 PDT 2018

* fix cpu limits

* Mon Mar 19 22:25:07 PDT 2018
This commit is contained in:
Eric Liang
2018-03-20 19:29:52 -07:00
committed by GitHub
parent 4bccabd910
commit b41bdcefa0
12 changed files with 31 additions and 32 deletions
+1 -3
View File
@@ -22,7 +22,6 @@ APEX_DEFAULT_CONFIG = dict(DQN_CONFIG, **dict(
timesteps_per_iteration=25000,
per_worker_exploration=True,
worker_side_prioritization=True,
force_evaluators_remote=False, # consider enabling for large clusters
))
@@ -30,8 +29,7 @@ class ApexAgent(DQNAgent):
"""DQN variant that uses the Ape-X distributed policy optimizer.
By default, this is configured for a large single node (32 cores). For
running in a large cluster, increase `num_workers` and consider setting
`force_evaluators_remote` to move workers off of the head node.
running in a large cluster, increase the `num_workers` config var.
"""
_agent_name = "APEX"
+1 -7
View File
@@ -11,7 +11,6 @@ import tensorflow as tf
import ray
from ray.rllib import optimizers
from ray.rllib.dqn.dqn_evaluator import DQNEvaluator
from ray.rllib.utils.actors import drop_colocated
from ray.rllib.agent import Agent
from ray.tune.result import TrainingResult
@@ -114,9 +113,7 @@ DEFAULT_CONFIG = dict(
# Whether to use a distribution of epsilons across workers for exploration.
per_worker_exploration=False,
# Whether to compute priorities on workers.
worker_side_prioritization=False,
# Whether to force evaluator actors to be placed on remote machines.
force_evaluators_remote=False)
worker_side_prioritization=False)
class DQNAgent(Agent):
@@ -137,9 +134,6 @@ class DQNAgent(Agent):
i)
for i in range(self.config["num_workers"])]
if self.config["force_evaluators_remote"]:
self.remote_evaluators = drop_colocated(self.remote_evaluators)
for k in OPTIMIZER_SHARED_CONFIGS:
if k not in self.config["optimizer_config"]:
self.config["optimizer_config"][k] = self.config[k]
+10 -7
View File
@@ -18,7 +18,7 @@ import ray
from ray.rllib.optimizers.policy_optimizer import PolicyOptimizer
from ray.rllib.optimizers.replay_buffer import PrioritizedReplayBuffer
from ray.rllib.optimizers.sample_batch import SampleBatch
from ray.rllib.utils.actors import TaskPool, create_colocated
from ray.rllib.utils.actors import TaskPool
from ray.rllib.utils.timer import TimerStat
from ray.rllib.utils.window_stat import WindowStat
@@ -163,12 +163,15 @@ class ApexOptimizer(PolicyOptimizer):
self.learner = LearnerThread(self.local_evaluator)
self.learner.start()
self.replay_actors = create_colocated(
ReplayActor,
[num_replay_buffer_shards, learning_starts, buffer_size,
train_batch_size, prioritized_replay_alpha,
prioritized_replay_beta, prioritized_replay_eps, clip_rewards],
num_replay_buffer_shards)
# TODO(ekl) use create_colocated() for these actors once
# https://github.com/ray-project/ray/issues/1734 is fixed
self.replay_actors = [
ReplayActor.remote(
num_replay_buffer_shards, learning_starts, buffer_size,
train_batch_size, prioritized_replay_alpha,
prioritized_replay_beta, prioritized_replay_eps, clip_rewards)
for _ in range(num_replay_buffer_shards)
]
assert len(self.remote_evaluators) > 0
# Stats
@@ -5,7 +5,7 @@ cartpole-ppo:
episode_reward_mean: 200
time_total_s: 180
resources:
cpu: 2
cpu: 3
driver_cpu_limit: 1
config:
num_workers: 2
@@ -2,8 +2,8 @@ hopper-ppo:
env: Hopper-v1
run: PPO
resources:
cpu: 64
cpu: 65
gpu: 4
driver_cpu_limit: 4
driver_cpu_limit: 1
driver_gpu_limit: 4
config: {"gamma": 0.995, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": .0001, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 160000, "num_workers": 64}
@@ -2,8 +2,8 @@ humanoid-es:
env: Humanoid-v1
run: ES
resources:
cpu: 100
driver_cpu_limit: 4
cpu: 101
driver_cpu_limit: 1
stop:
episode_reward_mean: 6000
config:
@@ -4,8 +4,8 @@ humanoid-ppo-gae:
stop:
episode_reward_mean: 6000
resources:
cpu: 64
cpu: 65
gpu: 4
driver_cpu_limit: 4
driver_cpu_limit: 1
config: {"lambda": 0.95, "clip_param": 0.2, "kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": .0001, "sgd_batchsize": 32768, "horizon": 5000, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64, "model": {"free_log_std": true}, "write_logs": false}
@@ -4,7 +4,7 @@ humanoid-ppo:
stop:
episode_reward_mean: 6000
resources:
cpu: 64
cpu: 65
gpu: 4
driver_cpu_limit: 4
driver_cpu_limit: 1
config: {"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": .0001, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64, "model": {"free_log_std": true}, "use_gae": false}
@@ -3,9 +3,11 @@ pendulum-ppo:
env: Pendulum-v0
run: PPO
resources:
cpu: 4
cpu: 5
driver_cpu_limit: 1
config:
timesteps_per_batch: 2048
num_workers: 4
lambda: 0.1
gamma: 0.95
sgd_stepsize: 0.0003
@@ -6,10 +6,10 @@ pong-apex:
run: APEX
resources:
cpu:
eval: spec.config.num_workers
eval: 1 + spec.config.num_workers
driver_cpu_limit: 1
gpu: 1
config:
force_evaluators_remote: True # set to False if you're running on a single node
target_network_update_freq: 50000
num_workers: 32
lr: .0001
@@ -9,11 +9,13 @@ pong-deterministic-ppo:
env: PongDeterministic-v4
run: PPO
resources:
cpu: 6
cpu: 5
gpu: 1
driver_cpu_limit: 1
stop:
episode_reward_mean: 21
config:
gamma: 0.99
num_workers: 4
num_sgd_iter: 20
devices: ["/gpu:0"]
@@ -2,7 +2,7 @@ walker2d-v1-ppo:
env: Walker2d-v1
run: PPO
resources:
cpu: 64
cpu: 65
gpu: 4
driver_cpu_limit: 4
driver_cpu_limit: 1
config: {"kl_coeff": 1.0, "num_sgd_iter": 20, "sgd_stepsize": .0001, "sgd_batchsize": 32768, "devices": ["/gpu:0", "/gpu:1", "/gpu:2", "/gpu:3"], "tf_session_args": {"device_count": {"GPU": 4}, "log_device_placement": false, "allow_soft_placement": true}, "timesteps_per_batch": 320000, "num_workers": 64}