Files
ray/python/ray/rllib/tuned_examples/pendulum-ddpg.yaml
T
andrewztan 1475600c81 [rllib] Merge DDPG and DDPG2 implementations (#2202)
* removed ddpg2

* removed ddpg2 from codebase

* added tests used in ddpg vs ddpg2 comparison

* added notes about training timesteps to yaml files

* removed ddpg2 yaml files

* removed unnecessary configs from yaml files

* removed unnecessary configs from yaml files

* moved pendulum, mountaincarcontinuous, and halfcheetah tests to tuned_examples

* moved pendulum, mountaincarcontinuous, and halfcheetah tests to tuned_examples

* added more configuration details to yaml files

* removed random starts from halfcheetah
2018-06-09 16:46:23 -07:00

68 lines
1.8 KiB
YAML

# Experiment spec: DDPG on Pendulum-v0.
# This configuration can be expected to reach a mean episode reward of -160
# within 10k-20k timesteps.
pendulum-ddpg:
  env: Pendulum-v0
  run: DDPG
  # Stopping thresholds for the run.
  stop:
    episode_reward_mean: -160
    time_total_s: 600  # 10 minutes
  config:
    # === Model ===
    actor_hiddens: [64, 64]
    critic_hiddens: [64, 64]
    n_step: 1
    model: {}
    gamma: 0.99
    env_config: {}

    # === Exploration ===
    schedule_max_timesteps: 100000
    timesteps_per_iteration: 600
    exploration_fraction: 0.1
    exploration_final_eps: 0.02
    noise_scale: 0.1
    exploration_theta: 0.15
    exploration_sigma: 0.2
    # NOTE(review): 0 presumably means the target network is updated on every
    # step (softly, via tau) -- confirm against the DDPG agent defaults.
    target_network_update_freq: 0
    tau: 0.001

    # === Replay buffer ===
    buffer_size: 10000
    prioritized_replay: true
    prioritized_replay_alpha: 0.6
    prioritized_replay_beta: 0.4
    prioritized_replay_eps: 0.000001
    clip_rewards: false

    # === Optimization ===
    actor_lr: 0.0001
    critic_lr: 0.001
    use_huber: true
    huber_threshold: 1.0
    l2_reg: 0.000001
    learning_starts: 500
    sample_batch_size: 1
    train_batch_size: 64
    smoothing_num_episodes: 10

    # === Tensorflow ===
    # Written in block style; same keys and values as the former JSON-style
    # flow mapping.
    tf_session_args:
      device_count:
        CPU: 2
      log_device_placement: false
      allow_soft_placement: true
      gpu_options:
        allow_growth: true
      inter_op_parallelism_threads: 1
      intra_op_parallelism_threads: 1

    # === Parallelism ===
    num_workers: 0
    num_gpus_per_worker: 0
    optimizer_class: "LocalSyncReplayOptimizer"
    optimizer_config: {}
    per_worker_exploration: false
    worker_side_prioritization: false