mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 20:53:14 +08:00
[RLlib] Examples folder restructuring (Model examples; final part). (#8278)
- This PR completes any previously missing PyTorch Model counterparts to TFModels in examples/models. - It also makes sure, all example scripts in the rllib/examples folder are tested for both frameworks and learn the given task (this is often currently not checked) using a --as-test flag in connection with a --stop-reward.
This commit is contained in:
+7
-7
@@ -219,14 +219,14 @@ matrix:
|
||||
- . ./ci/travis/ci.sh build
|
||||
script:
|
||||
- ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=examples_A,examples_B rllib/...
|
||||
- ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=examples_C rllib/...
|
||||
- ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=examples_E,examples_L,examples_M,examples_N,examples_P rllib/...
|
||||
- ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=examples_U,examples_R,examples_S,examples_T rllib/...
|
||||
- ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=examples_C,examples_D rllib/...
|
||||
- ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=examples_E,examples_F,examples_G,examples_H,examples_I,examples_J,examples_K,examples_L,examples_M,examples_N,examples_O,examples_P rllib/...
|
||||
- ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=examples_Q,examples_R,examples_S,examples_T,examples_U,examples_V,examples_W,examples_X,examples_Y,examples_Z rllib/...
|
||||
|
||||
# RLlib: tests_dir: Everything in rllib/tests/ directory (A-I).
|
||||
- os: linux
|
||||
env:
|
||||
- RLLIB_TESTING=1 RLLIB_TESTS_DIR_TESTS_A_TO_I=1
|
||||
- RLLIB_TESTING=1 RLLIB_TESTS_DIR_TESTS_A_TO_L=1
|
||||
- PYTHON=3.6
|
||||
- TF_VERSION=2.0.0b1
|
||||
- TFP_VERSION=0.8
|
||||
@@ -237,12 +237,12 @@ matrix:
|
||||
before_script:
|
||||
- . ./ci/travis/ci.sh build
|
||||
script:
|
||||
- ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=tests_dir_A,tests_dir_B,tests_dir_C,tests_dir_D,tests_dir_E,tests_dir_F,tests_dir_G,tests_dir_H,tests_dir_I rllib/...
|
||||
- ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=tests_dir_A,tests_dir_B,tests_dir_C,tests_dir_D,tests_dir_E,tests_dir_F,tests_dir_G,tests_dir_H,tests_dir_I,tests_dir_J,tests_dir_K,tests_dir_L rllib/...
|
||||
|
||||
# RLlib: tests_dir: Everything in rllib/tests/ directory (J-Z).
|
||||
- os: linux
|
||||
env:
|
||||
- RLLIB_TESTING=1 RLLIB_TESTS_DIR_TESTS_J_TO_Z=1
|
||||
- RLLIB_TESTING=1 RLLIB_TESTS_DIR_TESTS_M_TO_Z=1
|
||||
- PYTHON=3.6
|
||||
- TF_VERSION=2.0.0b1
|
||||
- TFP_VERSION=0.8
|
||||
@@ -253,7 +253,7 @@ matrix:
|
||||
before_script:
|
||||
- . ./ci/travis/ci.sh build
|
||||
script:
|
||||
- ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=tests_dir_J,tests_dir_K,tests_dir_L,tests_dir_M,tests_dir_N,tests_dir_O,tests_dir_P,tests_dir_Q,tests_dir_R,tests_dir_S,tests_dir_T,tests_dir_U,tests_dir_V,tests_dir_W,tests_dir_X,tests_dir_Y,tests_dir_Z rllib/...
|
||||
- ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=tests_dir_M,tests_dir_N,tests_dir_O,tests_dir_P,tests_dir_Q,tests_dir_R,tests_dir_S,tests_dir_T,tests_dir_U,tests_dir_V,tests_dir_W,tests_dir_X,tests_dir_Y,tests_dir_Z rllib/...
|
||||
|
||||
# Cpp worker test
|
||||
- os: linux
|
||||
|
||||
+378
-122
@@ -27,8 +27,8 @@
|
||||
# 2) everything in a) using tf1.x
|
||||
# 3) everything in b) c) d) and e)
|
||||
# 4) everything in g)
|
||||
# 5) f), BUT only those tagged `tests_dir_A` to `tests_dir_I`
|
||||
# 6) f), BUT only those tagged `tests_dir_J` to `tests_dir_Z`
|
||||
# 5) f), BUT only those tagged `tests_dir_A` to `tests_dir_L`
|
||||
# 6) f), BUT only those tagged `tests_dir_M` to `tests_dir_Z`
|
||||
|
||||
|
||||
# --------------------------------------------------------------------
|
||||
@@ -1453,95 +1453,215 @@ py_test(
|
||||
|
||||
|
||||
py_test(
|
||||
name = "examples/autoregressive_action_dist", main = "examples/autoregressive_action_dist.py",
|
||||
name = "examples/autoregressive_action_dist_tf",
|
||||
main = "examples/autoregressive_action_dist.py",
|
||||
tags = ["examples", "examples_A"],
|
||||
size = "large",
|
||||
size = "medium",
|
||||
srcs = ["examples/autoregressive_action_dist.py"],
|
||||
args = ["--stop=150", "--num-cpus=4"]
|
||||
args = ["--as-test", "--stop-reward=150", "--num-cpus=4"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/batch_norm_model_ppo", main="examples/batch_norm_model.py",
|
||||
name = "examples/autoregressive_action_dist_torch",
|
||||
main = "examples/autoregressive_action_dist.py",
|
||||
tags = ["examples", "examples_A"],
|
||||
size = "medium",
|
||||
srcs = ["examples/autoregressive_action_dist.py"],
|
||||
args = ["--as-test", "--torch", "--stop-reward=150", "--num-cpus=4"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/batch_norm_model_ppo_tf",
|
||||
main = "examples/batch_norm_model.py",
|
||||
tags = ["examples", "examples_B"],
|
||||
size = "medium",
|
||||
srcs = ["examples/batch_norm_model.py"],
|
||||
args = ["--run=PPO", "--num-iters=1"]
|
||||
args = ["--as-test", "--run=PPO", "--stop-reward=80"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/batch_norm_model_pg", main="examples/batch_norm_model.py",
|
||||
name = "examples/batch_norm_model_ppo_torch",
|
||||
main = "examples/batch_norm_model.py",
|
||||
tags = ["examples", "examples_B"],
|
||||
size = "medium",
|
||||
srcs = ["examples/batch_norm_model.py"],
|
||||
args = ["--run=PG", "--num-iters=1"]
|
||||
args = ["--as-test", "--torch", "--run=PPO", "--stop-reward=80"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/batch_norm_model_dqn", main="examples/batch_norm_model.py",
|
||||
name = "examples/batch_norm_model_dqn_tf",
|
||||
main = "examples/batch_norm_model.py",
|
||||
tags = ["examples", "examples_B"],
|
||||
size = "medium",
|
||||
size = "medium", # DQN learns much slower with BatchNorm.
|
||||
srcs = ["examples/batch_norm_model.py"],
|
||||
args = ["--run=DQN", "--num-iters=1"]
|
||||
args = ["--as-test", "--run=DQN", "--stop-reward=70"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/batch_norm_model_ddpg", main="examples/batch_norm_model.py",
|
||||
name = "examples/batch_norm_model_dqn_torch",
|
||||
main = "examples/batch_norm_model.py",
|
||||
tags = ["examples", "examples_B"],
|
||||
size = "medium",
|
||||
size = "medium", # DQN learns much slower with BatchNorm.
|
||||
srcs = ["examples/batch_norm_model.py"],
|
||||
args = ["--run=DDPG", "--num-iters=1"]
|
||||
args = ["--as-test", "--torch", "--run=DQN", "--stop-reward=70"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/cartpole_lstm_impala", main="examples/cartpole_lstm.py",
|
||||
name = "examples/batch_norm_model_ddpg_tf",
|
||||
main = "examples/batch_norm_model.py",
|
||||
tags = ["examples", "examples_B"],
|
||||
size = "small",
|
||||
srcs = ["examples/batch_norm_model.py"],
|
||||
args = ["--run=DDPG", "--stop-iters=1"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/batch_norm_model_ddpg_torch",
|
||||
main = "examples/batch_norm_model.py",
|
||||
tags = ["examples", "examples_B"],
|
||||
size = "small",
|
||||
srcs = ["examples/batch_norm_model.py"],
|
||||
args = ["--torch", "--run=DDPG", "--stop-iters=1"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/cartpole_lstm_impala_tf",
|
||||
main = "examples/cartpole_lstm.py",
|
||||
tags = ["examples", "examples_C"],
|
||||
size = "small",
|
||||
srcs = ["examples/cartpole_lstm.py"],
|
||||
args = ["--as-test", "--run=IMPALA", "--stop-reward=40", "--num-cpus=4"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/cartpole_lstm_impala_torch",
|
||||
main = "examples/cartpole_lstm.py",
|
||||
tags = ["examples", "examples_C"],
|
||||
size = "medium",
|
||||
srcs = ["examples/cartpole_lstm.py"],
|
||||
args = ["--run=IMPALA", "--stop=40", "--num-cpus=4"]
|
||||
args = ["--as-test", "--torch", "--run=IMPALA", "--stop-reward=40", "--num-cpus=4"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/cartpole_lstm_ppo", main="examples/cartpole_lstm.py",
|
||||
name = "examples/cartpole_lstm_ppo_tf",
|
||||
main = "examples/cartpole_lstm.py",
|
||||
tags = ["examples", "examples_C"],
|
||||
size = "medium",
|
||||
srcs = ["examples/cartpole_lstm.py"],
|
||||
args = ["--run=PPO", "--stop=40", "--num-cpus=4"]
|
||||
args = ["--as-test", "--run=PPO", "--stop-reward=40", "--num-cpus=4"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/cartpole_lstm_ppo_with_prev_a_and_r", main="examples/cartpole_lstm.py",
|
||||
name = "examples/cartpole_lstm_ppo_torch",
|
||||
main = "examples/cartpole_lstm.py",
|
||||
tags = ["examples", "examples_C"],
|
||||
size = "large",
|
||||
size = "small",
|
||||
srcs = ["examples/cartpole_lstm.py"],
|
||||
args = ["--run=PPO", "--stop=40", "--use-prev-action-reward", "--num-cpus=4"]
|
||||
args = ["--as-test", "--torch", "--run=PPO", "--stop-reward=40", "--num-cpus=4"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/centralized_critic",
|
||||
name = "examples/cartpole_lstm_ppo_tf_with_prev_a_and_r",
|
||||
main = "examples/cartpole_lstm.py",
|
||||
tags = ["examples", "examples_C"],
|
||||
size = "medium",
|
||||
srcs = ["examples/cartpole_lstm.py"],
|
||||
args = ["--as-test", "--run=PPO", "--stop-reward=40", "--use-prev-action-reward", "--num-cpus=4"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/centralized_critic_tf",
|
||||
main = "examples/centralized_critic.py",
|
||||
tags = ["examples", "examples_C"],
|
||||
size = "medium",
|
||||
srcs = ["examples/centralized_critic.py"],
|
||||
args = ["--stop=2000"]
|
||||
args = ["--as-test", "--stop-reward=7.2"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/centralized_critic_2",
|
||||
name = "examples/centralized_critic_torch",
|
||||
main = "examples/centralized_critic.py",
|
||||
tags = ["examples", "examples_C"],
|
||||
size = "medium",
|
||||
srcs = ["examples/centralized_critic.py"],
|
||||
args = ["--as-test", "--torch", "--stop-reward=7.2"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/centralized_critic_2_tf",
|
||||
main = "examples/centralized_critic_2.py",
|
||||
tags = ["examples", "examples_C"],
|
||||
size = "medium",
|
||||
srcs = ["examples/centralized_critic_2.py"],
|
||||
args = ["--stop=2000"]
|
||||
args = ["--as-test", "--stop-reward=6.0"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/custom_eval", main = "examples/custom_eval.py",
|
||||
name = "examples/centralized_critic_2_torch",
|
||||
main = "examples/centralized_critic_2.py",
|
||||
tags = ["examples", "examples_C"],
|
||||
size = "medium",
|
||||
srcs = ["examples/custom_eval.py"],
|
||||
args = ["--custom-eval", "--num-cpus=4"]
|
||||
srcs = ["examples/centralized_critic_2.py"],
|
||||
args = ["--as-test", "--torch", "--stop-reward=6.0"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/custom_keras_model_a2c", main="examples/custom_keras_model.py",
|
||||
name = "examples/custom_env_tf",
|
||||
main = "examples/custom_env.py",
|
||||
tags = ["examples", "examples_C"],
|
||||
size = "medium",
|
||||
srcs = ["examples/custom_env.py"],
|
||||
args = ["--as-test"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/custom_env_torch",
|
||||
main = "examples/custom_env.py",
|
||||
tags = ["examples", "examples_C"],
|
||||
size = "medium",
|
||||
srcs = ["examples/custom_env.py"],
|
||||
args = ["--as-test", "--torch"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/custom_eval_tf",
|
||||
main = "examples/custom_eval.py",
|
||||
tags = ["examples", "examples_C"],
|
||||
size = "small",
|
||||
srcs = ["examples/custom_eval.py"],
|
||||
args = ["--num-cpus=4"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/custom_eval_torch",
|
||||
main = "examples/custom_eval.py",
|
||||
tags = ["examples", "examples_C"],
|
||||
size = "small",
|
||||
srcs = ["examples/custom_eval.py"],
|
||||
args = ["--torch", "--num-cpus=4"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/custom_fast_model_tf",
|
||||
main = "examples/custom_fast_model.py",
|
||||
tags = ["examples", "examples_C"],
|
||||
size = "small",
|
||||
srcs = ["examples/custom_fast_model.py"],
|
||||
args = ["--stop-iters=1", "--num-cpus=4"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/custom_fast_model_torch",
|
||||
main = "examples/custom_fast_model.py",
|
||||
tags = ["examples", "examples_C"],
|
||||
size = "small",
|
||||
srcs = ["examples/custom_fast_model.py"],
|
||||
args = ["--torch", "--stop-iters=1", "--num-cpus=4"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/custom_keras_model_a2c",
|
||||
main = "examples/custom_keras_model.py",
|
||||
tags = ["examples", "examples_C"],
|
||||
size = "large",
|
||||
srcs = ["examples/custom_keras_model.py"],
|
||||
@@ -1549,7 +1669,8 @@ py_test(
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/custom_keras_model_dqn", main="examples/custom_keras_model.py",
|
||||
name = "examples/custom_keras_model_dqn",
|
||||
main = "examples/custom_keras_model.py",
|
||||
tags = ["examples", "examples_C"],
|
||||
size = "medium",
|
||||
srcs = ["examples/custom_keras_model.py"],
|
||||
@@ -1557,7 +1678,8 @@ py_test(
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/custom_keras_model_ppo", main="examples/custom_keras_model.py",
|
||||
name = "examples/custom_keras_model_ppo",
|
||||
main = "examples/custom_keras_model.py",
|
||||
tags = ["examples", "examples_C"],
|
||||
size = "medium",
|
||||
srcs = ["examples/custom_keras_model.py"],
|
||||
@@ -1565,46 +1687,79 @@ py_test(
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/custom_keras_rnn_model_repeat_after_me", main = "examples/custom_keras_rnn_model.py",
|
||||
tags = ["examples", "examples_C"],
|
||||
size = "large",
|
||||
srcs = ["examples/custom_keras_rnn_model.py"],
|
||||
args = ["--run=PPO", "--stop=50", "--env=RepeatAfterMeEnv", "--num-cpus=4"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/custom_keras_rnn_model_repeat_initial",
|
||||
main = "examples/custom_keras_rnn_model.py",
|
||||
tags = ["examples", "examples_C"],
|
||||
size = "large",
|
||||
srcs = ["examples/custom_keras_rnn_model.py"],
|
||||
args = ["--run=PPO", "--stop=50", "--env=RepeatInitialObsEnv", "--num-cpus=4"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/custom_loss",
|
||||
name = "examples/custom_loss_tf",
|
||||
main = "examples/custom_loss.py",
|
||||
tags = ["examples", "examples_C"],
|
||||
size = "small",
|
||||
# Include the json data file.
|
||||
data = glob(["tests/data/cartpole_small/**"]),
|
||||
srcs = ["examples/custom_loss.py"],
|
||||
args = ["--iters=2", "--input-files=tests/data/cartpole_small"]
|
||||
args = ["--stop-iters=2", "--input-files=tests/data/cartpole_small"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/custom_loss_torch",
|
||||
main = "examples/custom_loss.py",
|
||||
tags = ["examples", "examples_C"],
|
||||
size = "small",
|
||||
# Include the json data file.
|
||||
data = glob(["tests/data/cartpole_small/**"]),
|
||||
srcs = ["examples/custom_loss.py"],
|
||||
args = ["--torch", "--stop-iters=2", "--input-files=tests/data/cartpole_small"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/custom_metrics_and_callbacks",
|
||||
main = "examples/custom_metrics_and_callbacks.py",
|
||||
tags = ["examples", "examples_C"],
|
||||
size = "small",
|
||||
srcs = ["examples/custom_metrics_and_callbacks.py"],
|
||||
args = ["--num-iters=2"]
|
||||
args = ["--stop-iters=2"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/custom_metrics_and_callbacks_legacy",
|
||||
main = "examples/custom_metrics_and_callbacks_legacy.py",
|
||||
tags = ["examples", "examples_C"],
|
||||
size = "small",
|
||||
srcs = ["examples/custom_metrics_and_callbacks_legacy.py"],
|
||||
args = ["--num-iters=2"]
|
||||
args = ["--stop-iters=2"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/custom_rnn_model_repeat_after_me_tf",
|
||||
main = "examples/custom_rnn_model.py",
|
||||
tags = ["examples", "examples_C"],
|
||||
size = "medium",
|
||||
srcs = ["examples/custom_rnn_model.py"],
|
||||
args = ["--as-test", "--run=PPO", "--stop-reward=40", "--env=RepeatAfterMeEnv", "--num-cpus=4"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/custom_rnn_model_repeat_initial_obs_tf",
|
||||
main = "examples/custom_rnn_model.py",
|
||||
tags = ["examples", "examples_C"],
|
||||
size = "medium",
|
||||
srcs = ["examples/custom_rnn_model.py"],
|
||||
args = ["--as-test", "--run=PPO", "--stop-reward=10", "--stop-timesteps=300000", "--env=RepeatInitialObsEnv", "--num-cpus=4"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/custom_rnn_model_repeat_after_me_torch",
|
||||
main = "examples/custom_rnn_model.py",
|
||||
tags = ["examples", "examples_C"],
|
||||
size = "medium",
|
||||
srcs = ["examples/custom_rnn_model.py"],
|
||||
args = ["--as-test", "--torch", "--run=PPO", "--stop-reward=40", "--env=RepeatAfterMeEnv", "--num-cpus=4"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/custom_rnn_model_repeat_initial_obs_torch",
|
||||
main = "examples/custom_rnn_model.py",
|
||||
tags = ["examples", "examples_C"],
|
||||
size = "medium",
|
||||
srcs = ["examples/custom_rnn_model.py"],
|
||||
args = ["--as-test", "--torch", "--run=PPO", "--stop-reward=10", "--stop-timesteps=300000", "--env=RepeatInitialObsEnv", "--num-cpus=4"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
@@ -1612,16 +1767,7 @@ py_test(
|
||||
tags = ["examples", "examples_C"],
|
||||
size = "medium",
|
||||
srcs = ["examples/custom_tf_policy.py"],
|
||||
args = ["--iters=2", "--num-cpus=4"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/custom_torch_rnn_model",
|
||||
main = "examples/custom_torch_rnn_model.py",
|
||||
tags = ["examples", "examples_C"],
|
||||
size = "medium",
|
||||
srcs = ["examples/custom_torch_rnn_model.py"],
|
||||
args = ["--run=PPO", "--stop=90", "--num-cpus=4"]
|
||||
args = ["--stop-iters=2", "--num-cpus=4"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
@@ -1629,7 +1775,7 @@ py_test(
|
||||
tags = ["examples", "examples_C"],
|
||||
size = "small",
|
||||
srcs = ["examples/custom_torch_policy.py"],
|
||||
args = ["--iters=2", "--num-cpus=4"]
|
||||
args = ["--stop-iters=2", "--num-cpus=4"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
@@ -1637,90 +1783,151 @@ py_test(
|
||||
tags = ["examples", "examples_E"],
|
||||
size = "small",
|
||||
srcs = ["examples/eager_execution.py"],
|
||||
args = ["--iters=2"]
|
||||
args = ["--stop-iters=2"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/hierarchical_training_tf",
|
||||
main = "examples/hierarchical_training.py",
|
||||
tags = ["examples", "examples_H"],
|
||||
size = "small",
|
||||
size = "medium",
|
||||
srcs = ["examples/hierarchical_training.py"],
|
||||
args = ["--stop-reward=0.0"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/hierarchical_training_torch",
|
||||
main = "examples/hierarchical_training.py",
|
||||
tags = ["examples", "examples_H"],
|
||||
size = "small",
|
||||
size = "medium",
|
||||
srcs = ["examples/hierarchical_training.py"],
|
||||
args = ["--torch", "--stop-reward=0.0"]
|
||||
)
|
||||
|
||||
# Do not run this test (MobileNetV2 is gigantic and takes forever for 1 iter).
|
||||
# py_test(
|
||||
# name = "examples/mobilenet_v2_with_lstm_tf",
|
||||
# main = "examples/mobilenet_v2_with_lstm.py",
|
||||
# tags = ["examples", "examples_M"],
|
||||
# size = "small",
|
||||
# srcs = ["examples/mobilenet_v2_with_lstm.py"]
|
||||
# )
|
||||
|
||||
py_test(
|
||||
name = "examples/multi_agent_cartpole",
|
||||
name = "examples/multi_agent_cartpole_tf",
|
||||
main = "examples/multi_agent_cartpole.py",
|
||||
tags = ["examples", "examples_M"],
|
||||
size = "medium",
|
||||
srcs = ["examples/multi_agent_cartpole.py"],
|
||||
args = ["--num-iters=2", "--num-cpus=4"]
|
||||
args = ["--as-test", "--stop-reward=70.0", "--num-cpus=4"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/multi_agent_custom_policy",
|
||||
name = "examples/multi_agent_cartpole_torch",
|
||||
main = "examples/multi_agent_cartpole.py",
|
||||
tags = ["examples", "examples_M"],
|
||||
size = "medium",
|
||||
srcs = ["examples/multi_agent_custom_policy.py"],
|
||||
size = "small",
|
||||
srcs = ["examples/multi_agent_cartpole.py"],
|
||||
args = ["--as-test", "--torch", "--stop-reward=70.0", "--num-cpus=4"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/multi_agent_two_trainers",
|
||||
name = "examples/multi_agent_custom_policy_tf",
|
||||
main = "examples/multi_agent_custom_policy.py",
|
||||
tags = ["examples", "examples_M"],
|
||||
size = "small",
|
||||
srcs = ["examples/multi_agent_custom_policy.py"],
|
||||
args = ["--as-test", "--stop-reward=80"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/multi_agent_custom_policy_torch",
|
||||
main = "examples/multi_agent_custom_policy.py",
|
||||
tags = ["examples", "examples_M"],
|
||||
size = "small",
|
||||
srcs = ["examples/multi_agent_custom_policy.py"],
|
||||
args = ["--as-test", "--torch", "--stop-reward=80"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/multi_agent_two_trainers_tf",
|
||||
main = "examples/multi_agent_two_trainers.py",
|
||||
tags = ["examples", "examples_M"],
|
||||
size = "medium",
|
||||
srcs = ["examples/multi_agent_two_trainers.py"],
|
||||
args = ["--num-iters=2"]
|
||||
args = ["--as-test", "--stop-reward=70"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/two_trainer_workflow",
|
||||
tags = ["examples", "examples_T"],
|
||||
name = "examples/multi_agent_two_trainers_torch",
|
||||
main = "examples/multi_agent_two_trainers.py",
|
||||
tags = ["examples", "examples_M"],
|
||||
size = "medium",
|
||||
srcs = ["examples/two_trainer_workflow.py"],
|
||||
args = ["--num-iters=2"]
|
||||
srcs = ["examples/multi_agent_two_trainers.py"],
|
||||
args = ["--as-test", "--torch", "--stop-reward=70"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/nested_action_spaces_ppo",
|
||||
name = "examples/multi_agent_two_trainers_mixed_torch_tf",
|
||||
main = "examples/multi_agent_two_trainers.py",
|
||||
tags = ["examples", "examples_M"],
|
||||
size = "small",
|
||||
srcs = ["examples/multi_agent_two_trainers.py"],
|
||||
args = ["--as-test", "--mixed-torch-tf", "--stop-reward=70"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/nested_action_spaces_ppo_tf",
|
||||
main = "examples/nested_action_spaces.py",
|
||||
tags = ["examples", "examples_N"],
|
||||
size = "medium",
|
||||
srcs = ["examples/nested_action_spaces.py"],
|
||||
args = ["--stop=-500", "--run=PPO"]
|
||||
args = ["--as-test", "--stop-reward=-600", "--run=PPO"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/parametric_actions_cartpole_pg",
|
||||
name = "examples/nested_action_spaces_ppo_torch",
|
||||
main = "examples/nested_action_spaces.py",
|
||||
tags = ["examples", "examples_N"],
|
||||
size = "medium",
|
||||
srcs = ["examples/nested_action_spaces.py"],
|
||||
args = ["--as-test", "--torch", "--stop-reward=-600", "--run=PPO"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/parametric_actions_cartpole_pg_tf",
|
||||
main = "examples/parametric_actions_cartpole.py",
|
||||
tags = ["examples", "examples_P"],
|
||||
size = "medium",
|
||||
srcs = ["examples/parametric_actions_cartpole.py"],
|
||||
args = ["--run=PG", "--stop=50"]
|
||||
args = ["--as-test", "--stop-reward=60.0", "--run=PG"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/parametric_actions_cartpole_ppo",
|
||||
name = "examples/parametric_actions_cartpole_dqn_tf",
|
||||
main = "examples/parametric_actions_cartpole.py",
|
||||
tags = ["examples", "examples_P"],
|
||||
size = "medium",
|
||||
srcs = ["examples/parametric_actions_cartpole.py"],
|
||||
args = ["--run=PPO", "--stop=50"]
|
||||
args = ["--as-test", "--stop-reward=60.0", "--run=DQN"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/parametric_actions_cartpole_dqn",
|
||||
name = "examples/parametric_actions_cartpole_pg_torch",
|
||||
main = "examples/parametric_actions_cartpole.py",
|
||||
tags = ["examples", "examples_P"],
|
||||
size = "small",
|
||||
srcs = ["examples/parametric_actions_cartpole.py"],
|
||||
args = ["--as-test", "--torch", "--stop-reward=60.0", "--run=PG"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/parametric_actions_cartpole_dqn_torch",
|
||||
main = "examples/parametric_actions_cartpole.py",
|
||||
tags = ["examples", "examples_P"],
|
||||
size = "medium",
|
||||
srcs = ["examples/parametric_actions_cartpole.py"],
|
||||
args = ["--run=DQN", "--stop=50"]
|
||||
args = ["--as-test", "--torch", "--stop-reward=60.0", "--run=DQN"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
@@ -1731,9 +1938,27 @@ py_test(
|
||||
args = ["--num-cpus=4"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/rock_paper_scissors_multiagent_tf",
|
||||
main = "examples/rock_paper_scissors_multiagent.py",
|
||||
tags = ["examples", "examples_R"],
|
||||
size = "medium",
|
||||
srcs = ["examples/rock_paper_scissors_multiagent.py"],
|
||||
args = ["--as-test"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/rock_paper_scissors_multiagent_torch",
|
||||
main = "examples/rock_paper_scissors_multiagent.py",
|
||||
tags = ["examples", "examples_R"],
|
||||
size = "medium",
|
||||
srcs = ["examples/rock_paper_scissors_multiagent.py"],
|
||||
args = ["--as-test", "--torch"],
|
||||
)
|
||||
|
||||
sh_test(
|
||||
name = "examples/serving/test_local_inference",
|
||||
tags = ["examples", "examples_L", "exclusive"],
|
||||
tags = ["examples", "examples_S", "exclusive"],
|
||||
size = "medium",
|
||||
srcs = ["examples/serving/test_local_inference.sh"],
|
||||
data = glob(["examples/serving/*.py"]),
|
||||
@@ -1741,27 +1966,82 @@ sh_test(
|
||||
|
||||
sh_test(
|
||||
name = "examples/serving/test_remote_inference",
|
||||
tags = ["examples", "examples_R", "exclusive"],
|
||||
tags = ["examples", "examples_S", "exclusive"],
|
||||
size = "medium",
|
||||
srcs = ["examples/serving/test_remote_inference.sh"],
|
||||
data = glob(["examples/serving/*.py"]),
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/rock_paper_scissors_multiagent",
|
||||
main = "examples/rock_paper_scissors_multiagent.py",
|
||||
tags = ["examples", "examples_R"],
|
||||
size = "large",
|
||||
srcs = ["examples/rock_paper_scissors_multiagent.py"],
|
||||
args = ["--stop=200"],
|
||||
name = "examples/two_trainer_workflow_tf",
|
||||
main = "examples/two_trainer_workflow.py",
|
||||
tags = ["examples", "examples_T"],
|
||||
size = "small",
|
||||
srcs = ["examples/two_trainer_workflow.py"],
|
||||
args = ["--as-test", "--stop-reward=100.0"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/twostep_game_maddpg", main = "examples/twostep_game.py",
|
||||
name = "examples/two_trainer_workflow_torch",
|
||||
main = "examples/two_trainer_workflow.py",
|
||||
tags = ["examples", "examples_T"],
|
||||
size = "small",
|
||||
srcs = ["examples/two_trainer_workflow.py"],
|
||||
args = ["--as-test", "--torch", "--stop-reward=100.0"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/two_trainer_workflow_mixed_torch_tf",
|
||||
main = "examples/two_trainer_workflow.py",
|
||||
tags = ["examples", "examples_T"],
|
||||
size = "small",
|
||||
srcs = ["examples/two_trainer_workflow.py"],
|
||||
args = ["--as-test", "--mixed-torch-tf", "--stop-reward=100.0"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/twostep_game_maddpg",
|
||||
main = "examples/twostep_game.py",
|
||||
tags = ["examples", "examples_T"],
|
||||
size = "large",
|
||||
srcs = ["examples/twostep_game.py"],
|
||||
args = ["--stop=2000", "--run=contrib/MADDPG"]
|
||||
args = ["--stop-timesteps=2000", "--run=contrib/MADDPG"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/twostep_game_pg_tf",
|
||||
main = "examples/twostep_game.py",
|
||||
tags = ["examples", "examples_T"],
|
||||
size = "medium",
|
||||
srcs = ["examples/twostep_game.py"],
|
||||
args = ["--as-test", "--stop-reward=7", "--run=PG"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/twostep_game_pg_torch",
|
||||
main = "examples/twostep_game.py",
|
||||
tags = ["examples", "examples_T"],
|
||||
size = "medium",
|
||||
srcs = ["examples/twostep_game.py"],
|
||||
args = ["--as-test", "--torch", "--stop-reward=7", "--run=PG"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/twostep_game_qmix",
|
||||
main = "examples/twostep_game.py",
|
||||
tags = ["examples", "examples_T"],
|
||||
size = "medium",
|
||||
srcs = ["examples/twostep_game.py"],
|
||||
args = ["--stop-timesteps=2000", "--run=QMIX"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/twostep_game_apex_qmix",
|
||||
main = "examples/twostep_game.py",
|
||||
tags = ["examples", "examples_T"],
|
||||
size = "medium",
|
||||
srcs = ["examples/twostep_game.py"],
|
||||
args = ["--stop-timesteps=2000", "--run=APEX_QMIX", "--num-cpus=4"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
@@ -1770,7 +2050,7 @@ py_test(
|
||||
tags = ["examples", "examples_T"],
|
||||
size = "small",
|
||||
srcs = ["contrib/bandits/examples/simple_context_bandit.py"],
|
||||
args = ["--stop-at-reward=10", "--run=contrib/LinTS"],
|
||||
args = ["--as-test", "--stop-reward=10", "--run=contrib/LinTS"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
@@ -1779,29 +2059,5 @@ py_test(
|
||||
tags = ["examples", "examples_U"],
|
||||
size = "small",
|
||||
srcs = ["contrib/bandits/examples/simple_context_bandit.py"],
|
||||
args = ["--stop-at-reward=10", "--run=contrib/LinUCB"],
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/twostep_game_pg", main = "examples/twostep_game.py",
|
||||
tags = ["examples", "examples_T"],
|
||||
size = "medium",
|
||||
srcs = ["examples/twostep_game.py"],
|
||||
args = ["--stop=2000", "--run=PG"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/twostep_game_qmix", main = "examples/twostep_game.py",
|
||||
tags = ["examples", "examples_T"],
|
||||
size = "medium",
|
||||
srcs = ["examples/twostep_game.py"],
|
||||
args = ["--stop=2000", "--run=QMIX"]
|
||||
)
|
||||
|
||||
py_test(
|
||||
name = "examples/twostep_game_apex_qmix", main = "examples/twostep_game.py",
|
||||
tags = ["examples", "examples_T"],
|
||||
size = "medium",
|
||||
srcs = ["examples/twostep_game.py"],
|
||||
args = ["--stop=2000", "--run=APEX_QMIX", "--num-cpus=4"]
|
||||
args = ["--as-test", "--stop-reward=10", "--run=contrib/LinUCB"],
|
||||
)
|
||||
|
||||
@@ -59,8 +59,13 @@ def apply_grad_clipping(policy, optimizer, loss):
|
||||
info = {}
|
||||
if policy.config["grad_clip"]:
|
||||
for param_group in optimizer.param_groups:
|
||||
info["grad_gnorm"] = nn.utils.clip_grad_norm_(
|
||||
param_group["params"], policy.config["grad_clip"])
|
||||
# Make sure we only pass params with grad != None into torch
|
||||
# clip_grad_norm_. Would fail otherwise.
|
||||
params = list(
|
||||
filter(lambda p: p.grad is not None, param_group["params"]))
|
||||
if params:
|
||||
info["grad_gnorm"] = nn.utils.clip_grad_norm_(
|
||||
params, policy.config["grad_clip"])
|
||||
return info
|
||||
|
||||
|
||||
|
||||
@@ -45,9 +45,9 @@ class DDPGTorchModel(TorchModelV2, nn.Module):
|
||||
only defines the layers for the output heads. Those layers for
|
||||
forward() should be defined in subclasses of DDPGTorchModel.
|
||||
"""
|
||||
TorchModelV2.__init__(self, obs_space, action_space, num_outputs,
|
||||
model_config, name)
|
||||
nn.Module.__init__(self)
|
||||
super(DDPGTorchModel, self).__init__(obs_space, action_space,
|
||||
num_outputs, model_config, name)
|
||||
|
||||
self.bounded = np.logical_and(action_space.bounded_above,
|
||||
action_space.bounded_below).any()
|
||||
@@ -58,7 +58,7 @@ class DDPGTorchModel(TorchModelV2, nn.Module):
|
||||
|
||||
# Build the policy network.
|
||||
self.policy_model = nn.Sequential()
|
||||
ins = obs_space.shape[-1]
|
||||
ins = num_outputs
|
||||
self.obs_ins = ins
|
||||
activation = get_activation_fn(
|
||||
actor_hidden_activation, framework="torch")
|
||||
|
||||
@@ -1,16 +1,20 @@
|
||||
"""A very simple contextual bandit example with 3 arms."""
|
||||
|
||||
import argparse
|
||||
import random
|
||||
import numpy as np
|
||||
import gym
|
||||
from gym.spaces import Discrete, Box
|
||||
import numpy as np
|
||||
import random
|
||||
|
||||
from ray import tune
|
||||
from ray.rllib.utils.test_utils import check_learning_achieved
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--stop-at-reward", type=float, default=10)
|
||||
parser.add_argument("--run", type=str, default="contrib/LinUCB")
|
||||
parser.add_argument("--as-test", action="store_true")
|
||||
parser.add_argument("--stop-iters", type=int, default=200)
|
||||
parser.add_argument("--stop-timesteps", type=int, default=100000)
|
||||
parser.add_argument("--stop-reward", type=float, default=10.0)
|
||||
|
||||
|
||||
class SimpleContextualBandit(gym.Env):
|
||||
@@ -37,11 +41,18 @@ class SimpleContextualBandit(gym.Env):
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
tune.run(
|
||||
args.run,
|
||||
stop={
|
||||
"episode_reward_mean": args.stop_at_reward,
|
||||
},
|
||||
config={
|
||||
"env": SimpleContextualBandit,
|
||||
})
|
||||
|
||||
stop = {
|
||||
"training_iteration": args.stop_iters,
|
||||
"timesteps_total": args.stop_timesteps,
|
||||
"episode_reward_mean": args.stop_reward,
|
||||
}
|
||||
|
||||
config = {
|
||||
"env": SimpleContextualBandit,
|
||||
}
|
||||
|
||||
results = tune.run(args.run, config=config, stop=stop)
|
||||
|
||||
if args.as_test:
|
||||
check_learning_achieved(results, args.stop_reward)
|
||||
|
||||
@@ -10,187 +10,56 @@ pattern, and a custom action distribution class that leverages that model.
|
||||
This examples shows both.
|
||||
"""
|
||||
|
||||
from gym.spaces import Discrete, Tuple
|
||||
import argparse
|
||||
|
||||
import ray
|
||||
from ray import tune
|
||||
from ray.rllib.examples.env.correlated_actions_env import CorrelatedActionsEnv
|
||||
from ray.rllib.examples.models.autoregressive_action_model import \
|
||||
AutoregressiveActionModel, TorchAutoregressiveActionModel
|
||||
from ray.rllib.examples.models.autoregressive_action_dist import \
|
||||
BinaryAutoregressiveDistribution, TorchBinaryAutoregressiveDistribution
|
||||
from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.models.tf.tf_action_dist import Categorical, ActionDistribution
|
||||
from ray.rllib.models.tf.misc import normc_initializer
|
||||
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
from ray.rllib.utils.test_utils import check_learning_achieved
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--run", type=str, default="PPO") # try PG, PPO, IMPALA
|
||||
parser.add_argument("--stop", type=int, default=200)
|
||||
parser.add_argument("--torch", action="store_true")
|
||||
parser.add_argument("--num-cpus", type=int, default=0)
|
||||
|
||||
|
||||
class BinaryAutoregressiveOutput(ActionDistribution):
|
||||
"""Action distribution P(a1, a2) = P(a1) * P(a2 | a1)"""
|
||||
|
||||
@staticmethod
|
||||
def required_model_output_shape(self, model_config):
|
||||
return 16 # controls model output feature vector size
|
||||
|
||||
def deterministic_sample(self):
|
||||
# first, sample a1
|
||||
a1_dist = self._a1_distribution()
|
||||
a1 = a1_dist.deterministic_sample()
|
||||
|
||||
# sample a2 conditioned on a1
|
||||
a2_dist = self._a2_distribution(a1)
|
||||
a2 = a2_dist.deterministic_sample()
|
||||
self._action_logp = a1_dist.logp(a1) + a2_dist.logp(a2)
|
||||
|
||||
# return the action tuple
|
||||
return (a1, a2)
|
||||
|
||||
def sample(self):
|
||||
# first, sample a1
|
||||
a1_dist = self._a1_distribution()
|
||||
a1 = a1_dist.sample()
|
||||
|
||||
# sample a2 conditioned on a1
|
||||
a2_dist = self._a2_distribution(a1)
|
||||
a2 = a2_dist.sample()
|
||||
self._action_logp = a1_dist.logp(a1) + a2_dist.logp(a2)
|
||||
|
||||
# return the action tuple
|
||||
return (a1, a2)
|
||||
|
||||
def logp(self, actions):
|
||||
a1, a2 = actions[:, 0], actions[:, 1]
|
||||
a1_vec = tf.expand_dims(tf.cast(a1, tf.float32), 1)
|
||||
a1_logits, a2_logits = self.model.action_model([self.inputs, a1_vec])
|
||||
return (
|
||||
Categorical(a1_logits).logp(a1) + Categorical(a2_logits).logp(a2))
|
||||
|
||||
def sampled_action_logp(self):
|
||||
return tf.exp(self._action_logp)
|
||||
|
||||
def entropy(self):
|
||||
a1_dist = self._a1_distribution()
|
||||
a2_dist = self._a2_distribution(a1_dist.sample())
|
||||
return a1_dist.entropy() + a2_dist.entropy()
|
||||
|
||||
def kl(self, other):
|
||||
a1_dist = self._a1_distribution()
|
||||
a1_terms = a1_dist.kl(other._a1_distribution())
|
||||
|
||||
a1 = a1_dist.sample()
|
||||
a2_terms = self._a2_distribution(a1).kl(other._a2_distribution(a1))
|
||||
return a1_terms + a2_terms
|
||||
|
||||
def _a1_distribution(self):
|
||||
BATCH = tf.shape(self.inputs)[0]
|
||||
a1_logits, _ = self.model.action_model(
|
||||
[self.inputs, tf.zeros((BATCH, 1))])
|
||||
a1_dist = Categorical(a1_logits)
|
||||
return a1_dist
|
||||
|
||||
def _a2_distribution(self, a1):
|
||||
a1_vec = tf.expand_dims(tf.cast(a1, tf.float32), 1)
|
||||
_, a2_logits = self.model.action_model([self.inputs, a1_vec])
|
||||
a2_dist = Categorical(a2_logits)
|
||||
return a2_dist
|
||||
|
||||
|
||||
class AutoregressiveActionsModel(TFModelV2):
|
||||
"""Implements the `.action_model` branch required above."""
|
||||
|
||||
def __init__(self, obs_space, action_space, num_outputs, model_config,
|
||||
name):
|
||||
super(AutoregressiveActionsModel, self).__init__(
|
||||
obs_space, action_space, num_outputs, model_config, name)
|
||||
if action_space != Tuple([Discrete(2), Discrete(2)]):
|
||||
raise ValueError(
|
||||
"This model only supports the [2, 2] action space")
|
||||
|
||||
# Inputs
|
||||
obs_input = tf.keras.layers.Input(
|
||||
shape=obs_space.shape, name="obs_input")
|
||||
a1_input = tf.keras.layers.Input(shape=(1, ), name="a1_input")
|
||||
ctx_input = tf.keras.layers.Input(
|
||||
shape=(num_outputs, ), name="ctx_input")
|
||||
|
||||
# Output of the model (normally 'logits', but for an autoregressive
|
||||
# dist this is more like a context/feature layer encoding the obs)
|
||||
context = tf.keras.layers.Dense(
|
||||
num_outputs,
|
||||
name="hidden",
|
||||
activation=tf.nn.tanh,
|
||||
kernel_initializer=normc_initializer(1.0))(obs_input)
|
||||
|
||||
# V(s)
|
||||
value_out = tf.keras.layers.Dense(
|
||||
1,
|
||||
name="value_out",
|
||||
activation=None,
|
||||
kernel_initializer=normc_initializer(0.01))(context)
|
||||
|
||||
# P(a1 | obs)
|
||||
a1_logits = tf.keras.layers.Dense(
|
||||
2,
|
||||
name="a1_logits",
|
||||
activation=None,
|
||||
kernel_initializer=normc_initializer(0.01))(ctx_input)
|
||||
|
||||
# P(a2 | a1)
|
||||
# --note: typically you'd want to implement P(a2 | a1, obs) as follows:
|
||||
# a2_context = tf.keras.layers.Concatenate(axis=1)(
|
||||
# [ctx_input, a1_input])
|
||||
a2_context = a1_input
|
||||
a2_hidden = tf.keras.layers.Dense(
|
||||
16,
|
||||
name="a2_hidden",
|
||||
activation=tf.nn.tanh,
|
||||
kernel_initializer=normc_initializer(1.0))(a2_context)
|
||||
a2_logits = tf.keras.layers.Dense(
|
||||
2,
|
||||
name="a2_logits",
|
||||
activation=None,
|
||||
kernel_initializer=normc_initializer(0.01))(a2_hidden)
|
||||
|
||||
# Base layers
|
||||
self.base_model = tf.keras.Model(obs_input, [context, value_out])
|
||||
self.register_variables(self.base_model.variables)
|
||||
self.base_model.summary()
|
||||
|
||||
# Autoregressive action sampler
|
||||
self.action_model = tf.keras.Model([ctx_input, a1_input],
|
||||
[a1_logits, a2_logits])
|
||||
self.action_model.summary()
|
||||
self.register_variables(self.action_model.variables)
|
||||
|
||||
def forward(self, input_dict, state, seq_lens):
|
||||
context, self._value_out = self.base_model(input_dict["obs"])
|
||||
return context, state
|
||||
|
||||
def value_function(self):
|
||||
return tf.reshape(self._value_out, [-1])
|
||||
|
||||
parser.add_argument("--as-test", action="store_true")
|
||||
parser.add_argument("--stop-iters", type=int, default=200)
|
||||
parser.add_argument("--stop-timesteps", type=int, default=100000)
|
||||
parser.add_argument("--stop-reward", type=float, default=200)
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
ray.init(num_cpus=args.num_cpus or None)
|
||||
ModelCatalog.register_custom_model("autoregressive_model",
|
||||
AutoregressiveActionsModel)
|
||||
ModelCatalog.register_custom_action_dist("binary_autoreg_output",
|
||||
BinaryAutoregressiveOutput)
|
||||
tune.run(
|
||||
args.run,
|
||||
stop={"episode_reward_mean": args.stop},
|
||||
config={
|
||||
"env": CorrelatedActionsEnv,
|
||||
"gamma": 0.5,
|
||||
"num_gpus": 0,
|
||||
"model": {
|
||||
"custom_model": "autoregressive_model",
|
||||
"custom_action_dist": "binary_autoreg_output",
|
||||
},
|
||||
})
|
||||
ModelCatalog.register_custom_model(
|
||||
"autoregressive_model", TorchAutoregressiveActionModel
|
||||
if args.torch else AutoregressiveActionModel)
|
||||
ModelCatalog.register_custom_action_dist(
|
||||
"binary_autoreg_dist", TorchBinaryAutoregressiveDistribution
|
||||
if args.torch else BinaryAutoregressiveDistribution)
|
||||
|
||||
config = {
|
||||
"env": CorrelatedActionsEnv,
|
||||
"gamma": 0.5,
|
||||
"num_gpus": 0,
|
||||
"model": {
|
||||
"custom_model": "autoregressive_model",
|
||||
"custom_action_dist": "binary_autoreg_dist",
|
||||
},
|
||||
"use_pytorch": args.torch,
|
||||
}
|
||||
|
||||
stop = {
|
||||
"training_iteration": args.stop_iters,
|
||||
"timesteps_total": args.stop_timesteps,
|
||||
"episode_reward_mean": args.stop_reward,
|
||||
}
|
||||
|
||||
results = tune.run(args.run, stop=stop, config=config)
|
||||
|
||||
if args.as_test:
|
||||
check_learning_achieved(results, args.stop_reward)
|
||||
ray.shutdown()
|
||||
|
||||
@@ -4,148 +4,28 @@ import argparse
|
||||
|
||||
import ray
|
||||
from ray import tune
|
||||
from ray.rllib.examples.models.batch_norm_model import BatchNormModel, \
|
||||
TorchBatchNormModel
|
||||
from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.models.modelv2 import ModelV2
|
||||
from ray.rllib.models.tf.misc import normc_initializer
|
||||
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
|
||||
from ray.rllib.utils import try_import_tf
|
||||
from ray.rllib.utils.annotations import override
|
||||
from ray.rllib.utils.test_utils import check_learning_achieved
|
||||
|
||||
tf = try_import_tf()
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--num-iters", type=int, default=200)
|
||||
parser.add_argument("--run", type=str, default="PPO")
|
||||
|
||||
|
||||
class BatchNormModel(TFModelV2):
|
||||
"""Example of a TFModelV2 that is built w/o using tf.keras.
|
||||
|
||||
NOTE: This example does not work when using a keras-based TFModelV2 due
|
||||
to a bug in keras related to missing values for input placeholders, even
|
||||
though these input values have been provided in a forward pass through the
|
||||
actual keras Model.
|
||||
|
||||
All Model logic (layers) is defined in the `forward` method (incl.
|
||||
the batch_normalization layers). Also, all variables are registered
|
||||
(only once) at the end of `forward`, so an optimizer knows which tensors
|
||||
to train on. A standard `value_function` override is used.
|
||||
"""
|
||||
capture_index = 0
|
||||
|
||||
def __init__(self, obs_space, action_space, num_outputs, model_config,
|
||||
name):
|
||||
super().__init__(obs_space, action_space, num_outputs, model_config,
|
||||
name)
|
||||
# Have we registered our vars yet (see `forward`)?
|
||||
self._registered = False
|
||||
|
||||
@override(ModelV2)
|
||||
def forward(self, input_dict, state, seq_lens):
|
||||
last_layer = input_dict["obs"]
|
||||
hiddens = [256, 256]
|
||||
with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
|
||||
for i, size in enumerate(hiddens):
|
||||
last_layer = tf.layers.dense(
|
||||
last_layer,
|
||||
size,
|
||||
kernel_initializer=normc_initializer(1.0),
|
||||
activation=tf.nn.tanh,
|
||||
name="fc{}".format(i))
|
||||
# Add a batch norm layer
|
||||
last_layer = tf.layers.batch_normalization(
|
||||
last_layer,
|
||||
training=input_dict["is_training"],
|
||||
name="bn_{}".format(i))
|
||||
|
||||
output = tf.layers.dense(
|
||||
last_layer,
|
||||
self.num_outputs,
|
||||
kernel_initializer=normc_initializer(0.01),
|
||||
activation=None,
|
||||
name="out")
|
||||
self._value_out = tf.layers.dense(
|
||||
last_layer,
|
||||
1,
|
||||
kernel_initializer=normc_initializer(1.0),
|
||||
activation=None,
|
||||
name="vf")
|
||||
if not self._registered:
|
||||
self.register_variables(
|
||||
tf.get_collection(
|
||||
tf.GraphKeys.TRAINABLE_VARIABLES, scope=".+/model/.+"))
|
||||
self._registered = True
|
||||
|
||||
return output, []
|
||||
|
||||
@override(ModelV2)
|
||||
def value_function(self):
|
||||
return tf.reshape(self._value_out, [-1])
|
||||
|
||||
|
||||
class KerasBatchNormModel(TFModelV2):
|
||||
"""Keras version of above BatchNormModel with exactly the same structure.
|
||||
|
||||
IMORTANT NOTE: This model will not work with PPO due to a bug in keras
|
||||
that surfaces when having more than one input placeholder (here: `inputs`
|
||||
and `is_training`) AND using the `make_tf_callable` helper (e.g. used by
|
||||
PPO), in which auto-placeholders are generated, then passed through the
|
||||
tf.keras. models.Model. In this last step, the connection between 1) the
|
||||
provided value in the auto-placeholder and 2) the keras `is_training`
|
||||
Input is broken and keras complains.
|
||||
Use the above `BatchNormModel` (a non-keras based TFModelV2), instead.
|
||||
"""
|
||||
|
||||
def __init__(self, obs_space, action_space, num_outputs, model_config,
|
||||
name):
|
||||
super().__init__(obs_space, action_space, num_outputs, model_config,
|
||||
name)
|
||||
inputs = tf.keras.layers.Input(shape=obs_space.shape, name="inputs")
|
||||
is_training = tf.keras.layers.Input(
|
||||
shape=(), dtype=tf.bool, batch_size=1, name="is_training")
|
||||
last_layer = inputs
|
||||
hiddens = [256, 256]
|
||||
for i, size in enumerate(hiddens):
|
||||
label = "fc{}".format(i)
|
||||
last_layer = tf.keras.layers.Dense(
|
||||
units=size,
|
||||
kernel_initializer=normc_initializer(1.0),
|
||||
activation=tf.nn.tanh,
|
||||
name=label)(last_layer)
|
||||
# Add a batch norm layer
|
||||
last_layer = tf.keras.layers.BatchNormalization()(
|
||||
last_layer, training=is_training[0])
|
||||
output = tf.keras.layers.Dense(
|
||||
units=self.num_outputs,
|
||||
kernel_initializer=normc_initializer(0.01),
|
||||
activation=None,
|
||||
name="fc_out")(last_layer)
|
||||
value_out = tf.keras.layers.Dense(
|
||||
units=1,
|
||||
kernel_initializer=normc_initializer(0.01),
|
||||
activation=None,
|
||||
name="value_out")(last_layer)
|
||||
|
||||
self.base_model = tf.keras.models.Model(
|
||||
inputs=[inputs, is_training], outputs=[output, value_out])
|
||||
self.register_variables(self.base_model.variables)
|
||||
|
||||
@override(ModelV2)
|
||||
def forward(self, input_dict, state, seq_lens):
|
||||
out, self._value_out = self.base_model(
|
||||
[input_dict["obs"], input_dict["is_training"]])
|
||||
return out, []
|
||||
|
||||
@override(ModelV2)
|
||||
def value_function(self):
|
||||
return tf.reshape(self._value_out, [-1])
|
||||
|
||||
parser.add_argument("--as-test", action="store_true")
|
||||
parser.add_argument("--torch", action="store_true")
|
||||
parser.add_argument("--stop-iters", type=int, default=200)
|
||||
parser.add_argument("--stop-timesteps", type=int, default=100000)
|
||||
parser.add_argument("--stop-reward", type=float, default=150)
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
ray.init()
|
||||
ray.init(local_mode=True)
|
||||
|
||||
ModelCatalog.register_custom_model("bn_model", BatchNormModel)
|
||||
ModelCatalog.register_custom_model(
|
||||
"bn_model", TorchBatchNormModel if args.torch else BatchNormModel)
|
||||
|
||||
config = {
|
||||
"env": "Pendulum-v0" if args.run == "DDPG" else "CartPole-v0",
|
||||
@@ -153,10 +33,18 @@ if __name__ == "__main__":
|
||||
"custom_model": "bn_model",
|
||||
},
|
||||
"num_workers": 0,
|
||||
"use_pytorch": args.torch,
|
||||
}
|
||||
|
||||
tune.run(
|
||||
args.run,
|
||||
stop={"training_iteration": args.num_iters},
|
||||
config=config,
|
||||
)
|
||||
stop = {
|
||||
"training_iteration": args.stop_iters,
|
||||
"timesteps_total": args.stop_timesteps,
|
||||
"episode_reward_mean": args.stop_reward,
|
||||
}
|
||||
|
||||
results = tune.run(args.run, stop=stop, config=config)
|
||||
|
||||
if args.as_test:
|
||||
check_learning_achieved(results, args.stop_reward)
|
||||
|
||||
ray.shutdown()
|
||||
|
||||
@@ -1,13 +1,17 @@
|
||||
import argparse
|
||||
|
||||
from ray.rllib.examples.env.stateless_cartpole import StatelessCartPole
|
||||
from ray.rllib.utils.test_utils import check_learning_achieved
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--stop", type=int, default=200)
|
||||
parser.add_argument("--torch", action="store_true")
|
||||
parser.add_argument("--use-prev-action-reward", action="store_true")
|
||||
parser.add_argument("--run", type=str, default="PPO")
|
||||
parser.add_argument("--num-cpus", type=int, default=0)
|
||||
parser.add_argument("--torch", action="store_true")
|
||||
parser.add_argument("--as-test", action="store_true")
|
||||
parser.add_argument("--use-prev-action-reward", action="store_true")
|
||||
parser.add_argument("--stop-iters", type=int, default=200)
|
||||
parser.add_argument("--stop-timesteps", type=int, default=100000)
|
||||
parser.add_argument("--stop-reward", type=float, default=150.0)
|
||||
|
||||
if __name__ == "__main__":
|
||||
import ray
|
||||
@@ -30,16 +34,24 @@ if __name__ == "__main__":
|
||||
},
|
||||
}
|
||||
|
||||
tune.run(
|
||||
args.run,
|
||||
stop={"episode_reward_mean": args.stop},
|
||||
config=dict(
|
||||
configs[args.run], **{
|
||||
"env": StatelessCartPole,
|
||||
"model": {
|
||||
"use_lstm": True,
|
||||
"lstm_use_prev_action_reward": args.use_prev_action_reward,
|
||||
},
|
||||
"use_pytorch": args.torch,
|
||||
}),
|
||||
)
|
||||
config = dict(
|
||||
configs[args.run], **{
|
||||
"env": StatelessCartPole,
|
||||
"model": {
|
||||
"use_lstm": True,
|
||||
"lstm_use_prev_action_reward": args.use_prev_action_reward,
|
||||
},
|
||||
"use_pytorch": args.torch,
|
||||
})
|
||||
|
||||
stop = {
|
||||
"training_iteration": args.stop_iters,
|
||||
"timesteps_total": args.stop_timesteps,
|
||||
"episode_reward_mean": args.stop_reward,
|
||||
}
|
||||
|
||||
results = tune.run(args.run, config=config, stop=stop)
|
||||
|
||||
if args.as_test:
|
||||
check_learning_achieved(results, args.stop_reward)
|
||||
ray.shutdown()
|
||||
|
||||
@@ -16,77 +16,53 @@ import argparse
|
||||
import numpy as np
|
||||
from gym.spaces import Discrete
|
||||
|
||||
import ray
|
||||
from ray import tune
|
||||
from ray.rllib.agents.ppo.ppo import PPOTrainer
|
||||
from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy, KLCoeffMixin, \
|
||||
PPOLoss
|
||||
PPOLoss as TFLoss
|
||||
from ray.rllib.agents.ppo.ppo_torch_policy import PPOTorchPolicy, \
|
||||
KLCoeffMixin as TorchKLCoeffMixin, PPOLoss as TorchLoss
|
||||
from ray.rllib.evaluation.postprocessing import compute_advantages, \
|
||||
Postprocessing
|
||||
from ray.rllib.examples.twostep_game import TwoStepGame
|
||||
from ray.rllib.examples.env.two_step_game import TwoStepGame
|
||||
from ray.rllib.examples.models.centralized_critic_models import \
|
||||
CentralizedCriticModel, TorchCentralizedCriticModel
|
||||
from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.policy.sample_batch import SampleBatch
|
||||
from ray.rllib.policy.tf_policy import LearningRateSchedule, \
|
||||
EntropyCoeffSchedule
|
||||
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
|
||||
from ray.rllib.models.tf.fcnet_v2 import FullyConnectedNetwork
|
||||
from ray.rllib.policy.torch_policy import LearningRateSchedule as TorchLR, \
|
||||
EntropyCoeffSchedule as TorchEntropyCoeffSchedule
|
||||
from ray.rllib.utils.explained_variance import explained_variance
|
||||
from ray.rllib.utils.framework import try_import_tf, try_import_torch
|
||||
from ray.rllib.utils.test_utils import check_learning_achieved
|
||||
from ray.rllib.utils.tf_ops import make_tf_callable
|
||||
from ray.rllib.utils import try_import_tf
|
||||
from ray.rllib.utils.torch_ops import convert_to_torch_tensor
|
||||
|
||||
tf = try_import_tf()
|
||||
torch, nn = try_import_torch()
|
||||
|
||||
OPPONENT_OBS = "opponent_obs"
|
||||
OPPONENT_ACTION = "opponent_action"
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--stop", type=int, default=100000)
|
||||
|
||||
|
||||
class CentralizedCriticModel(TFModelV2):
|
||||
"""Multi-agent model that implements a centralized VF."""
|
||||
|
||||
def __init__(self, obs_space, action_space, num_outputs, model_config,
|
||||
name):
|
||||
super(CentralizedCriticModel, self).__init__(
|
||||
obs_space, action_space, num_outputs, model_config, name)
|
||||
# Base of the model
|
||||
self.model = FullyConnectedNetwork(obs_space, action_space,
|
||||
num_outputs, model_config, name)
|
||||
self.register_variables(self.model.variables())
|
||||
|
||||
# Central VF maps (obs, opp_obs, opp_act) -> vf_pred
|
||||
obs = tf.keras.layers.Input(shape=(6, ), name="obs")
|
||||
opp_obs = tf.keras.layers.Input(shape=(6, ), name="opp_obs")
|
||||
opp_act = tf.keras.layers.Input(shape=(2, ), name="opp_act")
|
||||
concat_obs = tf.keras.layers.Concatenate(axis=1)(
|
||||
[obs, opp_obs, opp_act])
|
||||
central_vf_dense = tf.keras.layers.Dense(
|
||||
16, activation=tf.nn.tanh, name="c_vf_dense")(concat_obs)
|
||||
central_vf_out = tf.keras.layers.Dense(
|
||||
1, activation=None, name="c_vf_out")(central_vf_dense)
|
||||
self.central_vf = tf.keras.Model(
|
||||
inputs=[obs, opp_obs, opp_act], outputs=central_vf_out)
|
||||
self.register_variables(self.central_vf.variables)
|
||||
|
||||
def forward(self, input_dict, state, seq_lens):
|
||||
return self.model.forward(input_dict, state, seq_lens)
|
||||
|
||||
def central_value_function(self, obs, opponent_obs, opponent_actions):
|
||||
return tf.reshape(
|
||||
self.central_vf(
|
||||
[obs, opponent_obs,
|
||||
tf.one_hot(opponent_actions, 2)]), [-1])
|
||||
|
||||
def value_function(self):
|
||||
return self.model.value_function() # not used
|
||||
parser.add_argument("--torch", action="store_true")
|
||||
parser.add_argument("--as-test", action="store_true")
|
||||
parser.add_argument("--stop-iters", type=int, default=100)
|
||||
parser.add_argument("--stop-timesteps", type=int, default=100000)
|
||||
parser.add_argument("--stop-reward", type=float, default=7.99)
|
||||
|
||||
|
||||
class CentralizedValueMixin:
|
||||
"""Add method to evaluate the central value function from the model."""
|
||||
|
||||
def __init__(self):
|
||||
self.compute_central_vf = make_tf_callable(self.get_session())(
|
||||
self.model.central_value_function)
|
||||
if not self.config["use_pytorch"]:
|
||||
self.compute_central_vf = make_tf_callable(self.get_session())(
|
||||
self.model.central_value_function)
|
||||
else:
|
||||
self.compute_central_vf = self.model.central_value_function
|
||||
|
||||
|
||||
# Grabs the opponent obs/act and includes it in the experience train_batch,
|
||||
@@ -95,7 +71,9 @@ def centralized_critic_postprocessing(policy,
|
||||
sample_batch,
|
||||
other_agent_batches=None,
|
||||
episode=None):
|
||||
if policy.loss_initialized():
|
||||
pytorch = policy.config["use_pytorch"]
|
||||
if (pytorch and hasattr(policy, "compute_central_vf")) or \
|
||||
(not pytorch and policy.loss_initialized()):
|
||||
assert other_agent_batches is not None
|
||||
[(_, opponent_batch)] = list(other_agent_batches.values())
|
||||
|
||||
@@ -104,11 +82,18 @@ def centralized_critic_postprocessing(policy,
|
||||
sample_batch[OPPONENT_ACTION] = opponent_batch[SampleBatch.ACTIONS]
|
||||
|
||||
# overwrite default VF prediction with the central VF
|
||||
sample_batch[SampleBatch.VF_PREDS] = policy.compute_central_vf(
|
||||
sample_batch[SampleBatch.CUR_OBS], sample_batch[OPPONENT_OBS],
|
||||
sample_batch[OPPONENT_ACTION])
|
||||
if args.torch:
|
||||
sample_batch[SampleBatch.VF_PREDS] = policy.compute_central_vf(
|
||||
convert_to_torch_tensor(sample_batch[SampleBatch.CUR_OBS]),
|
||||
convert_to_torch_tensor(sample_batch[OPPONENT_OBS]),
|
||||
convert_to_torch_tensor(sample_batch[OPPONENT_ACTION])). \
|
||||
detach().numpy()
|
||||
else:
|
||||
sample_batch[SampleBatch.VF_PREDS] = policy.compute_central_vf(
|
||||
sample_batch[SampleBatch.CUR_OBS], sample_batch[OPPONENT_OBS],
|
||||
sample_batch[OPPONENT_ACTION])
|
||||
else:
|
||||
# policy hasn't initialized yet, use zeros
|
||||
# Policy hasn't been initialized yet, use zeros.
|
||||
sample_batch[OPPONENT_OBS] = np.zeros_like(
|
||||
sample_batch[SampleBatch.CUR_OBS])
|
||||
sample_batch[OPPONENT_ACTION] = np.zeros_like(
|
||||
@@ -141,7 +126,13 @@ def loss_with_central_critic(policy, model, dist_class, train_batch):
|
||||
train_batch[SampleBatch.CUR_OBS], train_batch[OPPONENT_OBS],
|
||||
train_batch[OPPONENT_ACTION])
|
||||
|
||||
policy.loss_obj = PPOLoss(
|
||||
func = TFLoss if not policy.config["use_pytorch"] else TorchLoss
|
||||
adv = tf.ones_like(train_batch[Postprocessing.ADVANTAGES], dtype=tf.bool) \
|
||||
if not policy.config["use_pytorch"] else \
|
||||
torch.ones_like(train_batch[Postprocessing.ADVANTAGES],
|
||||
dtype=torch.bool)
|
||||
|
||||
policy.loss_obj = func(
|
||||
dist_class,
|
||||
model,
|
||||
train_batch[Postprocessing.VALUE_TARGETS],
|
||||
@@ -153,7 +144,7 @@ def loss_with_central_critic(policy, model, dist_class, train_batch):
|
||||
action_dist,
|
||||
policy.central_value_out,
|
||||
policy.kl_coeff,
|
||||
tf.ones_like(train_batch[Postprocessing.ADVANTAGES], dtype=tf.bool),
|
||||
adv,
|
||||
entropy_coeff=policy.entropy_coeff,
|
||||
clip_param=policy.config["clip_param"],
|
||||
vf_clip_param=policy.config["vf_clip_param"],
|
||||
@@ -180,8 +171,8 @@ def central_vf_stats(policy, train_batch, grads):
|
||||
}
|
||||
|
||||
|
||||
CCPPO = PPOTFPolicy.with_updates(
|
||||
name="CCPPO",
|
||||
CCPPOTFPolicy = PPOTFPolicy.with_updates(
|
||||
name="CCPPOTFPolicy",
|
||||
postprocess_fn=centralized_critic_postprocessing,
|
||||
loss_fn=loss_with_central_critic,
|
||||
before_loss_init=setup_mixins,
|
||||
@@ -191,31 +182,64 @@ CCPPO = PPOTFPolicy.with_updates(
|
||||
CentralizedValueMixin
|
||||
])
|
||||
|
||||
CCPPOTorchPolicy = PPOTorchPolicy.with_updates(
|
||||
name="CCPPOTorchPolicy",
|
||||
postprocess_fn=centralized_critic_postprocessing,
|
||||
loss_fn=loss_with_central_critic,
|
||||
before_init=setup_mixins,
|
||||
mixins=[
|
||||
TorchLR, TorchEntropyCoeffSchedule, TorchKLCoeffMixin,
|
||||
CentralizedValueMixin
|
||||
])
|
||||
|
||||
|
||||
def get_policy_class(config):
|
||||
return CCPPOTorchPolicy if config["use_pytorch"] else CCPPOTFPolicy
|
||||
|
||||
|
||||
CCTrainer = PPOTrainer.with_updates(
|
||||
name="CCPPOTrainer", default_policy=CCPPO, get_policy_class=None)
|
||||
name="CCPPOTrainer",
|
||||
default_policy=CCPPOTFPolicy,
|
||||
get_policy_class=get_policy_class,
|
||||
)
|
||||
|
||||
if __name__ == "__main__":
|
||||
ray.init(local_mode=True)
|
||||
args = parser.parse_args()
|
||||
ModelCatalog.register_custom_model("cc_model", CentralizedCriticModel)
|
||||
tune.run(
|
||||
CCTrainer,
|
||||
stop={
|
||||
"timesteps_total": args.stop,
|
||||
"episode_reward_mean": 7.99,
|
||||
|
||||
ModelCatalog.register_custom_model(
|
||||
"cc_model", TorchCentralizedCriticModel
|
||||
if args.torch else CentralizedCriticModel)
|
||||
|
||||
config = {
|
||||
"env": TwoStepGame,
|
||||
"batch_mode": "complete_episodes",
|
||||
"eager": False,
|
||||
"num_workers": 0,
|
||||
"multiagent": {
|
||||
"policies": {
|
||||
"pol1": (None, Discrete(6), TwoStepGame.action_space, {
|
||||
"use_pytorch": args.torch
|
||||
}),
|
||||
"pol2": (None, Discrete(6), TwoStepGame.action_space, {
|
||||
"use_pytorch": args.torch
|
||||
}),
|
||||
},
|
||||
"policy_mapping_fn": lambda x: "pol1" if x == 0 else "pol2",
|
||||
},
|
||||
config={
|
||||
"env": TwoStepGame,
|
||||
"batch_mode": "complete_episodes",
|
||||
"eager": False,
|
||||
"num_workers": 0,
|
||||
"multiagent": {
|
||||
"policies": {
|
||||
"pol1": (None, Discrete(6), TwoStepGame.action_space, {}),
|
||||
"pol2": (None, Discrete(6), TwoStepGame.action_space, {}),
|
||||
},
|
||||
"policy_mapping_fn": lambda x: "pol1" if x == 0 else "pol2",
|
||||
},
|
||||
"model": {
|
||||
"custom_model": "cc_model",
|
||||
},
|
||||
})
|
||||
"model": {
|
||||
"custom_model": "cc_model",
|
||||
},
|
||||
"use_pytorch": args.torch,
|
||||
}
|
||||
|
||||
stop = {
|
||||
"training_iteration": args.stop_iters,
|
||||
"timesteps_total": args.stop_timesteps,
|
||||
"episode_reward_mean": args.stop_reward,
|
||||
}
|
||||
|
||||
results = tune.run(CCTrainer, config=config, stop=stop)
|
||||
|
||||
if args.as_test:
|
||||
check_learning_achieved(results, args.stop_reward)
|
||||
|
||||
@@ -10,64 +10,24 @@ modifies the policy to add a centralized value function.
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
from gym.spaces import Box, Dict, Discrete
|
||||
from gym.spaces import Dict, Discrete
|
||||
import argparse
|
||||
|
||||
from ray import tune
|
||||
from ray.rllib.agents.callbacks import DefaultCallbacks
|
||||
from ray.rllib.examples.twostep_game import TwoStepGame
|
||||
from ray.rllib.examples.models.centralized_critic_models import \
|
||||
YetAnotherCentralizedCriticModel, YetAnotherTorchCentralizedCriticModel
|
||||
from ray.rllib.examples.env.two_step_game import TwoStepGame
|
||||
from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
|
||||
from ray.rllib.models.tf.fcnet_v2 import FullyConnectedNetwork
|
||||
from ray.rllib.policy.sample_batch import SampleBatch
|
||||
from ray.rllib.utils import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
from ray.rllib.utils.test_utils import check_learning_achieved
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--stop", type=int, default=100000)
|
||||
|
||||
|
||||
class CentralizedCriticModel(TFModelV2):
|
||||
"""Multi-agent model that implements a centralized VF.
|
||||
|
||||
It assumes the observation is a dict with 'own_obs' and 'opponent_obs', the
|
||||
former of which can be used for computing actions (i.e., decentralized
|
||||
execution), and the latter for optimization (i.e., centralized learning).
|
||||
|
||||
This model has two parts:
|
||||
- An action model that looks at just 'own_obs' to compute actions
|
||||
- A value model that also looks at the 'opponent_obs' / 'opponent_action'
|
||||
to compute the value (it does this by using the 'obs_flat' tensor).
|
||||
"""
|
||||
|
||||
def __init__(self, obs_space, action_space, num_outputs, model_config,
|
||||
name):
|
||||
super(CentralizedCriticModel, self).__init__(
|
||||
obs_space, action_space, num_outputs, model_config, name)
|
||||
|
||||
self.action_model = FullyConnectedNetwork(
|
||||
Box(low=0, high=1, shape=(6, )), # one-hot encoded Discrete(6)
|
||||
action_space,
|
||||
num_outputs,
|
||||
model_config,
|
||||
name + "_action")
|
||||
self.register_variables(self.action_model.variables())
|
||||
|
||||
self.value_model = FullyConnectedNetwork(obs_space, action_space, 1,
|
||||
model_config, name + "_vf")
|
||||
self.register_variables(self.value_model.variables())
|
||||
|
||||
def forward(self, input_dict, state, seq_lens):
|
||||
self._value_out, _ = self.value_model({
|
||||
"obs": input_dict["obs_flat"]
|
||||
}, state, seq_lens)
|
||||
return self.action_model({
|
||||
"obs": input_dict["obs"]["own_obs"]
|
||||
}, state, seq_lens)
|
||||
|
||||
def value_function(self):
|
||||
return tf.reshape(self._value_out, [-1])
|
||||
parser.add_argument("--torch", action="store_true")
|
||||
parser.add_argument("--as-test", action="store_true")
|
||||
parser.add_argument("--stop-iters", type=int, default=100)
|
||||
parser.add_argument("--stop-timesteps", type=int, default=100000)
|
||||
parser.add_argument("--stop-reward", type=float, default=7.99)
|
||||
|
||||
|
||||
class FillInActions(DefaultCallbacks):
|
||||
@@ -109,7 +69,11 @@ def central_critic_observer(agent_obs, **kw):
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
ModelCatalog.register_custom_model("cc_model", CentralizedCriticModel)
|
||||
|
||||
ModelCatalog.register_custom_model(
|
||||
"cc_model", YetAnotherTorchCentralizedCriticModel
|
||||
if args.torch else YetAnotherCentralizedCriticModel)
|
||||
|
||||
action_space = Discrete(2)
|
||||
observer_space = Dict({
|
||||
"own_obs": Discrete(6),
|
||||
@@ -118,26 +82,33 @@ if __name__ == "__main__":
|
||||
"opponent_obs": Discrete(6),
|
||||
"opponent_action": Discrete(2),
|
||||
})
|
||||
tune.run(
|
||||
"PPO",
|
||||
stop={
|
||||
"timesteps_total": args.stop,
|
||||
"episode_reward_mean": 7.99,
|
||||
|
||||
config = {
|
||||
"env": TwoStepGame,
|
||||
"batch_mode": "complete_episodes",
|
||||
"callbacks": FillInActions,
|
||||
"num_workers": 0,
|
||||
"multiagent": {
|
||||
"policies": {
|
||||
"pol1": (None, observer_space, action_space, {}),
|
||||
"pol2": (None, observer_space, action_space, {}),
|
||||
},
|
||||
"policy_mapping_fn": lambda x: "pol1" if x == 0 else "pol2",
|
||||
"observation_fn": central_critic_observer,
|
||||
},
|
||||
config={
|
||||
"env": TwoStepGame,
|
||||
"batch_mode": "complete_episodes",
|
||||
"callbacks": FillInActions,
|
||||
"num_workers": 0,
|
||||
"multiagent": {
|
||||
"policies": {
|
||||
"pol1": (None, observer_space, action_space, {}),
|
||||
"pol2": (None, observer_space, action_space, {}),
|
||||
},
|
||||
"policy_mapping_fn": lambda x: "pol1" if x == 0 else "pol2",
|
||||
"observation_fn": central_critic_observer,
|
||||
},
|
||||
"model": {
|
||||
"custom_model": "cc_model",
|
||||
},
|
||||
})
|
||||
"model": {
|
||||
"custom_model": "cc_model",
|
||||
},
|
||||
"use_pytorch": args.torch,
|
||||
}
|
||||
|
||||
stop = {
|
||||
"training_iteration": args.stop_iters,
|
||||
"timesteps_total": args.stop_timesteps,
|
||||
"episode_reward_mean": args.stop_reward,
|
||||
}
|
||||
|
||||
results = tune.run("PPO", config=config, stop=stop)
|
||||
|
||||
if args.as_test:
|
||||
check_learning_achieved(results, args.stop_reward)
|
||||
|
||||
@@ -7,20 +7,32 @@ This example shows:
|
||||
|
||||
You can visualize experiment results in ~/ray_results using TensorBoard.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import numpy as np
|
||||
import gym
|
||||
from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
|
||||
from ray.rllib.models.tf.fcnet_v2 import FullyConnectedNetwork
|
||||
from gym.spaces import Discrete, Box
|
||||
|
||||
import ray
|
||||
from ray import tune
|
||||
from ray.rllib.utils import try_import_tf
|
||||
from ray.tune import grid_search
|
||||
from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
|
||||
from ray.rllib.models.tf.fcnet_v2 import FullyConnectedNetwork
|
||||
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
|
||||
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC
|
||||
from ray.rllib.utils.framework import try_import_tf, try_import_torch
|
||||
from ray.rllib.utils.test_utils import check_learning_achieved
|
||||
|
||||
tf = try_import_tf()
|
||||
torch, nn = try_import_torch()
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--run", type=str, default="PPO")
|
||||
parser.add_argument("--torch", action="store_true")
|
||||
parser.add_argument("--as-test", action="store_true")
|
||||
parser.add_argument("--stop-iters", type=int, default=50)
|
||||
parser.add_argument("--stop-timesteps", type=int, default=100000)
|
||||
parser.add_argument("--stop-reward", type=float, default=0.1)
|
||||
|
||||
|
||||
class SimpleCorridor(gym.Env):
|
||||
@@ -46,11 +58,11 @@ class SimpleCorridor(gym.Env):
|
||||
elif action == 1:
|
||||
self.cur_pos += 1
|
||||
done = self.cur_pos >= self.end_pos
|
||||
return [self.cur_pos], 1 if done else 0, done, {}
|
||||
return [self.cur_pos], 1.0 if done else -0.1, done, {}
|
||||
|
||||
|
||||
class CustomModel(TFModelV2):
|
||||
"""Example of a custom model that just delegates to a fc-net."""
|
||||
"""Example of a keras custom model that just delegates to an fc-net."""
|
||||
|
||||
def __init__(self, obs_space, action_space, num_outputs, model_config,
|
||||
name):
|
||||
@@ -67,26 +79,58 @@ class CustomModel(TFModelV2):
|
||||
return self.model.value_function()
|
||||
|
||||
|
||||
class TorchCustomModel(TorchModelV2, nn.Module):
|
||||
"""Example of a PyTorch custom model that just delegates to a fc-net."""
|
||||
|
||||
def __init__(self, obs_space, action_space, num_outputs, model_config,
|
||||
name):
|
||||
TorchModelV2.__init__(self, obs_space, action_space, num_outputs,
|
||||
model_config, name)
|
||||
nn.Module.__init__(self)
|
||||
|
||||
self.torch_sub_model = TorchFC(obs_space, action_space, num_outputs,
|
||||
model_config, name)
|
||||
|
||||
def forward(self, input_dict, state, seq_lens):
|
||||
input_dict["obs"] = input_dict["obs"].float()
|
||||
fc_out, _ = self.torch_sub_model(input_dict, state, seq_lens)
|
||||
return fc_out, []
|
||||
|
||||
def value_function(self):
|
||||
return torch.reshape(self.torch_sub_model.value_function(), [-1])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
ray.init()
|
||||
|
||||
# Can also register the env creator function explicitly with:
|
||||
# register_env("corridor", lambda config: SimpleCorridor(config))
|
||||
ray.init()
|
||||
ModelCatalog.register_custom_model("my_model", CustomModel)
|
||||
tune.run(
|
||||
"PPO",
|
||||
stop={
|
||||
"timesteps_total": 10000,
|
||||
ModelCatalog.register_custom_model(
|
||||
"my_model", TorchCustomModel if args.torch else CustomModel)
|
||||
|
||||
config = {
|
||||
"env": SimpleCorridor, # or "corridor" if registered above
|
||||
"env_config": {
|
||||
"corridor_length": 5,
|
||||
},
|
||||
config={
|
||||
"env": SimpleCorridor, # or "corridor" if registered above
|
||||
"model": {
|
||||
"custom_model": "my_model",
|
||||
},
|
||||
"vf_share_layers": True,
|
||||
"lr": grid_search([1e-2, 1e-4, 1e-6]), # try different lrs
|
||||
"num_workers": 1, # parallelism
|
||||
"env_config": {
|
||||
"corridor_length": 5,
|
||||
},
|
||||
"model": {
|
||||
"custom_model": "my_model",
|
||||
},
|
||||
)
|
||||
"vf_share_layers": True,
|
||||
"lr": grid_search([1e-2, 1e-4, 1e-6]), # try different lrs
|
||||
"num_workers": 1, # parallelism
|
||||
"use_pytorch": args.torch
|
||||
}
|
||||
|
||||
stop = {
|
||||
"training_iteration": args.stop_iters,
|
||||
"timesteps_total": args.stop_timesteps,
|
||||
"episode_reward_mean": args.stop_reward,
|
||||
}
|
||||
|
||||
results = tune.run(args.run, config=config, stop=stop)
|
||||
|
||||
if args.as_test:
|
||||
check_learning_achieved(results, args.stop_reward)
|
||||
ray.shutdown()
|
||||
|
||||
@@ -74,9 +74,9 @@ from ray.rllib.evaluation.metrics import collect_episodes, summarize_episodes
|
||||
from ray.rllib.examples.env.simple_corridor import SimpleCorridor
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--custom-eval", action="store_true")
|
||||
parser.add_argument("--num-cpus", type=int, default=0)
|
||||
parser.add_argument("--torch", action="store_true")
|
||||
parser.add_argument("--no-custom-eval", action="store_true")
|
||||
|
||||
|
||||
def custom_eval_function(trainer, eval_workers):
|
||||
@@ -124,48 +124,51 @@ def custom_eval_function(trainer, eval_workers):
|
||||
if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.custom_eval:
|
||||
eval_fn = custom_eval_function
|
||||
else:
|
||||
if args.no_custom_eval:
|
||||
eval_fn = None
|
||||
else:
|
||||
eval_fn = custom_eval_function
|
||||
|
||||
ray.init(num_cpus=args.num_cpus or None)
|
||||
|
||||
tune.run(
|
||||
"PG",
|
||||
stop={
|
||||
"training_iteration": 10,
|
||||
config = {
|
||||
"env": SimpleCorridor,
|
||||
"env_config": {
|
||||
"corridor_length": 10,
|
||||
},
|
||||
config={
|
||||
"env": SimpleCorridor,
|
||||
"horizon": 20,
|
||||
"log_level": "INFO",
|
||||
|
||||
# Training rollouts will be collected using just the learner
|
||||
# process, but evaluation will be done in parallel with two
|
||||
# workers. Hence, this run will use 3 CPUs total (1 for the
|
||||
# learner + 2 more for evaluation workers).
|
||||
"num_workers": 0,
|
||||
"evaluation_num_workers": 2,
|
||||
|
||||
# Optional custom eval function.
|
||||
"custom_eval_function": eval_fn,
|
||||
|
||||
# Enable evaluation, once per training iteration.
|
||||
"evaluation_interval": 1,
|
||||
|
||||
# Run 10 episodes each time evaluation runs.
|
||||
"evaluation_num_episodes": 10,
|
||||
|
||||
# Override the env config for evaluation.
|
||||
"evaluation_config": {
|
||||
"env_config": {
|
||||
"corridor_length": 10,
|
||||
# Evaluate using LONGER corridor than trained on.
|
||||
"corridor_length": 5,
|
||||
},
|
||||
"horizon": 20,
|
||||
"log_level": "INFO",
|
||||
},
|
||||
"use_pytorch": args.torch,
|
||||
}
|
||||
|
||||
# Training rollouts will be collected using just the learner
|
||||
# process, but evaluation will be done in parallel with two
|
||||
# workers. Hence, this run will use 3 CPUs total (1 for the
|
||||
# learner + 2 more for evaluation workers).
|
||||
"num_workers": 0,
|
||||
"evaluation_num_workers": 2,
|
||||
stop = {
|
||||
"training_iteration": 10,
|
||||
}
|
||||
|
||||
# Optional custom eval function.
|
||||
"custom_eval_function": eval_fn,
|
||||
tune.run("PG", config=config, stop=stop)
|
||||
|
||||
# Enable evaluation, once per training iteration.
|
||||
"evaluation_interval": 1,
|
||||
|
||||
# Run 10 episodes each time evaluation runs.
|
||||
"evaluation_num_episodes": 10,
|
||||
|
||||
# Override the env config for evaluation.
|
||||
"evaluation_config": {
|
||||
"env_config": {
|
||||
# Evaluate using LONGER corridor than trained on.
|
||||
"corridor_length": 5,
|
||||
},
|
||||
},
|
||||
"use_pytorch": args.torch,
|
||||
})
|
||||
ray.shutdown()
|
||||
|
||||
@@ -4,48 +4,52 @@ Both the model and env are trivial (and super-fast), so they are useful
|
||||
for running perf microbenchmarks.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
|
||||
import ray
|
||||
import ray.tune as tune
|
||||
from ray.tune import sample_from
|
||||
from ray.rllib.examples.env.fast_image_env import FastImageEnv
|
||||
from ray.rllib.models import Model, ModelCatalog
|
||||
from ray.tune import run_experiments, sample_from
|
||||
from ray.rllib.utils import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
|
||||
|
||||
class FastModel(Model):
|
||||
def _build_layers_v2(self, input_dict, num_outputs, options):
|
||||
bias = tf.get_variable(
|
||||
dtype=tf.float32,
|
||||
name="bias",
|
||||
initializer=tf.zeros_initializer,
|
||||
shape=())
|
||||
output = bias + tf.zeros([tf.shape(input_dict["obs"])[0], num_outputs])
|
||||
return output, output
|
||||
from ray.rllib.examples.models.fast_model import FastModel, TorchFastModel
|
||||
from ray.rllib.models import ModelCatalog
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--num-cpus", type=int, default=2)
|
||||
parser.add_argument("--torch", action="store_true")
|
||||
parser.add_argument("--stop-iters", type=int, default=200)
|
||||
parser.add_argument("--stop-timesteps", type=int, default=100000)
|
||||
|
||||
if __name__ == "__main__":
|
||||
ray.init()
|
||||
ModelCatalog.register_custom_model("fast_model", FastModel)
|
||||
run_experiments({
|
||||
"demo": {
|
||||
"run": "IMPALA",
|
||||
"env": FastImageEnv,
|
||||
"config": {
|
||||
"compress_observations": True,
|
||||
"model": {
|
||||
"custom_model": "fast_model"
|
||||
},
|
||||
"num_gpus": 0,
|
||||
"num_workers": 2,
|
||||
"num_envs_per_worker": 10,
|
||||
"num_data_loader_buffers": 1,
|
||||
"num_aggregation_workers": 1,
|
||||
"broadcast_interval": 50,
|
||||
"rollout_fragment_length": 100,
|
||||
"train_batch_size": sample_from(
|
||||
lambda spec: 1000 * max(1, spec.config.num_gpus)),
|
||||
"fake_sampler": True,
|
||||
},
|
||||
args = parser.parse_args()
|
||||
ray.init(num_cpus=args.num_cpus or None)
|
||||
|
||||
ModelCatalog.register_custom_model(
|
||||
"fast_model", TorchFastModel if args.torch else FastModel)
|
||||
|
||||
config = {
|
||||
"env": FastImageEnv,
|
||||
"compress_observations": True,
|
||||
"model": {
|
||||
"custom_model": "fast_model"
|
||||
},
|
||||
})
|
||||
"num_gpus": 0,
|
||||
"num_workers": 2,
|
||||
"num_envs_per_worker": 10,
|
||||
"num_data_loader_buffers": 1,
|
||||
"num_aggregation_workers": 1,
|
||||
"broadcast_interval": 50,
|
||||
"rollout_fragment_length": 100,
|
||||
"train_batch_size": sample_from(
|
||||
lambda spec: 1000 * max(1, spec.config.num_gpus)),
|
||||
"fake_sampler": True,
|
||||
"use_pytorch": args.torch,
|
||||
}
|
||||
|
||||
stop = {
|
||||
"training_iteration": args.stop_iters,
|
||||
"timesteps_total": args.stop_timesteps,
|
||||
}
|
||||
|
||||
tune.run("IMPALA", config=config, stop=stop)
|
||||
|
||||
ray.shutdown()
|
||||
|
||||
@@ -1,115 +0,0 @@
|
||||
# Explains/tests Issues:
|
||||
# https://github.com/ray-project/ray/issues/6928
|
||||
# https://github.com/ray-project/ray/issues/6732
|
||||
|
||||
from gym.spaces import Discrete, Box
|
||||
import numpy as np
|
||||
|
||||
from ray.rllib.agents.ppo import PPOTrainer
|
||||
from ray.rllib.examples.random_env import RandomEnv
|
||||
from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.models.modelv2 import ModelV2
|
||||
from ray.rllib.models.tf.recurrent_tf_modelv2 import RecurrentTFModelV2
|
||||
from ray.rllib.utils import try_import_tf
|
||||
from ray.rllib.utils.annotations import override
|
||||
|
||||
tf = try_import_tf()
|
||||
|
||||
cnn_shape = (4, 4, 3)
|
||||
|
||||
|
||||
class CustomModel(RecurrentTFModelV2):
|
||||
def __init__(self, obs_space, action_space, num_outputs, model_config,
|
||||
name):
|
||||
super(CustomModel, self).__init__(obs_space, action_space, num_outputs,
|
||||
model_config, name)
|
||||
|
||||
self.cell_size = 16
|
||||
visual_size = cnn_shape[0] * cnn_shape[1] * cnn_shape[2]
|
||||
|
||||
state_in_h = tf.keras.layers.Input(shape=(self.cell_size, ), name="h")
|
||||
state_in_c = tf.keras.layers.Input(shape=(self.cell_size, ), name="c")
|
||||
seq_in = tf.keras.layers.Input(shape=(), name="seq_in", dtype=tf.int32)
|
||||
|
||||
inputs = tf.keras.layers.Input(
|
||||
shape=(None, visual_size), name="visual_inputs")
|
||||
|
||||
input_visual = inputs
|
||||
input_visual = tf.reshape(
|
||||
input_visual, [-1, cnn_shape[0], cnn_shape[1], cnn_shape[2]])
|
||||
cnn_input = tf.keras.layers.Input(shape=cnn_shape, name="cnn_input")
|
||||
|
||||
cnn_model = tf.keras.applications.mobilenet_v2.MobileNetV2(
|
||||
alpha=1.0,
|
||||
include_top=True,
|
||||
weights=None,
|
||||
input_tensor=cnn_input,
|
||||
pooling=None)
|
||||
vision_out = cnn_model(input_visual)
|
||||
vision_out = tf.reshape(
|
||||
vision_out,
|
||||
[-1, tf.shape(inputs)[1],
|
||||
vision_out.shape.as_list()[-1]])
|
||||
|
||||
lstm_out, state_h, state_c = tf.keras.layers.LSTM(
|
||||
self.cell_size,
|
||||
return_sequences=True,
|
||||
return_state=True,
|
||||
name="lstm")(
|
||||
inputs=vision_out,
|
||||
mask=tf.sequence_mask(seq_in),
|
||||
initial_state=[state_in_h, state_in_c])
|
||||
|
||||
# Postprocess LSTM output with another hidden layer and compute values.
|
||||
logits = tf.keras.layers.Dense(
|
||||
self.num_outputs,
|
||||
activation=tf.keras.activations.linear,
|
||||
name="logits")(lstm_out)
|
||||
values = tf.keras.layers.Dense(
|
||||
1, activation=None, name="values")(lstm_out)
|
||||
|
||||
# Create the RNN model
|
||||
self.rnn_model = tf.keras.Model(
|
||||
inputs=[inputs, seq_in, state_in_h, state_in_c],
|
||||
outputs=[logits, values, state_h, state_c])
|
||||
self.register_variables(self.rnn_model.variables)
|
||||
self.rnn_model.summary()
|
||||
|
||||
@override(RecurrentTFModelV2)
|
||||
def forward_rnn(self, inputs, state, seq_lens):
|
||||
model_out, self._value_out, h, c = self.rnn_model([inputs, seq_lens] +
|
||||
state)
|
||||
return model_out, [h, c]
|
||||
|
||||
@override(ModelV2)
|
||||
def get_initial_state(self):
|
||||
return [
|
||||
np.zeros(self.cell_size, np.float32),
|
||||
np.zeros(self.cell_size, np.float32),
|
||||
]
|
||||
|
||||
@override(ModelV2)
|
||||
def value_function(self):
|
||||
return tf.reshape(self._value_out, [-1])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
ModelCatalog.register_custom_model("my_model", CustomModel)
|
||||
trainer = PPOTrainer(
|
||||
env=RandomEnv,
|
||||
config={
|
||||
# "eager": True, # <- should work for both eager or not
|
||||
"model": {
|
||||
"custom_model": "my_model",
|
||||
"max_seq_len": 20,
|
||||
},
|
||||
"vf_share_layers": True,
|
||||
"num_workers": 0, # no parallelism
|
||||
"env_config": {
|
||||
"action_space": Discrete(2),
|
||||
# Test a simple Tuple observation space.
|
||||
"observation_space": Box(
|
||||
0.0, 1.0, shape=cnn_shape, dtype=np.float32)
|
||||
}
|
||||
})
|
||||
trainer.train()
|
||||
@@ -17,7 +17,7 @@ tf = try_import_tf()
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--run", type=str, default="DQN") # Try PG, PPO, DQN
|
||||
parser.add_argument("--stop", type=int, default=200)
|
||||
parser.add_argument("--use_vision_network", action="store_true")
|
||||
parser.add_argument("--use-vision-network", action="store_true")
|
||||
parser.add_argument("--num-cpus", type=int, default=0)
|
||||
|
||||
|
||||
|
||||
@@ -1,117 +0,0 @@
|
||||
"""Example of using a custom RNN keras model."""
|
||||
|
||||
import argparse
|
||||
import numpy as np
|
||||
|
||||
import ray
|
||||
from ray import tune
|
||||
from ray.tune.registry import register_env
|
||||
from ray.rllib.examples.env.repeat_after_me_env import RepeatAfterMeEnv
|
||||
from ray.rllib.examples.env.repeat_initial_obs_env import RepeatInitialObsEnv
|
||||
from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.models.modelv2 import ModelV2
|
||||
from ray.rllib.models.tf.recurrent_tf_modelv2 import RecurrentTFModelV2
|
||||
from ray.rllib.utils.annotations import override
|
||||
from ray.rllib.utils import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--run", type=str, default="PPO")
|
||||
parser.add_argument("--env", type=str, default="RepeatAfterMeEnv")
|
||||
parser.add_argument("--stop", type=int, default=90)
|
||||
parser.add_argument("--num-cpus", type=int, default=0)
|
||||
|
||||
|
||||
class MyKerasRNN(RecurrentTFModelV2):
|
||||
"""Example of using the Keras functional API to define a RNN model."""
|
||||
|
||||
def __init__(self,
|
||||
obs_space,
|
||||
action_space,
|
||||
num_outputs,
|
||||
model_config,
|
||||
name,
|
||||
hiddens_size=256,
|
||||
cell_size=64):
|
||||
super(MyKerasRNN, self).__init__(obs_space, action_space, num_outputs,
|
||||
model_config, name)
|
||||
self.cell_size = cell_size
|
||||
|
||||
# Define input layers
|
||||
input_layer = tf.keras.layers.Input(
|
||||
shape=(None, obs_space.shape[0]), name="inputs")
|
||||
state_in_h = tf.keras.layers.Input(shape=(cell_size, ), name="h")
|
||||
state_in_c = tf.keras.layers.Input(shape=(cell_size, ), name="c")
|
||||
seq_in = tf.keras.layers.Input(shape=(), name="seq_in", dtype=tf.int32)
|
||||
|
||||
# Preprocess observation with a hidden layer and send to LSTM cell
|
||||
dense1 = tf.keras.layers.Dense(
|
||||
hiddens_size, activation=tf.nn.relu, name="dense1")(input_layer)
|
||||
lstm_out, state_h, state_c = tf.keras.layers.LSTM(
|
||||
cell_size, return_sequences=True, return_state=True, name="lstm")(
|
||||
inputs=dense1,
|
||||
mask=tf.sequence_mask(seq_in),
|
||||
initial_state=[state_in_h, state_in_c])
|
||||
|
||||
# Postprocess LSTM output with another hidden layer and compute values
|
||||
logits = tf.keras.layers.Dense(
|
||||
self.num_outputs,
|
||||
activation=tf.keras.activations.linear,
|
||||
name="logits")(lstm_out)
|
||||
values = tf.keras.layers.Dense(
|
||||
1, activation=None, name="values")(lstm_out)
|
||||
|
||||
# Create the RNN model
|
||||
self.rnn_model = tf.keras.Model(
|
||||
inputs=[input_layer, seq_in, state_in_h, state_in_c],
|
||||
outputs=[logits, values, state_h, state_c])
|
||||
self.register_variables(self.rnn_model.variables)
|
||||
self.rnn_model.summary()
|
||||
|
||||
@override(RecurrentTFModelV2)
|
||||
def forward_rnn(self, inputs, state, seq_lens):
|
||||
model_out, self._value_out, h, c = self.rnn_model([inputs, seq_lens] +
|
||||
state)
|
||||
return model_out, [h, c]
|
||||
|
||||
@override(ModelV2)
|
||||
def get_initial_state(self):
|
||||
return [
|
||||
np.zeros(self.cell_size, np.float32),
|
||||
np.zeros(self.cell_size, np.float32),
|
||||
]
|
||||
|
||||
@override(ModelV2)
|
||||
def value_function(self):
|
||||
return tf.reshape(self._value_out, [-1])
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
ray.init(num_cpus=args.num_cpus or None)
|
||||
ModelCatalog.register_custom_model("rnn", MyKerasRNN)
|
||||
register_env("RepeatAfterMeEnv", lambda c: RepeatAfterMeEnv(c))
|
||||
register_env("RepeatInitialObsEnv", lambda _: RepeatInitialObsEnv())
|
||||
|
||||
config = {
|
||||
"env": args.env,
|
||||
"env_config": {
|
||||
"repeat_delay": 2,
|
||||
},
|
||||
"gamma": 0.9,
|
||||
"num_workers": 0,
|
||||
"num_envs_per_worker": 20,
|
||||
"entropy_coeff": 0.001,
|
||||
"num_sgd_iter": 5,
|
||||
"vf_loss_coeff": 1e-5,
|
||||
"model": {
|
||||
"custom_model": "rnn",
|
||||
"max_seq_len": 20,
|
||||
},
|
||||
}
|
||||
tune.run(
|
||||
args.run,
|
||||
config=config,
|
||||
stop={"episode_reward_mean": args.stop},
|
||||
)
|
||||
@@ -16,17 +16,16 @@ import os
|
||||
|
||||
import ray
|
||||
from ray import tune
|
||||
from ray.rllib.models import Model, ModelCatalog
|
||||
from ray.rllib.models.tf.tf_action_dist import Categorical
|
||||
from ray.rllib.models.tf.fcnet_v1 import FullyConnectedNetwork
|
||||
from ray.rllib.models.model import restore_original_dimensions
|
||||
from ray.rllib.offline import JsonReader
|
||||
from ray.rllib.examples.models.custom_loss_model import CustomLossModel, \
|
||||
TorchCustomLossModel
|
||||
from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.utils import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--iters", type=int, default=200)
|
||||
parser.add_argument("--torch", action="store_true")
|
||||
parser.add_argument("--stop-iters", type=int, default=200)
|
||||
parser.add_argument(
|
||||
"--input-files",
|
||||
type=str,
|
||||
@@ -34,50 +33,6 @@ parser.add_argument(
|
||||
os.path.dirname(os.path.abspath(__file__)),
|
||||
"../tests/data/cartpole_small"))
|
||||
|
||||
|
||||
class CustomLossModel(Model):
|
||||
"""Custom model that adds an imitation loss on top of the policy loss."""
|
||||
|
||||
def _build_layers_v2(self, input_dict, num_outputs, options):
|
||||
self.obs_in = input_dict["obs"]
|
||||
with tf.variable_scope("shared", reuse=tf.AUTO_REUSE):
|
||||
self.fcnet = FullyConnectedNetwork(input_dict, self.obs_space,
|
||||
self.action_space, num_outputs,
|
||||
options)
|
||||
return self.fcnet.outputs, self.fcnet.last_layer
|
||||
|
||||
def custom_loss(self, policy_loss, loss_inputs):
|
||||
# create a new input reader per worker
|
||||
reader = JsonReader(self.options["custom_options"]["input_files"])
|
||||
input_ops = reader.tf_input_ops()
|
||||
|
||||
# define a secondary loss by building a graph copy with weight sharing
|
||||
obs = tf.cast(input_ops["obs"], tf.float32)
|
||||
logits, _ = self._build_layers_v2({
|
||||
"obs": restore_original_dimensions(obs, self.obs_space)
|
||||
}, self.num_outputs, self.options)
|
||||
|
||||
# You can also add self-supervised losses easily by referencing tensors
|
||||
# created during _build_layers_v2(). For example, an autoencoder-style
|
||||
# loss can be added as follows:
|
||||
# ae_loss = squared_diff(
|
||||
# loss_inputs["obs"], Decoder(self.fcnet.last_layer))
|
||||
print("FYI: You can also use these tensors: {}, ".format(loss_inputs))
|
||||
|
||||
# compute the IL loss
|
||||
action_dist = Categorical(logits, self.options)
|
||||
self.policy_loss = policy_loss
|
||||
self.imitation_loss = tf.reduce_mean(
|
||||
-action_dist.logp(input_ops["actions"]))
|
||||
return policy_loss + 10 * self.imitation_loss
|
||||
|
||||
def custom_stats(self):
|
||||
return {
|
||||
"policy_loss": self.policy_loss,
|
||||
"imitation_loss": self.imitation_loss,
|
||||
}
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
ray.init()
|
||||
args = parser.parse_args()
|
||||
@@ -90,20 +45,23 @@ if __name__ == "__main__":
|
||||
input_dir = rllib_dir.absolute().joinpath(args.input_files)
|
||||
args.input_files = str(input_dir)
|
||||
|
||||
ModelCatalog.register_custom_model("custom_loss", CustomLossModel)
|
||||
tune.run(
|
||||
"PG",
|
||||
stop={
|
||||
"training_iteration": args.iters,
|
||||
},
|
||||
config={
|
||||
"env": "CartPole-v0",
|
||||
"num_workers": 0,
|
||||
"model": {
|
||||
"custom_model": "custom_loss",
|
||||
"custom_options": {
|
||||
"input_files": args.input_files,
|
||||
},
|
||||
ModelCatalog.register_custom_model(
|
||||
"custom_loss", TorchCustomLossModel if args.torch else CustomLossModel)
|
||||
|
||||
config = {
|
||||
"env": "CartPole-v0",
|
||||
"num_workers": 0,
|
||||
"model": {
|
||||
"custom_model": "custom_loss",
|
||||
"custom_options": {
|
||||
"input_files": args.input_files,
|
||||
},
|
||||
},
|
||||
)
|
||||
"use_pytorch": args.torch,
|
||||
}
|
||||
|
||||
stop = {
|
||||
"training_iteration": args.stop_iters,
|
||||
}
|
||||
|
||||
tune.run("PG", config=config, stop=stop)
|
||||
|
||||
@@ -64,14 +64,14 @@ class MyCallbacks(DefaultCallbacks):
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--num-iters", type=int, default=2000)
|
||||
parser.add_argument("--stop-iters", type=int, default=2000)
|
||||
args = parser.parse_args()
|
||||
|
||||
ray.init()
|
||||
trials = tune.run(
|
||||
"PG",
|
||||
stop={
|
||||
"training_iteration": args.num_iters,
|
||||
"training_iteration": args.stop_iters,
|
||||
},
|
||||
config={
|
||||
"env": "CartPole-v0",
|
||||
|
||||
@@ -53,14 +53,14 @@ def on_postprocess_traj(info):
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--num-iters", type=int, default=2000)
|
||||
parser.add_argument("--stop-iters", type=int, default=2000)
|
||||
args = parser.parse_args()
|
||||
|
||||
ray.init()
|
||||
trials = tune.run(
|
||||
"PG",
|
||||
stop={
|
||||
"training_iteration": args.num_iters,
|
||||
"training_iteration": args.stop_iters,
|
||||
},
|
||||
config={
|
||||
"env": "CartPole-v0",
|
||||
|
||||
@@ -0,0 +1,62 @@
|
||||
"""Example of using a custom RNN keras model."""
|
||||
|
||||
import argparse
|
||||
|
||||
import ray
|
||||
from ray import tune
|
||||
from ray.tune.registry import register_env
|
||||
from ray.rllib.examples.env.repeat_after_me_env import RepeatAfterMeEnv
|
||||
from ray.rllib.examples.env.repeat_initial_obs_env import RepeatInitialObsEnv
|
||||
from ray.rllib.examples.models.rnn_model import RNNModel, TorchRNNModel
|
||||
from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.utils.test_utils import check_learning_achieved
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--run", type=str, default="PPO")
|
||||
parser.add_argument("--env", type=str, default="RepeatAfterMeEnv")
|
||||
parser.add_argument("--num-cpus", type=int, default=0)
|
||||
parser.add_argument("--as-test", action="store_true")
|
||||
parser.add_argument("--torch", action="store_true")
|
||||
parser.add_argument("--stop-reward", type=float, default=90)
|
||||
parser.add_argument("--stop-iters", type=int, default=100)
|
||||
parser.add_argument("--stop-timesteps", type=int, default=100000)
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
|
||||
ray.init(num_cpus=args.num_cpus or None)
|
||||
|
||||
ModelCatalog.register_custom_model(
|
||||
"rnn", TorchRNNModel if args.torch else RNNModel)
|
||||
register_env("RepeatAfterMeEnv", lambda c: RepeatAfterMeEnv(c))
|
||||
register_env("RepeatInitialObsEnv", lambda _: RepeatInitialObsEnv())
|
||||
|
||||
config = {
|
||||
"env": args.env,
|
||||
"env_config": {
|
||||
"repeat_delay": 2,
|
||||
},
|
||||
"gamma": 0.9,
|
||||
"num_workers": 0,
|
||||
"num_envs_per_worker": 20,
|
||||
"entropy_coeff": 0.001,
|
||||
"num_sgd_iter": 5,
|
||||
"vf_loss_coeff": 1e-5,
|
||||
"model": {
|
||||
"custom_model": "rnn",
|
||||
"max_seq_len": 20,
|
||||
},
|
||||
"use_pytorch": args.torch,
|
||||
}
|
||||
|
||||
stop = {
|
||||
"training_iteration": args.stop_iters,
|
||||
"timesteps_total": args.stop_timesteps,
|
||||
"episode_reward_mean": args.stop_reward,
|
||||
}
|
||||
|
||||
results = tune.run(args.run, config=config, stop=stop)
|
||||
|
||||
if args.as_test:
|
||||
check_learning_achieved(results, args.stop_reward)
|
||||
ray.shutdown()
|
||||
@@ -10,7 +10,7 @@ from ray.rllib.utils import try_import_tf
|
||||
tf = try_import_tf()
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--iters", type=int, default=200)
|
||||
parser.add_argument("--stop-iters", type=int, default=200)
|
||||
parser.add_argument("--num-cpus", type=int, default=0)
|
||||
|
||||
|
||||
@@ -47,7 +47,7 @@ if __name__ == "__main__":
|
||||
ray.init(num_cpus=args.num_cpus or None)
|
||||
tune.run(
|
||||
MyTrainer,
|
||||
stop={"training_iteration": args.iters},
|
||||
stop={"training_iteration": args.stop_iters},
|
||||
config={
|
||||
"env": "CartPole-v0",
|
||||
"num_workers": 2,
|
||||
|
||||
@@ -7,7 +7,7 @@ from ray.rllib.policy.sample_batch import SampleBatch
|
||||
from ray.rllib.policy.torch_policy_template import build_torch_policy
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--iters", type=int, default=200)
|
||||
parser.add_argument("--stop-iters", type=int, default=200)
|
||||
parser.add_argument("--num-cpus", type=int, default=0)
|
||||
|
||||
|
||||
@@ -33,7 +33,7 @@ if __name__ == "__main__":
|
||||
ray.init(num_cpus=args.num_cpus or None)
|
||||
tune.run(
|
||||
MyTrainer,
|
||||
stop={"training_iteration": args.iters},
|
||||
stop={"training_iteration": args.stop_iters},
|
||||
config={
|
||||
"env": "CartPole-v0",
|
||||
"num_workers": 2,
|
||||
|
||||
@@ -1,128 +0,0 @@
|
||||
import argparse
|
||||
|
||||
import ray
|
||||
from ray.rllib.examples.env.repeat_initial_obs_env import RepeatInitialObsEnv
|
||||
from ray.rllib.examples.env.repeat_after_me_env import RepeatAfterMeEnv
|
||||
from ray.rllib.examples.env.stateless_cartpole import StatelessCartPole
|
||||
from ray.rllib.models.preprocessors import get_preprocessor
|
||||
from ray.rllib.models.torch.recurrent_torch_model import RecurrentTorchModel
|
||||
from ray.rllib.models.modelv2 import ModelV2
|
||||
from ray.rllib.utils.annotations import override
|
||||
from ray.rllib.utils import try_import_torch
|
||||
from ray.rllib.models import ModelCatalog
|
||||
import ray.tune as tune
|
||||
|
||||
torch, nn = try_import_torch()
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--run", type=str, default="PPO")
|
||||
parser.add_argument("--env", type=str, default="repeat_initial")
|
||||
parser.add_argument("--stop", type=int, default=90)
|
||||
parser.add_argument("--num-cpus", type=int, default=0)
|
||||
parser.add_argument("--fc-size", type=int, default=64)
|
||||
parser.add_argument("--lstm-cell-size", type=int, default=256)
|
||||
|
||||
|
||||
class RNNModel(RecurrentTorchModel):
|
||||
def __init__(self,
|
||||
obs_space,
|
||||
action_space,
|
||||
num_outputs,
|
||||
model_config,
|
||||
name,
|
||||
fc_size=64,
|
||||
lstm_state_size=256):
|
||||
super().__init__(obs_space, action_space, num_outputs, model_config,
|
||||
name)
|
||||
|
||||
self.obs_size = get_preprocessor(obs_space)(obs_space).size
|
||||
self.fc_size = fc_size
|
||||
self.lstm_state_size = lstm_state_size
|
||||
|
||||
# Build the Module from fc + LSTM + 2xfc (action + value outs).
|
||||
self.fc1 = nn.Linear(self.obs_size, self.fc_size)
|
||||
self.lstm = nn.LSTM(
|
||||
self.fc_size, self.lstm_state_size, batch_first=True)
|
||||
self.action_branch = nn.Linear(self.lstm_state_size, num_outputs)
|
||||
self.value_branch = nn.Linear(self.lstm_state_size, 1)
|
||||
# Store the value output to save an extra forward pass.
|
||||
self._cur_value = None
|
||||
|
||||
@override(ModelV2)
|
||||
def get_initial_state(self):
|
||||
# make hidden states on same device as model
|
||||
h = [
|
||||
self.fc1.weight.new(1, self.lstm_state_size).zero_().squeeze(0),
|
||||
self.fc1.weight.new(1, self.lstm_state_size).zero_().squeeze(0)
|
||||
]
|
||||
return h
|
||||
|
||||
@override(ModelV2)
|
||||
def value_function(self):
|
||||
assert self._cur_value is not None, "must call forward() first"
|
||||
return self._cur_value
|
||||
|
||||
@override(RecurrentTorchModel)
|
||||
def forward_rnn(self, inputs, state, seq_lens):
|
||||
"""Feeds `inputs` (B x T x ..) through the Gru Unit.
|
||||
|
||||
Returns the resulting outputs as a sequence (B x T x ...).
|
||||
Values are stored in self._cur_value in simple (B) shape (where B
|
||||
contains both the B and T dims!).
|
||||
|
||||
Returns:
|
||||
NN Outputs (B x T x ...) as sequence.
|
||||
The state batches as a List of two items (c- and h-states).
|
||||
"""
|
||||
x = nn.functional.relu(self.fc1(inputs))
|
||||
lstm_out = self.lstm(
|
||||
x, [torch.unsqueeze(state[0], 0),
|
||||
torch.unsqueeze(state[1], 0)])
|
||||
action_out = self.action_branch(lstm_out[0])
|
||||
self._cur_value = torch.reshape(self.value_branch(lstm_out[0]), [-1])
|
||||
return action_out, [
|
||||
torch.squeeze(lstm_out[1][0], 0),
|
||||
torch.squeeze(lstm_out[1][1], 0)
|
||||
]
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
|
||||
ray.init(num_cpus=args.num_cpus or None)
|
||||
ModelCatalog.register_custom_model("rnn", RNNModel)
|
||||
tune.register_env(
|
||||
"repeat_initial", lambda _: RepeatInitialObsEnv(episode_len=100))
|
||||
tune.register_env(
|
||||
"repeat_after_me", lambda _: RepeatAfterMeEnv({"repeat_delay": 1}))
|
||||
tune.register_env("stateless_cartpole", lambda _: StatelessCartPole())
|
||||
|
||||
config = {
|
||||
"env": args.env,
|
||||
"use_pytorch": True,
|
||||
"num_workers": 0,
|
||||
"num_envs_per_worker": 20,
|
||||
"gamma": 0.9,
|
||||
"entropy_coeff": 0.0001,
|
||||
"model": {
|
||||
"custom_model": "rnn",
|
||||
"max_seq_len": 20,
|
||||
"lstm_use_prev_action_reward": "store_true",
|
||||
"custom_options": {
|
||||
"fc_size": args.fc_size,
|
||||
"lstm_state_size": args.lstm_cell_size,
|
||||
}
|
||||
},
|
||||
"lr": 3e-4,
|
||||
"num_sgd_iter": 5,
|
||||
"vf_loss_coeff": 0.0003,
|
||||
}
|
||||
|
||||
tune.run(
|
||||
args.run,
|
||||
stop={
|
||||
"episode_reward_mean": args.stop,
|
||||
"timesteps_total": 100000
|
||||
},
|
||||
config=config,
|
||||
)
|
||||
@@ -4,70 +4,20 @@ import random
|
||||
import ray
|
||||
from ray import tune
|
||||
from ray.rllib.agents.trainer_template import build_trainer
|
||||
from ray.rllib.examples.models.eager_model import EagerModel
|
||||
from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.models.modelv2 import ModelV2
|
||||
from ray.rllib.models.tf.fcnet_v2 import FullyConnectedNetwork
|
||||
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
|
||||
from ray.rllib.policy.sample_batch import SampleBatch
|
||||
from ray.rllib.policy.tf_policy_template import build_tf_policy
|
||||
from ray.rllib.utils import try_import_tf
|
||||
from ray.rllib.utils.annotations import override
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
from ray.rllib.utils.test_utils import check_learning_achieved
|
||||
|
||||
tf = try_import_tf()
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--iters", type=int, default=200)
|
||||
|
||||
|
||||
class EagerModel(TFModelV2):
|
||||
"""Example of using embedded eager execution in a custom model.
|
||||
|
||||
This shows how to use tf.py_function() to execute a snippet of TF code
|
||||
in eager mode. Here the `self.forward_eager` method just prints out
|
||||
the intermediate tensor for debug purposes, but you can in general
|
||||
perform any TF eager operation in tf.py_function().
|
||||
"""
|
||||
|
||||
def __init__(self, observation_space, action_space, num_outputs,
|
||||
model_config, name):
|
||||
super().__init__(observation_space, action_space, num_outputs,
|
||||
model_config, name)
|
||||
|
||||
inputs = tf.keras.layers.Input(shape=observation_space.shape)
|
||||
self.fcnet = FullyConnectedNetwork(
|
||||
obs_space=self.obs_space,
|
||||
action_space=self.action_space,
|
||||
num_outputs=self.num_outputs,
|
||||
model_config=self.model_config,
|
||||
name="fc1")
|
||||
out, value_out = self.fcnet.base_model(inputs)
|
||||
|
||||
def lambda_(x):
|
||||
eager_out = tf.py_function(self.forward_eager, [x], tf.float32)
|
||||
with tf.control_dependencies([eager_out]):
|
||||
eager_out.set_shape(x.shape)
|
||||
return eager_out
|
||||
|
||||
out = tf.keras.layers.Lambda(lambda_)(out)
|
||||
self.base_model = tf.keras.models.Model(inputs, [out, value_out])
|
||||
self.register_variables(self.base_model.variables)
|
||||
|
||||
@override(ModelV2)
|
||||
def forward(self, input_dict, state, seq_lens):
|
||||
out, self._value_out = self.base_model(input_dict["obs"], state,
|
||||
seq_lens)
|
||||
return out, []
|
||||
|
||||
@override(ModelV2)
|
||||
def value_function(self):
|
||||
return tf.reshape(self._value_out, [-1])
|
||||
|
||||
def forward_eager(self, feature_layer):
|
||||
assert tf.executing_eagerly()
|
||||
if random.random() > 0.99:
|
||||
print("Eagerly printing the feature layer mean value",
|
||||
tf.reduce_mean(feature_layer))
|
||||
return feature_layer
|
||||
parser.add_argument("--stop-iters", type=int, default=200)
|
||||
parser.add_argument("--stop-timesteps", type=int, default=100000)
|
||||
parser.add_argument("--stop-reward", type=float, default=150)
|
||||
parser.add_argument("--as-test", action="store_true")
|
||||
|
||||
|
||||
def policy_gradient_loss(policy, model, dist_class, train_batch):
|
||||
@@ -119,5 +69,14 @@ if __name__ == "__main__":
|
||||
"custom_model": "eager_model"
|
||||
},
|
||||
}
|
||||
stop = {
|
||||
"timesteps_total": args.stop_timesteps,
|
||||
"training_iteration": args.stop_iters,
|
||||
"episode_reward_mean": args.stop_reward,
|
||||
}
|
||||
|
||||
tune.run(MyTrainer, stop={"training_iteration": args.iters}, config=config)
|
||||
results = tune.run(MyTrainer, stop=stop, config=config)
|
||||
|
||||
if args.as_test:
|
||||
check_learning_achieved(results, args.stop_reward)
|
||||
ray.shutdown()
|
||||
|
||||
+2
-2
@@ -15,9 +15,9 @@ class RockPaperScissors(MultiAgentEnv):
|
||||
SPOCK = 4
|
||||
|
||||
def __init__(self, config):
|
||||
self.action_space = Discrete(3)
|
||||
self.observation_space = Discrete(3)
|
||||
self.sheldon_cooper = config.get("sheldon_cooper", False)
|
||||
self.action_space = Discrete(5 if self.sheldon_cooper else 3)
|
||||
self.observation_space = Discrete(5 if self.sheldon_cooper else 3)
|
||||
self.player1 = "player1"
|
||||
self.player2 = "player2"
|
||||
self.last_move = None
|
||||
|
||||
@@ -28,13 +28,16 @@ import logging
|
||||
|
||||
import ray
|
||||
from ray import tune
|
||||
from ray.tune import function
|
||||
from ray.rllib.examples.env.windy_maze_env import WindyMazeEnv, \
|
||||
HierarchicalWindyMazeEnv
|
||||
from ray.tune import function
|
||||
from ray.rllib.utils.test_utils import check_learning_achieved
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--flat", action="store_true")
|
||||
parser.add_argument("--as-test", action="store_true")
|
||||
parser.add_argument("--torch", action="store_true")
|
||||
parser.add_argument("--stop-iters", type=int, default=200)
|
||||
parser.add_argument("--stop-reward", type=float, default=0.0)
|
||||
parser.add_argument("--stop-timesteps", type=int, default=100000)
|
||||
|
||||
@@ -45,8 +48,9 @@ if __name__ == "__main__":
|
||||
ray.init()
|
||||
|
||||
stop = {
|
||||
"episode_reward_mean": args.stop_reward,
|
||||
"training_iteration": args.stop_iters,
|
||||
"timesteps_total": args.stop_timesteps,
|
||||
"episode_reward_mean": args.stop_reward,
|
||||
}
|
||||
|
||||
if args.flat:
|
||||
@@ -92,15 +96,9 @@ if __name__ == "__main__":
|
||||
"use_pytorch": args.torch,
|
||||
}
|
||||
|
||||
results = tune.run(
|
||||
"PPO",
|
||||
stop=stop,
|
||||
config=config,
|
||||
)
|
||||
results = tune.run("PPO", stop=stop, config=config)
|
||||
|
||||
# Error if stop-reward not reached.
|
||||
if results.trials[0].last_result["episode_reward_mean"] < \
|
||||
args.stop_reward:
|
||||
raise ValueError("`stop-reward` of {} not reached!".format(
|
||||
args.stop_reward))
|
||||
print("ok")
|
||||
if args.as_test:
|
||||
check_learning_achieved(results, args.stop_reward)
|
||||
|
||||
ray.shutdown()
|
||||
|
||||
@@ -0,0 +1,58 @@
|
||||
# Explains/tests Issues:
|
||||
# https://github.com/ray-project/ray/issues/6928
|
||||
# https://github.com/ray-project/ray/issues/6732
|
||||
|
||||
import argparse
|
||||
from gym.spaces import Discrete, Box
|
||||
import numpy as np
|
||||
|
||||
from ray.rllib.agents.ppo import PPOTrainer
|
||||
from ray.rllib.examples.env.random_env import RandomEnv
|
||||
from ray.rllib.examples.models.mobilenet_v2_with_lstm_models import \
|
||||
MobileV2PlusRNNModel, TorchMobileV2PlusRNNModel
|
||||
from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.utils import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
|
||||
cnn_shape = (4, 4, 3)
|
||||
# The torch version of MobileNetV2 does channels first.
|
||||
cnn_shape_torch = (3, 224, 224)
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--torch", action="store_true")
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
|
||||
# Register our custom model.
|
||||
ModelCatalog.register_custom_model(
|
||||
"my_model", TorchMobileV2PlusRNNModel
|
||||
if args.torch else MobileV2PlusRNNModel)
|
||||
|
||||
# Configure our Trainer.
|
||||
config = {
|
||||
"use_pytorch": args.torch,
|
||||
"model": {
|
||||
"custom_model": "my_model",
|
||||
# Extra config passed to the custom model's c'tor as kwargs.
|
||||
"custom_options": {
|
||||
"cnn_shape": cnn_shape_torch if args.torch else cnn_shape,
|
||||
},
|
||||
"max_seq_len": 20,
|
||||
},
|
||||
"vf_share_layers": True,
|
||||
"num_workers": 0, # no parallelism
|
||||
"env_config": {
|
||||
"action_space": Discrete(2),
|
||||
# Test a simple Image observation space.
|
||||
"observation_space": Box(
|
||||
0.0,
|
||||
1.0,
|
||||
shape=cnn_shape_torch if args.torch else cnn_shape,
|
||||
dtype=np.float32)
|
||||
},
|
||||
}
|
||||
|
||||
trainer = PPOTrainer(config=config, env=RandomEnv)
|
||||
print(trainer.train())
|
||||
@@ -111,8 +111,10 @@ class TorchCentralizedCriticModel(TorchModelV2, nn.Module):
|
||||
|
||||
# Central VF maps (obs, opp_obs, opp_act) -> vf_pred
|
||||
input_size = 6 + 6 + 2 # obs + opp_obs + opp_act
|
||||
self.central_vf_dense = SlimFC(input_size, 16, activation_fn=nn.Tanh)
|
||||
self.central_vf_out = SlimFC(16, 1)
|
||||
self.central_vf = nn.Sequential(
|
||||
SlimFC(input_size, 16, activation_fn=nn.Tanh),
|
||||
SlimFC(16, 1),
|
||||
)
|
||||
|
||||
@override(ModelV2)
|
||||
def forward(self, input_dict, state, seq_lens):
|
||||
@@ -122,10 +124,9 @@ class TorchCentralizedCriticModel(TorchModelV2, nn.Module):
|
||||
def central_value_function(self, obs, opponent_obs, opponent_actions):
|
||||
input_ = torch.cat([
|
||||
obs, opponent_obs,
|
||||
torch.nn.functional.one_hot(opponent_actions, 2)
|
||||
torch.nn.functional.one_hot(opponent_actions, 2).float()
|
||||
], 1)
|
||||
return torch.reshape(
|
||||
self.central_vf_out(self.central_vf_dense(input_)), [-1])
|
||||
return torch.reshape(self.central_vf(input_), [-1])
|
||||
|
||||
@override(ModelV2)
|
||||
def value_function(self):
|
||||
|
||||
@@ -0,0 +1,164 @@
|
||||
from ray.rllib.models.model import Model, restore_original_dimensions
|
||||
from ray.rllib.models.modelv2 import ModelV2
|
||||
from ray.rllib.models.tf.tf_action_dist import Categorical
|
||||
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
|
||||
from ray.rllib.models.tf.fcnet_v2 import FullyConnectedNetwork
|
||||
from ray.rllib.models.torch.torch_action_dist import TorchCategorical
|
||||
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
|
||||
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC
|
||||
from ray.rllib.utils.annotations import override
|
||||
from ray.rllib.utils.framework import try_import_tf, try_import_torch
|
||||
from ray.rllib.offline import JsonReader
|
||||
|
||||
tf = try_import_tf()
|
||||
torch, nn = try_import_torch()
|
||||
|
||||
|
||||
class CustomLossModel(TFModelV2):
|
||||
"""Custom model that adds an imitation loss on top of the policy loss."""
|
||||
|
||||
def __init__(self, obs_space, action_space, num_outputs, model_config,
|
||||
name):
|
||||
super().__init__(obs_space, action_space, num_outputs, model_config,
|
||||
name)
|
||||
|
||||
self.fcnet = FullyConnectedNetwork(
|
||||
self.obs_space,
|
||||
self.action_space,
|
||||
num_outputs,
|
||||
model_config,
|
||||
name="fcnet")
|
||||
self.register_variables(self.fcnet.variables())
|
||||
|
||||
@override(ModelV2)
|
||||
def forward(self, input_dict, state, seq_lens):
|
||||
# Delegate to our FCNet.
|
||||
return self.fcnet(input_dict, state, seq_lens)
|
||||
|
||||
@override(ModelV2)
|
||||
def custom_loss(self, policy_loss, loss_inputs):
|
||||
# Create a new input reader per worker.
|
||||
reader = JsonReader(self.model_config["custom_options"]["input_files"])
|
||||
input_ops = reader.tf_input_ops()
|
||||
|
||||
# Define a secondary loss by building a graph copy with weight sharing.
|
||||
obs = restore_original_dimensions(
|
||||
tf.cast(input_ops["obs"], tf.float32), self.obs_space)
|
||||
logits, _ = self.forward({"obs": obs}, [], None)
|
||||
|
||||
# You can also add self-supervised losses easily by referencing tensors
|
||||
# created during _build_layers_v2(). For example, an autoencoder-style
|
||||
# loss can be added as follows:
|
||||
# ae_loss = squared_diff(
|
||||
# loss_inputs["obs"], Decoder(self.fcnet.last_layer))
|
||||
print("FYI: You can also use these tensors: {}, ".format(loss_inputs))
|
||||
|
||||
# Compute the IL loss.
|
||||
action_dist = Categorical(logits, self.model_config)
|
||||
self.policy_loss = policy_loss
|
||||
self.imitation_loss = tf.reduce_mean(
|
||||
-action_dist.logp(input_ops["actions"]))
|
||||
return policy_loss + 10 * self.imitation_loss
|
||||
|
||||
def custom_stats(self):
|
||||
return {
|
||||
"policy_loss": self.policy_loss,
|
||||
"imitation_loss": self.imitation_loss,
|
||||
}
|
||||
|
||||
|
||||
class DeprecatedCustomLossModelV1(Model):
|
||||
"""Model(V1) version of above custom-loss model."""
|
||||
|
||||
def _build_layers_v2(self, input_dict, num_outputs, options):
|
||||
self.obs_in = input_dict["obs"]
|
||||
with tf.variable_scope("shared", reuse=tf.AUTO_REUSE):
|
||||
self.fcnet = FullyConnectedNetwork(input_dict, self.obs_space,
|
||||
self.action_space, num_outputs,
|
||||
options)
|
||||
return self.fcnet.outputs, self.fcnet.last_layer
|
||||
|
||||
def custom_loss(self, policy_loss, loss_inputs):
|
||||
# create a new input reader per worker
|
||||
reader = JsonReader(self.options["custom_options"]["input_files"])
|
||||
input_ops = reader.tf_input_ops()
|
||||
|
||||
# define a secondary loss by building a graph copy with weight sharing
|
||||
obs = tf.cast(input_ops["obs"], tf.float32)
|
||||
logits, _ = self._build_layers_v2({
|
||||
"obs": restore_original_dimensions(obs, self.obs_space)
|
||||
}, self.num_outputs, self.options)
|
||||
|
||||
# You can also add self-supervised losses easily by referencing tensors
|
||||
# created during _build_layers_v2(). For example, an autoencoder-style
|
||||
# loss can be added as follows:
|
||||
# ae_loss = squared_diff(
|
||||
# loss_inputs["obs"], Decoder(self.fcnet.last_layer))
|
||||
print("FYI: You can also use these tensors: {}, ".format(loss_inputs))
|
||||
|
||||
# compute the IL loss
|
||||
action_dist = Categorical(logits, self.options)
|
||||
self.policy_loss = policy_loss
|
||||
self.imitation_loss = tf.reduce_mean(
|
||||
-action_dist.logp(input_ops["actions"]))
|
||||
return policy_loss + 10 * self.imitation_loss
|
||||
|
||||
def custom_stats(self):
|
||||
return {
|
||||
"policy_loss": self.policy_loss,
|
||||
"imitation_loss": self.imitation_loss,
|
||||
}
|
||||
|
||||
|
||||
class TorchCustomLossModel(TorchModelV2, nn.Module):
|
||||
"""PyTorch version of the CustomLossModel above."""
|
||||
|
||||
def __init__(self, obs_space, action_space, num_outputs, model_config,
|
||||
name, input_files):
|
||||
super().__init__(obs_space, action_space, num_outputs, model_config,
|
||||
name)
|
||||
nn.Module.__init__(self)
|
||||
|
||||
self.input_files = input_files
|
||||
self.fcnet = TorchFC(
|
||||
self.obs_space,
|
||||
self.action_space,
|
||||
num_outputs,
|
||||
model_config,
|
||||
name="fcnet")
|
||||
|
||||
@override(ModelV2)
|
||||
def forward(self, input_dict, state, seq_lens):
|
||||
# Delegate to our FCNet.
|
||||
return self.fcnet(input_dict, state, seq_lens)
|
||||
|
||||
@override(ModelV2)
|
||||
def custom_loss(self, policy_loss, loss_inputs):
|
||||
# Create a new input reader per worker.
|
||||
reader = JsonReader(self.input_files)
|
||||
input_ops = reader.tf_input_ops()
|
||||
|
||||
# Define a secondary loss by building a graph copy with weight sharing.
|
||||
obs = restore_original_dimensions(
|
||||
tf.cast(input_ops["obs"], tf.float32), self.obs_space)
|
||||
logits, _ = self.forward({"obs": obs}, [], None)
|
||||
|
||||
# You can also add self-supervised losses easily by referencing tensors
|
||||
# created during _build_layers_v2(). For example, an autoencoder-style
|
||||
# loss can be added as follows:
|
||||
# ae_loss = squared_diff(
|
||||
# loss_inputs["obs"], Decoder(self.fcnet.last_layer))
|
||||
print("FYI: You can also use these tensors: {}, ".format(loss_inputs))
|
||||
|
||||
# Compute the IL loss.
|
||||
action_dist = TorchCategorical(logits, self.model_config)
|
||||
self.policy_loss = policy_loss
|
||||
self.imitation_loss = torch.mean(
|
||||
-action_dist.logp(input_ops["actions"]))
|
||||
return policy_loss + 10 * self.imitation_loss
|
||||
|
||||
def custom_stats(self):
|
||||
return {
|
||||
"policy_loss": self.policy_loss,
|
||||
"imitation_loss": self.imitation_loss,
|
||||
}
|
||||
@@ -16,11 +16,11 @@ import random
|
||||
import ray
|
||||
from ray import tune
|
||||
from ray.rllib.examples.env.multi_agent import MultiAgentCartPole
|
||||
from ray.rllib.examples.models.shared_weights_model import \
|
||||
SharedWeightsModel1, SharedWeightsModel2, TorchSharedWeightsModel
|
||||
from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.models.modelv2 import ModelV2
|
||||
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
|
||||
from ray.rllib.utils import try_import_tf
|
||||
from ray.rllib.utils.annotations import override
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
from ray.rllib.utils.test_utils import check_learning_achieved
|
||||
|
||||
tf = try_import_tf()
|
||||
|
||||
@@ -28,89 +28,31 @@ parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument("--num-agents", type=int, default=4)
|
||||
parser.add_argument("--num-policies", type=int, default=2)
|
||||
parser.add_argument("--num-iters", type=int, default=20)
|
||||
parser.add_argument("--stop-iters", type=int, default=20)
|
||||
parser.add_argument("--stop-reward", type=float, default=150)
|
||||
parser.add_argument("--stop-timesteps", type=int, default=100000)
|
||||
parser.add_argument("--simple", action="store_true")
|
||||
parser.add_argument("--num-cpus", type=int, default=0)
|
||||
|
||||
|
||||
class CustomModel1(TFModelV2):
|
||||
def __init__(self, observation_space, action_space, num_outputs,
|
||||
model_config, name):
|
||||
super().__init__(observation_space, action_space, num_outputs,
|
||||
model_config, name)
|
||||
|
||||
inputs = tf.keras.layers.Input(observation_space.shape)
|
||||
# Example of (optional) weight sharing between two different policies.
|
||||
# Here, we share the variables defined in the 'shared' variable scope
|
||||
# by entering it explicitly with tf.AUTO_REUSE. This creates the
|
||||
# variables for the 'fc1' layer in a global scope called 'shared'
|
||||
# outside of the policy's normal variable scope.
|
||||
with tf.variable_scope(
|
||||
tf.VariableScope(tf.AUTO_REUSE, "shared"),
|
||||
reuse=tf.AUTO_REUSE,
|
||||
auxiliary_name_scope=False):
|
||||
last_layer = tf.keras.layers.Dense(
|
||||
units=64, activation=tf.nn.relu, name="fc1")(inputs)
|
||||
output = tf.keras.layers.Dense(
|
||||
units=num_outputs, activation=None, name="fc_out")(last_layer)
|
||||
vf = tf.keras.layers.Dense(
|
||||
units=1, activation=None, name="value_out")(last_layer)
|
||||
self.base_model = tf.keras.models.Model(inputs, [output, vf])
|
||||
self.register_variables(self.base_model.variables)
|
||||
|
||||
@override(ModelV2)
|
||||
def forward(self, input_dict, state, seq_lens):
|
||||
out, self._value_out = self.base_model(input_dict["obs"])
|
||||
return out, []
|
||||
|
||||
@override(ModelV2)
|
||||
def value_function(self):
|
||||
return tf.reshape(self._value_out, [-1])
|
||||
|
||||
|
||||
class CustomModel2(TFModelV2):
|
||||
def __init__(self, observation_space, action_space, num_outputs,
|
||||
model_config, name):
|
||||
super().__init__(observation_space, action_space, num_outputs,
|
||||
model_config, name)
|
||||
|
||||
inputs = tf.keras.layers.Input(observation_space.shape)
|
||||
|
||||
# Weights shared with CustomModel1.
|
||||
with tf.variable_scope(
|
||||
tf.VariableScope(tf.AUTO_REUSE, "shared"),
|
||||
reuse=tf.AUTO_REUSE,
|
||||
auxiliary_name_scope=False):
|
||||
last_layer = tf.keras.layers.Dense(
|
||||
units=64, activation=tf.nn.relu, name="fc1")(inputs)
|
||||
output = tf.keras.layers.Dense(
|
||||
units=num_outputs, activation=None, name="fc_out")(last_layer)
|
||||
vf = tf.keras.layers.Dense(
|
||||
units=1, activation=None, name="value_out")(last_layer)
|
||||
self.base_model = tf.keras.models.Model(inputs, [output, vf])
|
||||
self.register_variables(self.base_model.variables)
|
||||
|
||||
@override(ModelV2)
|
||||
def forward(self, input_dict, state, seq_lens):
|
||||
out, self._value_out = self.base_model(input_dict["obs"])
|
||||
return out, []
|
||||
|
||||
@override(ModelV2)
|
||||
def value_function(self):
|
||||
return tf.reshape(self._value_out, [-1])
|
||||
|
||||
parser.add_argument("--as-test", action="store_true")
|
||||
parser.add_argument("--torch", action="store_true")
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
|
||||
ray.init(num_cpus=args.num_cpus or None)
|
||||
|
||||
ModelCatalog.register_custom_model("model1", CustomModel1)
|
||||
ModelCatalog.register_custom_model("model2", CustomModel2)
|
||||
# Register the models to use.
|
||||
mod1 = TorchSharedWeightsModel if args.torch else SharedWeightsModel1
|
||||
mod2 = TorchSharedWeightsModel if args.torch else SharedWeightsModel2
|
||||
ModelCatalog.register_custom_model("model1", mod1)
|
||||
ModelCatalog.register_custom_model("model2", mod2)
|
||||
|
||||
# Get obs- and action Spaces.
|
||||
single_env = gym.make("CartPole-v0")
|
||||
obs_space = single_env.observation_space
|
||||
act_space = single_env.action_space
|
||||
|
||||
# Each policy can have a different configuration (including custom model)
|
||||
# Each policy can have a different configuration (including custom model).
|
||||
def gen_policy(i):
|
||||
config = {
|
||||
"model": {
|
||||
@@ -120,28 +62,35 @@ if __name__ == "__main__":
|
||||
}
|
||||
return (None, obs_space, act_space, config)
|
||||
|
||||
# Setup PPO with an ensemble of `num_policies` different policies
|
||||
# Setup PPO with an ensemble of `num_policies` different policies.
|
||||
policies = {
|
||||
"policy_{}".format(i): gen_policy(i)
|
||||
for i in range(args.num_policies)
|
||||
}
|
||||
policy_ids = list(policies.keys())
|
||||
|
||||
tune.run(
|
||||
"PPO",
|
||||
stop={"training_iteration": args.num_iters},
|
||||
config={
|
||||
"env": MultiAgentCartPole,
|
||||
"env_config": {
|
||||
"num_agents": args.num_agents,
|
||||
},
|
||||
"log_level": "DEBUG",
|
||||
"simple_optimizer": args.simple,
|
||||
"num_sgd_iter": 10,
|
||||
"multiagent": {
|
||||
"policies": policies,
|
||||
"policy_mapping_fn": (
|
||||
lambda agent_id: random.choice(policy_ids)),
|
||||
},
|
||||
config = {
|
||||
"env": MultiAgentCartPole,
|
||||
"env_config": {
|
||||
"num_agents": args.num_agents,
|
||||
},
|
||||
)
|
||||
"log_level": "DEBUG",
|
||||
"simple_optimizer": args.simple,
|
||||
"num_sgd_iter": 10,
|
||||
"multiagent": {
|
||||
"policies": policies,
|
||||
"policy_mapping_fn": (lambda agent_id: random.choice(policy_ids)),
|
||||
},
|
||||
"use_pytorch": args.torch,
|
||||
}
|
||||
stop = {
|
||||
"episode_reward_mean": args.stop_reward,
|
||||
"timesteps_total": args.stop_timesteps,
|
||||
"training_iteration": args.stop_iters,
|
||||
}
|
||||
|
||||
results = tune.run("PPO", stop=stop, config=config)
|
||||
|
||||
if args.as_test:
|
||||
check_learning_achieved(results, args.stop_reward)
|
||||
ray.shutdown()
|
||||
|
||||
@@ -18,32 +18,17 @@ import gym
|
||||
|
||||
import ray
|
||||
from ray import tune
|
||||
from ray.rllib.examples.env.multi_agent import MultiAgentCartPole
|
||||
from ray.rllib.policy import Policy
|
||||
from ray.tune.registry import register_env
|
||||
from ray.rllib.examples.env.multi_agent import MultiAgentCartPole
|
||||
from ray.rllib.examples.policy.random_policy import RandomPolicy
|
||||
from ray.rllib.utils.test_utils import check_learning_achieved
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--num-iters", type=int, default=20)
|
||||
|
||||
|
||||
class RandomPolicy(Policy):
|
||||
"""Hand-coded policy that returns random actions."""
|
||||
|
||||
def compute_actions(self,
|
||||
obs_batch,
|
||||
state_batches=None,
|
||||
prev_action_batch=None,
|
||||
prev_reward_batch=None,
|
||||
info_batch=None,
|
||||
episodes=None,
|
||||
**kwargs):
|
||||
"""Compute actions on a batch of observations."""
|
||||
return [self.action_space.sample() for _ in obs_batch], [], {}
|
||||
|
||||
def learn_on_batch(self, samples):
|
||||
"""No learning."""
|
||||
return {}
|
||||
|
||||
parser.add_argument("--torch", action="store_true")
|
||||
parser.add_argument("--as-test", action="store_true")
|
||||
parser.add_argument("--stop-iters", type=int, default=20)
|
||||
parser.add_argument("--stop-reward", type=float, default=150)
|
||||
parser.add_argument("--stop-timesteps", type=int, default=100000)
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
@@ -56,18 +41,32 @@ if __name__ == "__main__":
|
||||
obs_space = single_env.observation_space
|
||||
act_space = single_env.action_space
|
||||
|
||||
tune.run(
|
||||
stop = {
|
||||
"training_iteration": args.stop_iters,
|
||||
"episode_reward_mean": args.stop_reward,
|
||||
"timesteps_total": args.stop_timesteps,
|
||||
}
|
||||
|
||||
results = tune.run(
|
||||
"PG",
|
||||
stop={"training_iteration": args.num_iters},
|
||||
stop=stop,
|
||||
config={
|
||||
"env": "multi_agent_cartpole",
|
||||
"multiagent": {
|
||||
"policies": {
|
||||
"pg_policy": (None, obs_space, act_space, {}),
|
||||
"pg_policy": (None, obs_space, act_space, {
|
||||
"use_pytorch": args.torch
|
||||
}),
|
||||
"random": (RandomPolicy, obs_space, act_space, {}),
|
||||
},
|
||||
"policy_mapping_fn": (
|
||||
lambda agent_id: ["pg_policy", "random"][agent_id % 2]),
|
||||
},
|
||||
"use_pytorch": args.torch,
|
||||
},
|
||||
)
|
||||
|
||||
if args.as_test:
|
||||
check_learning_achieved(results, args.stop_reward)
|
||||
|
||||
ray.shutdown()
|
||||
|
||||
@@ -12,19 +12,27 @@ import argparse
|
||||
import gym
|
||||
|
||||
import ray
|
||||
from ray.rllib.agents.dqn.dqn import DQNTrainer
|
||||
from ray.rllib.agents.dqn.dqn_tf_policy import DQNTFPolicy
|
||||
from ray.rllib.agents.ppo.ppo import PPOTrainer
|
||||
from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy
|
||||
from ray.rllib.agents.dqn import DQNTrainer, DQNTFPolicy, DQNTorchPolicy
|
||||
from ray.rllib.agents.ppo import PPOTrainer, PPOTFPolicy, PPOTorchPolicy
|
||||
from ray.rllib.examples.env.multi_agent import MultiAgentCartPole
|
||||
from ray.tune.logger import pretty_print
|
||||
from ray.tune.registry import register_env
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--num-iters", type=int, default=20)
|
||||
# Use torch for both policies.
|
||||
parser.add_argument("--torch", action="store_true")
|
||||
# Mix PPO=tf and DQN=torch if set.
|
||||
parser.add_argument("--mixed-torch-tf", action="store_true")
|
||||
parser.add_argument("--as-test", action="store_true")
|
||||
parser.add_argument("--stop-iters", type=int, default=20)
|
||||
parser.add_argument("--stop-reward", type=float, default=50)
|
||||
parser.add_argument("--stop-timesteps", type=int, default=100000)
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
assert not (args.torch and args.mixed_torch_tf),\
|
||||
"Use either --torch or --mixed-torch-tf, not both!"
|
||||
|
||||
ray.init()
|
||||
|
||||
# Simple environment with 4 independent cartpole entities
|
||||
@@ -37,8 +45,10 @@ if __name__ == "__main__":
|
||||
# You can also have multiple policies per trainer, but here we just
|
||||
# show one each for PPO and DQN.
|
||||
policies = {
|
||||
"ppo_policy": (PPOTFPolicy, obs_space, act_space, {}),
|
||||
"dqn_policy": (DQNTFPolicy, obs_space, act_space, {}),
|
||||
"ppo_policy": (PPOTorchPolicy if args.torch else PPOTFPolicy,
|
||||
obs_space, act_space, {}),
|
||||
"dqn_policy": (DQNTorchPolicy if args.torch or args.mixed_torch_tf else
|
||||
DQNTFPolicy, obs_space, act_space, {}),
|
||||
}
|
||||
|
||||
def policy_mapping_fn(agent_id):
|
||||
@@ -59,6 +69,7 @@ if __name__ == "__main__":
|
||||
# disable filters, otherwise we would need to synchronize those
|
||||
# as well to the DQN agent
|
||||
"observation_filter": "NoFilter",
|
||||
"use_pytorch": args.torch,
|
||||
})
|
||||
|
||||
dqn_trainer = DQNTrainer(
|
||||
@@ -71,6 +82,7 @@ if __name__ == "__main__":
|
||||
},
|
||||
"gamma": 0.95,
|
||||
"n_step": 3,
|
||||
"use_pytorch": args.torch or args.mixed_torch_tf,
|
||||
})
|
||||
|
||||
# You should see both the printed X and Y approach 200 as this trains:
|
||||
@@ -78,17 +90,31 @@ if __name__ == "__main__":
|
||||
# policy_reward_mean:
|
||||
# dqn_policy: X
|
||||
# ppo_policy: Y
|
||||
for i in range(args.num_iters):
|
||||
for i in range(args.stop_iters):
|
||||
print("== Iteration", i, "==")
|
||||
|
||||
# improve the DQN policy
|
||||
print("-- DQN --")
|
||||
print(pretty_print(dqn_trainer.train()))
|
||||
result_dqn = dqn_trainer.train()
|
||||
print(pretty_print(result_dqn))
|
||||
|
||||
# improve the PPO policy
|
||||
print("-- PPO --")
|
||||
print(pretty_print(ppo_trainer.train()))
|
||||
result_ppo = ppo_trainer.train()
|
||||
print(pretty_print(result_ppo))
|
||||
|
||||
# Test passed gracefully.
|
||||
if args.as_test and \
|
||||
result_dqn["episode_reward_mean"] > args.stop_reward and \
|
||||
result_ppo["episode_reward_mean"] > args.stop_reward:
|
||||
print("test passed (both agents above requested reward)")
|
||||
quit(0)
|
||||
|
||||
# swap weights to synchronize
|
||||
dqn_trainer.set_weights(ppo_trainer.get_weights(["ppo_policy"]))
|
||||
ppo_trainer.set_weights(dqn_trainer.get_weights(["dqn_policy"]))
|
||||
|
||||
# Desired reward not reached.
|
||||
if args.as_test:
|
||||
raise ValueError("Desired reward ({}) not reached!".format(
|
||||
args.stop_reward))
|
||||
|
||||
@@ -1,22 +1,20 @@
|
||||
import argparse
|
||||
from gym.spaces import Dict, Tuple, Box, Discrete
|
||||
import sys
|
||||
|
||||
import ray
|
||||
import ray.tune as tune
|
||||
from ray.tune.registry import register_env
|
||||
from ray.rllib.examples.env.nested_space_repeat_after_me_env import \
|
||||
NestedSpaceRepeatAfterMeEnv
|
||||
from ray.rllib.utils import try_import_tree
|
||||
from ray.rllib.utils.framework import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
tree = try_import_tree()
|
||||
from ray.rllib.utils.test_utils import check_learning_achieved
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--run", type=str, default="PPO")
|
||||
parser.add_argument("--torch", action="store_true")
|
||||
parser.add_argument("--stop", type=int, default=90)
|
||||
parser.add_argument("--max-trainstop", type=int, default=90)
|
||||
parser.add_argument("--as-test", action="store_true")
|
||||
parser.add_argument("--stop-reward", type=float, default=0.0)
|
||||
parser.add_argument("--stop-iters", type=int, default=100)
|
||||
parser.add_argument("--stop-timesteps", type=int, default=100000)
|
||||
parser.add_argument("--num-cpus", type=int, default=0)
|
||||
|
||||
if __name__ == "__main__":
|
||||
@@ -40,19 +38,23 @@ if __name__ == "__main__":
|
||||
},
|
||||
"entropy_coeff": 0.00005, # We don't want high entropy in this Env.
|
||||
"gamma": 0.0, # No history in Env (bandit problem).
|
||||
"lr": 0.0003,
|
||||
"lr": 0.0005,
|
||||
"num_envs_per_worker": 20,
|
||||
"num_sgd_iter": 20,
|
||||
"num_sgd_iter": 4,
|
||||
"num_workers": 0,
|
||||
"use_pytorch": args.torch,
|
||||
"vf_loss_coeff": 0.01,
|
||||
"use_pytorch": args.torch,
|
||||
}
|
||||
|
||||
import ray.rllib.agents.ppo as ppo
|
||||
trainer = ppo.PPOTrainer(config=config)
|
||||
for _ in range(100):
|
||||
results = trainer.train()
|
||||
print(results)
|
||||
if results["episode_reward_mean"] > args.stop:
|
||||
sys.exit(0) # Learnt, exit gracefully.
|
||||
sys.exit(1) # Done, but did not learn, exit with error.
|
||||
stop = {
|
||||
"training_iteration": args.stop_iters,
|
||||
"episode_reward_mean": args.stop_reward,
|
||||
"timesteps_total": args.stop_timesteps,
|
||||
}
|
||||
|
||||
results = tune.run(args.run, config=config, stop=stop)
|
||||
|
||||
if args.as_test:
|
||||
check_learning_achieved(results, args.stop_reward)
|
||||
|
||||
ray.shutdown()
|
||||
|
||||
@@ -15,83 +15,34 @@ Working configurations are given below.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from gym.spaces import Box
|
||||
|
||||
import ray
|
||||
from ray import tune
|
||||
from ray.rllib.agents.dqn.distributional_q_tf_model import \
|
||||
DistributionalQTFModel
|
||||
from ray.tune.registry import register_env
|
||||
from ray.rllib.examples.env.parametric_actions_cartpole import \
|
||||
ParametricActionsCartPole
|
||||
from ray.rllib.examples.models.parametric_actions_model import \
|
||||
ParametricActionsModel, TorchParametricActionsModel
|
||||
from ray.rllib.models import ModelCatalog
|
||||
from ray.rllib.models.tf.fcnet_v2 import FullyConnectedNetwork
|
||||
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
|
||||
from ray.tune.registry import register_env
|
||||
from ray.rllib.utils import try_import_tf
|
||||
|
||||
tf = try_import_tf()
|
||||
from ray.rllib.utils.test_utils import check_learning_achieved
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--stop", type=int, default=200)
|
||||
parser.add_argument("--run", type=str, default="PPO")
|
||||
|
||||
|
||||
class ParametricActionsModel(DistributionalQTFModel, TFModelV2):
|
||||
"""Parametric action model that handles the dot product and masking.
|
||||
|
||||
This assumes the outputs are logits for a single Categorical action dist.
|
||||
Getting this to work with a more complex output (e.g., if the action space
|
||||
is a tuple of several distributions) is also possible but left as an
|
||||
exercise to the reader.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
obs_space,
|
||||
action_space,
|
||||
num_outputs,
|
||||
model_config,
|
||||
name,
|
||||
true_obs_shape=(4, ),
|
||||
action_embed_size=2,
|
||||
**kw):
|
||||
super(ParametricActionsModel, self).__init__(
|
||||
obs_space, action_space, num_outputs, model_config, name, **kw)
|
||||
self.action_embed_model = FullyConnectedNetwork(
|
||||
Box(-1, 1, shape=true_obs_shape), action_space, action_embed_size,
|
||||
model_config, name + "_action_embed")
|
||||
self.register_variables(self.action_embed_model.variables())
|
||||
|
||||
def forward(self, input_dict, state, seq_lens):
|
||||
# Extract the available actions tensor from the observation.
|
||||
avail_actions = input_dict["obs"]["avail_actions"]
|
||||
action_mask = input_dict["obs"]["action_mask"]
|
||||
|
||||
# Compute the predicted action embedding
|
||||
action_embed, _ = self.action_embed_model({
|
||||
"obs": input_dict["obs"]["cart"]
|
||||
})
|
||||
|
||||
# Expand the model output to [BATCH, 1, EMBED_SIZE]. Note that the
|
||||
# avail actions tensor is of shape [BATCH, MAX_ACTIONS, EMBED_SIZE].
|
||||
intent_vector = tf.expand_dims(action_embed, 1)
|
||||
|
||||
# Batch dot product => shape of logits is [BATCH, MAX_ACTIONS].
|
||||
action_logits = tf.reduce_sum(avail_actions * intent_vector, axis=2)
|
||||
|
||||
# Mask out invalid actions (use tf.float32.min for stability)
|
||||
inf_mask = tf.maximum(tf.log(action_mask), tf.float32.min)
|
||||
return action_logits + inf_mask, state
|
||||
|
||||
def value_function(self):
|
||||
return self.action_embed_model.value_function()
|
||||
|
||||
parser.add_argument("--torch", action="store_true")
|
||||
parser.add_argument("--as-test", action="store_true")
|
||||
parser.add_argument("--stop-iters", type=int, default=200)
|
||||
parser.add_argument("--stop-reward", type=float, default=150.0)
|
||||
parser.add_argument("--stop-timesteps", type=int, default=100000)
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
ray.init()
|
||||
|
||||
ModelCatalog.register_custom_model("pa_model", ParametricActionsModel)
|
||||
register_env("pa_cartpole", lambda _: ParametricActionsCartPole(10))
|
||||
ModelCatalog.register_custom_model(
|
||||
"pa_model", TorchParametricActionsModel
|
||||
if args.torch else ParametricActionsModel)
|
||||
|
||||
if args.run == "DQN":
|
||||
cfg = {
|
||||
# TODO(ekl) we need to set these to prevent the masked values
|
||||
@@ -103,16 +54,25 @@ if __name__ == "__main__":
|
||||
}
|
||||
else:
|
||||
cfg = {}
|
||||
tune.run(
|
||||
args.run,
|
||||
stop={
|
||||
"episode_reward_mean": args.stop,
|
||||
|
||||
config = dict({
|
||||
"env": "pa_cartpole",
|
||||
"model": {
|
||||
"custom_model": "pa_model",
|
||||
},
|
||||
config=dict({
|
||||
"env": "pa_cartpole",
|
||||
"model": {
|
||||
"custom_model": "pa_model",
|
||||
},
|
||||
"num_workers": 0,
|
||||
}, **cfg),
|
||||
)
|
||||
"num_workers": 0,
|
||||
"use_pytorch": args.torch,
|
||||
}, **cfg)
|
||||
|
||||
stop = {
|
||||
"training_iteration": args.stop_iters,
|
||||
"timesteps_total": args.stop_timesteps,
|
||||
"episode_reward_mean": args.stop_reward,
|
||||
}
|
||||
|
||||
results = tune.run(args.run, stop=stop, config=config)
|
||||
|
||||
if args.as_test:
|
||||
check_learning_achieved(results, args.stop_reward)
|
||||
|
||||
ray.shutdown()
|
||||
|
||||
@@ -8,99 +8,41 @@ This demonstrates running the following policies in competition:
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import random
|
||||
from gym.spaces import Discrete
|
||||
import random
|
||||
|
||||
from ray import tune
|
||||
from ray.rllib.agents.pg.pg import PGTrainer
|
||||
from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy
|
||||
from ray.rllib.agents.pg import PGTrainer, PGTFPolicy, PGTorchPolicy
|
||||
from ray.rllib.agents.registry import get_agent_class
|
||||
from ray.rllib.examples.env.rock_paper_scissors import RockPaperScissors
|
||||
from ray.rllib.policy.policy import Policy
|
||||
from ray.rllib.utils import try_import_tf
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--stop", type=int, default=1000)
|
||||
from ray.rllib.examples.policy.rock_paper_scissors_dummies import \
|
||||
BeatLastHeuristic, AlwaysSameHeuristic
|
||||
from ray.rllib.utils.framework import try_import_tf, try_import_torch
|
||||
from ray.rllib.utils.test_utils import check_learning_achieved
|
||||
|
||||
tf = try_import_tf()
|
||||
torch, _ = try_import_torch()
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--torch", action="store_true")
|
||||
parser.add_argument("--as-test", action="store_true")
|
||||
parser.add_argument("--stop-iters", type=int, default=150)
|
||||
parser.add_argument("--stop-reward", type=float, default=1000.0)
|
||||
parser.add_argument("--stop-timesteps", type=int, default=100000)
|
||||
|
||||
|
||||
class AlwaysSameHeuristic(Policy):
|
||||
"""Pick a random move and stick with it for the entire episode."""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.exploration = self._create_exploration()
|
||||
|
||||
def get_initial_state(self):
|
||||
return [
|
||||
random.choice([
|
||||
RockPaperScissors.ROCK, RockPaperScissors.PAPER,
|
||||
RockPaperScissors.SCISSORS
|
||||
])
|
||||
]
|
||||
|
||||
def compute_actions(self,
|
||||
obs_batch,
|
||||
state_batches=None,
|
||||
prev_action_batch=None,
|
||||
prev_reward_batch=None,
|
||||
info_batch=None,
|
||||
episodes=None,
|
||||
**kwargs):
|
||||
return state_batches[0], state_batches, {}
|
||||
|
||||
def learn_on_batch(self, samples):
|
||||
pass
|
||||
|
||||
def get_weights(self):
|
||||
pass
|
||||
|
||||
def set_weights(self, weights):
|
||||
pass
|
||||
|
||||
|
||||
class BeatLastHeuristic(Policy):
|
||||
"""Play the move that would beat the last move of the opponent."""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.exploration = self._create_exploration()
|
||||
|
||||
def compute_actions(self,
|
||||
obs_batch,
|
||||
state_batches=None,
|
||||
prev_action_batch=None,
|
||||
prev_reward_batch=None,
|
||||
info_batch=None,
|
||||
episodes=None,
|
||||
**kwargs):
|
||||
def successor(x):
|
||||
if x[RockPaperScissors.ROCK] == 1:
|
||||
return RockPaperScissors.PAPER
|
||||
elif x[RockPaperScissors.PAPER] == 1:
|
||||
return RockPaperScissors.SCISSORS
|
||||
elif x[RockPaperScissors.SCISSORS] == 1:
|
||||
return RockPaperScissors.ROCK
|
||||
|
||||
return [successor(x) for x in obs_batch], [], {}
|
||||
|
||||
def learn_on_batch(self, samples):
|
||||
pass
|
||||
|
||||
def get_weights(self):
|
||||
pass
|
||||
|
||||
def set_weights(self, weights):
|
||||
pass
|
||||
|
||||
|
||||
def run_same_policy(args):
|
||||
def run_same_policy(args, stop):
|
||||
"""Use the same policy for both agents (trivial case)."""
|
||||
config = {
|
||||
"env": RockPaperScissors,
|
||||
"use_pytorch": args.torch,
|
||||
}
|
||||
|
||||
tune.run(
|
||||
"PG",
|
||||
stop={"timesteps_total": args.stop},
|
||||
config={"env": RockPaperScissors})
|
||||
results = tune.run("PG", config=config, stop=stop)
|
||||
|
||||
if args.as_test:
|
||||
# Check vs 0.0 as we are playing a zero-sum game.
|
||||
check_learning_achieved(results, 0.0)
|
||||
|
||||
|
||||
def run_heuristic_vs_learned(args, use_lstm=False, trainer="PG"):
|
||||
@@ -134,16 +76,35 @@ def run_heuristic_vs_learned(args, use_lstm=False, trainer="PG"):
|
||||
"learned": (None, Discrete(3), Discrete(3), {
|
||||
"model": {
|
||||
"use_lstm": use_lstm
|
||||
}
|
||||
},
|
||||
"use_pytorch": args.torch,
|
||||
}),
|
||||
},
|
||||
"policy_mapping_fn": select_policy,
|
||||
},
|
||||
"use_pytorch": args.torch,
|
||||
}
|
||||
tune.run(trainer, stop={"timesteps_total": args.stop}, config=config)
|
||||
cls = get_agent_class(trainer) if isinstance(trainer, str) else trainer
|
||||
trainer_obj = cls(config=config)
|
||||
env = trainer_obj.workers.local_worker().env
|
||||
for _ in range(args.stop_iters):
|
||||
results = trainer_obj.train()
|
||||
print(results)
|
||||
# Timesteps reached.
|
||||
if results["timesteps_total"] > args.stop_timesteps:
|
||||
break
|
||||
# Reward (difference) reached -> all good, return.
|
||||
elif env.player1_score - env.player2_score > args.stop_reward:
|
||||
return
|
||||
|
||||
# Reward (difference) not reached: Error if `as_test`.
|
||||
if args.as_test:
|
||||
raise ValueError(
|
||||
"Desired reward difference ({}) not reached! Only got to {}.".
|
||||
format(args.stop_reward, env.player1_score - env.player2_score))
|
||||
|
||||
|
||||
def run_with_custom_entropy_loss(args):
|
||||
def run_with_custom_entropy_loss(args, stop):
|
||||
"""Example of customizing the loss function of an existing policy.
|
||||
|
||||
This performs about the same as the default loss does."""
|
||||
@@ -151,24 +112,44 @@ def run_with_custom_entropy_loss(args):
|
||||
def entropy_policy_gradient_loss(policy, model, dist_class, train_batch):
|
||||
logits, _ = model.from_batch(train_batch)
|
||||
action_dist = dist_class(logits, model)
|
||||
return (-0.1 * action_dist.entropy() - tf.reduce_mean(
|
||||
action_dist.logp(train_batch["actions"]) *
|
||||
train_batch["advantages"]))
|
||||
if args.torch:
|
||||
# required by PGTorchPolicy's stats fn.
|
||||
policy.pi_err = torch.tensor([0.0])
|
||||
return torch.mean(-0.1 * action_dist.entropy() -
|
||||
(action_dist.logp(train_batch["actions"]) *
|
||||
train_batch["advantages"]))
|
||||
else:
|
||||
return (-0.1 * action_dist.entropy() - tf.reduce_mean(
|
||||
action_dist.logp(train_batch["actions"]) *
|
||||
train_batch["advantages"]))
|
||||
|
||||
EntropyPolicy = PGTFPolicy.with_updates(
|
||||
policy_cls = PGTorchPolicy if args.torch else PGTFPolicy
|
||||
EntropyPolicy = policy_cls.with_updates(
|
||||
loss_fn=entropy_policy_gradient_loss)
|
||||
|
||||
EntropyLossPG = PGTrainer.with_updates(
|
||||
name="EntropyPG", get_policy_class=lambda _: EntropyPolicy)
|
||||
|
||||
run_heuristic_vs_learned(args, use_lstm=True, trainer=EntropyLossPG)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
|
||||
stop = {
|
||||
"training_iteration": args.stop_iters,
|
||||
"timesteps_total": args.stop_timesteps,
|
||||
"episode_reward_mean": args.stop_reward,
|
||||
}
|
||||
|
||||
run_same_policy(args, stop=stop)
|
||||
print("run_same_policy: ok.")
|
||||
|
||||
run_heuristic_vs_learned(args, use_lstm=False)
|
||||
print("run_heuristic_vs_learned(w/o lstm): ok.")
|
||||
run_same_policy(args)
|
||||
print("run_same_policy: ok.")
|
||||
|
||||
run_heuristic_vs_learned(args, use_lstm=True)
|
||||
print("run_heuristic_vs_learned (w/ lstm): ok.")
|
||||
run_with_custom_entropy_loss(args)
|
||||
|
||||
run_with_custom_entropy_loss(args, stop=stop)
|
||||
print("run_with_custom_entropy_loss: ok.")
|
||||
|
||||
@@ -13,8 +13,8 @@ import ray
|
||||
from ray import tune
|
||||
from ray.rllib.evaluation import RolloutWorker
|
||||
from ray.rllib.evaluation.metrics import collect_metrics
|
||||
from ray.rllib.policy.policy import Policy
|
||||
from ray.rllib.policy.sample_batch import SampleBatch
|
||||
from ray.rllib.policy.tests.test_policy import TestPolicy
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--gpu", action="store_true")
|
||||
@@ -23,7 +23,7 @@ parser.add_argument("--num-workers", type=int, default=2)
|
||||
parser.add_argument("--num-cpus", type=int, default=0)
|
||||
|
||||
|
||||
class CustomPolicy(TestPolicy):
|
||||
class CustomPolicy(Policy):
|
||||
"""Example of a custom policy written from scratch.
|
||||
|
||||
You might find it more convenient to extend TF/TorchPolicy instead
|
||||
|
||||
@@ -13,8 +13,10 @@ from ray import tune
|
||||
from ray.rllib.agents.trainer_template import build_trainer
|
||||
from ray.rllib.agents.dqn.dqn import DEFAULT_CONFIG as DQN_CONFIG
|
||||
from ray.rllib.agents.dqn.dqn_tf_policy import DQNTFPolicy
|
||||
from ray.rllib.agents.dqn.dqn_torch_policy import DQNTorchPolicy
|
||||
from ray.rllib.agents.ppo.ppo import DEFAULT_CONFIG as PPO_CONFIG
|
||||
from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy
|
||||
from ray.rllib.agents.ppo.ppo_torch_policy import PPOTorchPolicy
|
||||
from ray.rllib.evaluation.worker_set import WorkerSet
|
||||
from ray.rllib.execution.common import _get_shared_metrics
|
||||
from ray.rllib.execution.concurrency_ops import Concurrently
|
||||
@@ -25,10 +27,16 @@ from ray.rllib.execution.replay_ops import StoreToReplayBuffer, Replay
|
||||
from ray.rllib.execution.train_ops import TrainOneStep, UpdateTargetNetwork
|
||||
from ray.rllib.examples.env.multi_agent import MultiAgentCartPole
|
||||
from ray.rllib.optimizers.async_replay_optimizer import LocalReplayBuffer
|
||||
from ray.rllib.utils.test_utils import check_learning_achieved
|
||||
from ray.tune.registry import register_env
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--num-iters", type=int, default=20)
|
||||
parser.add_argument("--as-test", action="store_true")
|
||||
parser.add_argument("--torch", action="store_true")
|
||||
parser.add_argument("--mixed-torch-tf", action="store_true")
|
||||
parser.add_argument("--stop-iters", type=int, default=20)
|
||||
parser.add_argument("--stop-reward", type=float, default=150.0)
|
||||
parser.add_argument("--stop-timesteps", type=int, default=100000)
|
||||
|
||||
|
||||
def custom_training_workflow(workers: WorkerSet, config: dict):
|
||||
@@ -90,6 +98,9 @@ def custom_training_workflow(workers: WorkerSet, config: dict):
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
assert not (args.torch and args.mixed_torch_tf),\
|
||||
"Use either --torch or --mixed-torch-tf, not both!"
|
||||
|
||||
ray.init()
|
||||
|
||||
# Simple environment with 4 independent cartpole entities
|
||||
@@ -102,8 +113,10 @@ if __name__ == "__main__":
|
||||
# Note that since the trainer below does not include a default policy or
|
||||
# policy configs, we have to explicitly set it in the multiagent config:
|
||||
policies = {
|
||||
"ppo_policy": (PPOTFPolicy, obs_space, act_space, PPO_CONFIG),
|
||||
"dqn_policy": (DQNTFPolicy, obs_space, act_space, DQN_CONFIG),
|
||||
"ppo_policy": (PPOTorchPolicy if args.torch or args.mixed_torch_tf else
|
||||
PPOTFPolicy, obs_space, act_space, PPO_CONFIG),
|
||||
"dqn_policy": (DQNTorchPolicy if args.torch else DQNTFPolicy,
|
||||
obs_space, act_space, DQN_CONFIG),
|
||||
}
|
||||
|
||||
def policy_mapping_fn(agent_id):
|
||||
@@ -117,16 +130,27 @@ if __name__ == "__main__":
|
||||
default_policy=None,
|
||||
execution_plan=custom_training_workflow)
|
||||
|
||||
tune.run(
|
||||
MyTrainer,
|
||||
stop={"training_iteration": args.num_iters},
|
||||
config={
|
||||
"rollout_fragment_length": 50,
|
||||
"num_workers": 0,
|
||||
"env": "multi_agent_cartpole",
|
||||
"multiagent": {
|
||||
"policies": policies,
|
||||
"policy_mapping_fn": policy_mapping_fn,
|
||||
"policies_to_train": ["dqn_policy", "ppo_policy"],
|
||||
},
|
||||
})
|
||||
config = {
|
||||
"rollout_fragment_length": 50,
|
||||
"num_workers": 0,
|
||||
"env": "multi_agent_cartpole",
|
||||
"multiagent": {
|
||||
"policies": policies,
|
||||
"policy_mapping_fn": policy_mapping_fn,
|
||||
"policies_to_train": ["dqn_policy", "ppo_policy"],
|
||||
},
|
||||
"use_pytorch": args.torch,
|
||||
}
|
||||
|
||||
stop = {
|
||||
"training_iteration": args.stop_iters,
|
||||
"timesteps_total": args.stop_timesteps,
|
||||
"episode_reward_mean": args.stop_reward,
|
||||
}
|
||||
|
||||
results = tune.run(MyTrainer, config=config, stop=stop)
|
||||
|
||||
if args.as_test:
|
||||
check_learning_achieved(results, args.stop_reward)
|
||||
|
||||
ray.shutdown()
|
||||
|
||||
@@ -17,11 +17,15 @@ from ray import tune
|
||||
from ray.tune import register_env, grid_search
|
||||
from ray.rllib.env.multi_agent_env import ENV_STATE
|
||||
from ray.rllib.examples.env.two_step_game import TwoStepGame
|
||||
from ray.rllib.utils.test_utils import check_learning_achieved
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--stop", type=int, default=50000)
|
||||
parser.add_argument("--run", type=str, default="PG")
|
||||
parser.add_argument("--num-cpus", type=int, default=0)
|
||||
parser.add_argument("--as-test", action="store_true")
|
||||
parser.add_argument("--torch", action="store_true")
|
||||
parser.add_argument("--stop-reward", type=float, default=7.0)
|
||||
parser.add_argument("--stop-timesteps", type=int, default=50000)
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
@@ -73,6 +77,7 @@ if __name__ == "__main__":
|
||||
},
|
||||
"policy_mapping_fn": lambda x: "pol1" if x == 0 else "pol2",
|
||||
},
|
||||
"use_pytorch": args.torch,
|
||||
}
|
||||
group = False
|
||||
elif args.run == "QMIX":
|
||||
@@ -87,6 +92,7 @@ if __name__ == "__main__":
|
||||
"separate_state_space": True,
|
||||
"one_hot_state_encoding": True
|
||||
},
|
||||
"use_pytorch": args.torch,
|
||||
}
|
||||
group = True
|
||||
elif args.run == "APEX_QMIX":
|
||||
@@ -107,6 +113,7 @@ if __name__ == "__main__":
|
||||
"separate_state_space": True,
|
||||
"one_hot_state_encoding": True
|
||||
},
|
||||
"use_pytorch": args.torch,
|
||||
}
|
||||
group = True
|
||||
else:
|
||||
@@ -114,12 +121,19 @@ if __name__ == "__main__":
|
||||
group = False
|
||||
|
||||
ray.init(num_cpus=args.num_cpus or None)
|
||||
tune.run(
|
||||
args.run,
|
||||
stop={
|
||||
"timesteps_total": args.stop,
|
||||
},
|
||||
config=dict(config, **{
|
||||
"env": "grouped_twostep" if group else TwoStepGame,
|
||||
}),
|
||||
)
|
||||
|
||||
stop = {
|
||||
"episode_reward_mean": args.stop_reward,
|
||||
"timesteps_total": args.stop_timesteps,
|
||||
}
|
||||
|
||||
config = dict(config, **{
|
||||
"env": "grouped_twostep" if group else TwoStepGame,
|
||||
})
|
||||
|
||||
results = tune.run(args.run, stop=stop, config=config)
|
||||
|
||||
if args.as_test:
|
||||
check_learning_achieved(results, args.stop_reward)
|
||||
|
||||
ray.shutdown()
|
||||
|
||||
@@ -7,16 +7,16 @@ from ray.tune.registry import register_env
|
||||
from ray.rllib.agents.pg import PGTrainer
|
||||
from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy
|
||||
from ray.rllib.agents.dqn.dqn_tf_policy import DQNTFPolicy
|
||||
from ray.rllib.env.base_env import _MultiAgentEnvToBaseEnv
|
||||
from ray.rllib.evaluation.rollout_worker import RolloutWorker
|
||||
from ray.rllib.evaluation.metrics import collect_metrics
|
||||
from ray.rllib.evaluation.worker_set import WorkerSet
|
||||
from ray.rllib.examples.env.multi_agent import MultiAgentCartPole, \
|
||||
BasicMultiAgent, EarlyDoneMultiAgent, RoundRobinMultiAgent
|
||||
from ray.rllib.examples.policy.random_policy import RandomPolicy
|
||||
from ray.rllib.optimizers import (SyncSamplesOptimizer, SyncReplayOptimizer,
|
||||
AsyncGradientsOptimizer)
|
||||
from ray.rllib.tests.test_rollout_worker import MockPolicy
|
||||
from ray.rllib.evaluation.rollout_worker import RolloutWorker
|
||||
from ray.rllib.policy.tests.test_policy import TestPolicy
|
||||
from ray.rllib.evaluation.metrics import collect_metrics
|
||||
from ray.rllib.evaluation.worker_set import WorkerSet
|
||||
from ray.rllib.env.base_env import _MultiAgentEnvToBaseEnv
|
||||
|
||||
|
||||
def one_hot(i, n):
|
||||
@@ -297,7 +297,7 @@ class TestMultiAgentEnv(unittest.TestCase):
|
||||
def test_custom_rnn_state_values(self):
|
||||
h = {"some": {"arbitrary": "structure", "here": [1, 2, 3]}}
|
||||
|
||||
class StatefulPolicy(TestPolicy):
|
||||
class StatefulPolicy(RandomPolicy):
|
||||
def compute_actions(self,
|
||||
obs_batch,
|
||||
state_batches=None,
|
||||
|
||||
@@ -13,13 +13,13 @@ from ray.rllib.env.vector_env import VectorEnv
|
||||
from ray.rllib.evaluation.rollout_worker import RolloutWorker
|
||||
from ray.rllib.evaluation.metrics import collect_metrics
|
||||
from ray.rllib.evaluation.postprocessing import compute_advantages
|
||||
from ray.rllib.policy.tests.test_policy import TestPolicy
|
||||
from ray.rllib.examples.policy.random_policy import RandomPolicy
|
||||
from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, SampleBatch
|
||||
from ray.rllib.utils.test_utils import check
|
||||
from ray.tune.registry import register_env
|
||||
|
||||
|
||||
class MockPolicy(TestPolicy):
|
||||
class MockPolicy(RandomPolicy):
|
||||
def compute_actions(self,
|
||||
obs_batch,
|
||||
state_batches=None,
|
||||
@@ -40,7 +40,7 @@ class MockPolicy(TestPolicy):
|
||||
batch, 100.0, 0.9, use_gae=False, use_critic=False)
|
||||
|
||||
|
||||
class BadPolicy(MockPolicy):
|
||||
class BadPolicy(RandomPolicy):
|
||||
def compute_actions(self,
|
||||
obs_batch,
|
||||
state_batches=None,
|
||||
|
||||
Reference in New Issue
Block a user