[RLlib] Examples folder restructuring (Model examples; final part). (#8278)

- This PR completes any previously missing PyTorch Model counterparts to TFModels in examples/models.
- It also makes sure, all example scripts in the rllib/examples folder are tested for both frameworks and learn the given task (this is often currently not checked) using a --as-test flag in connection with a --stop-reward.
This commit is contained in:
Sven Mika
2020-05-12 08:23:10 +02:00
committed by GitHub
parent 9d012626e5
commit 57544b1ff9
41 changed files with 1466 additions and 1584 deletions
+7 -7
View File
@@ -219,14 +219,14 @@ matrix:
- . ./ci/travis/ci.sh build
script:
- ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=examples_A,examples_B rllib/...
- ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=examples_C rllib/...
- ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=examples_E,examples_L,examples_M,examples_N,examples_P rllib/...
- ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=examples_U,examples_R,examples_S,examples_T rllib/...
- ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=examples_C,examples_D rllib/...
- ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=examples_E,examples_F,examples_G,examples_H,examples_I,examples_J,examples_K,examples_L,examples_M,examples_N,examples_O,examples_P rllib/...
- ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=examples_Q,examples_R,examples_S,examples_T,examples_U,examples_V,examples_W,examples_X,examples_Y,examples_Z rllib/...
# RLlib: tests_dir: Everything in rllib/tests/ directory (A-I).
- os: linux
env:
- RLLIB_TESTING=1 RLLIB_TESTS_DIR_TESTS_A_TO_I=1
- RLLIB_TESTING=1 RLLIB_TESTS_DIR_TESTS_A_TO_L=1
- PYTHON=3.6
- TF_VERSION=2.0.0b1
- TFP_VERSION=0.8
@@ -237,12 +237,12 @@ matrix:
before_script:
- . ./ci/travis/ci.sh build
script:
- ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=tests_dir_A,tests_dir_B,tests_dir_C,tests_dir_D,tests_dir_E,tests_dir_F,tests_dir_G,tests_dir_H,tests_dir_I rllib/...
- ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=tests_dir_A,tests_dir_B,tests_dir_C,tests_dir_D,tests_dir_E,tests_dir_F,tests_dir_G,tests_dir_H,tests_dir_I,tests_dir_J,tests_dir_K,tests_dir_L rllib/...
# RLlib: tests_dir: Everything in rllib/tests/ directory (J-Z).
- os: linux
env:
- RLLIB_TESTING=1 RLLIB_TESTS_DIR_TESTS_J_TO_Z=1
- RLLIB_TESTING=1 RLLIB_TESTS_DIR_TESTS_M_TO_Z=1
- PYTHON=3.6
- TF_VERSION=2.0.0b1
- TFP_VERSION=0.8
@@ -253,7 +253,7 @@ matrix:
before_script:
- . ./ci/travis/ci.sh build
script:
- ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=tests_dir_J,tests_dir_K,tests_dir_L,tests_dir_M,tests_dir_N,tests_dir_O,tests_dir_P,tests_dir_Q,tests_dir_R,tests_dir_S,tests_dir_T,tests_dir_U,tests_dir_V,tests_dir_W,tests_dir_X,tests_dir_Y,tests_dir_Z rllib/...
- ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=tests_dir_M,tests_dir_N,tests_dir_O,tests_dir_P,tests_dir_Q,tests_dir_R,tests_dir_S,tests_dir_T,tests_dir_U,tests_dir_V,tests_dir_W,tests_dir_X,tests_dir_Y,tests_dir_Z rllib/...
# Cpp worker test
- os: linux
+378 -122
View File
@@ -27,8 +27,8 @@
# 2) everything in a) using tf1.x
# 3) everything in b) c) d) and e)
# 4) everything in g)
# 5) f), BUT only those tagged `tests_dir_A` to `tests_dir_I`
# 6) f), BUT only those tagged `tests_dir_J` to `tests_dir_Z`
# 5) f), BUT only those tagged `tests_dir_A` to `tests_dir_L`
# 6) f), BUT only those tagged `tests_dir_M` to `tests_dir_Z`
# --------------------------------------------------------------------
@@ -1453,95 +1453,215 @@ py_test(
py_test(
name = "examples/autoregressive_action_dist", main = "examples/autoregressive_action_dist.py",
name = "examples/autoregressive_action_dist_tf",
main = "examples/autoregressive_action_dist.py",
tags = ["examples", "examples_A"],
size = "large",
size = "medium",
srcs = ["examples/autoregressive_action_dist.py"],
args = ["--stop=150", "--num-cpus=4"]
args = ["--as-test", "--stop-reward=150", "--num-cpus=4"]
)
py_test(
name = "examples/batch_norm_model_ppo", main="examples/batch_norm_model.py",
name = "examples/autoregressive_action_dist_torch",
main = "examples/autoregressive_action_dist.py",
tags = ["examples", "examples_A"],
size = "medium",
srcs = ["examples/autoregressive_action_dist.py"],
args = ["--as-test", "--torch", "--stop-reward=150", "--num-cpus=4"]
)
py_test(
name = "examples/batch_norm_model_ppo_tf",
main = "examples/batch_norm_model.py",
tags = ["examples", "examples_B"],
size = "medium",
srcs = ["examples/batch_norm_model.py"],
args = ["--run=PPO", "--num-iters=1"]
args = ["--as-test", "--run=PPO", "--stop-reward=80"]
)
py_test(
name = "examples/batch_norm_model_pg", main="examples/batch_norm_model.py",
name = "examples/batch_norm_model_ppo_torch",
main = "examples/batch_norm_model.py",
tags = ["examples", "examples_B"],
size = "medium",
srcs = ["examples/batch_norm_model.py"],
args = ["--run=PG", "--num-iters=1"]
args = ["--as-test", "--torch", "--run=PPO", "--stop-reward=80"]
)
py_test(
name = "examples/batch_norm_model_dqn", main="examples/batch_norm_model.py",
name = "examples/batch_norm_model_dqn_tf",
main = "examples/batch_norm_model.py",
tags = ["examples", "examples_B"],
size = "medium",
size = "medium", # DQN learns much slower with BatchNorm.
srcs = ["examples/batch_norm_model.py"],
args = ["--run=DQN", "--num-iters=1"]
args = ["--as-test", "--run=DQN", "--stop-reward=70"]
)
py_test(
name = "examples/batch_norm_model_ddpg", main="examples/batch_norm_model.py",
name = "examples/batch_norm_model_dqn_torch",
main = "examples/batch_norm_model.py",
tags = ["examples", "examples_B"],
size = "medium",
size = "medium", # DQN learns much slower with BatchNorm.
srcs = ["examples/batch_norm_model.py"],
args = ["--run=DDPG", "--num-iters=1"]
args = ["--as-test", "--torch", "--run=DQN", "--stop-reward=70"]
)
py_test(
name = "examples/cartpole_lstm_impala", main="examples/cartpole_lstm.py",
name = "examples/batch_norm_model_ddpg_tf",
main = "examples/batch_norm_model.py",
tags = ["examples", "examples_B"],
size = "small",
srcs = ["examples/batch_norm_model.py"],
args = ["--run=DDPG", "--stop-iters=1"]
)
py_test(
name = "examples/batch_norm_model_ddpg_torch",
main = "examples/batch_norm_model.py",
tags = ["examples", "examples_B"],
size = "small",
srcs = ["examples/batch_norm_model.py"],
args = ["--torch", "--run=DDPG", "--stop-iters=1"]
)
py_test(
name = "examples/cartpole_lstm_impala_tf",
main = "examples/cartpole_lstm.py",
tags = ["examples", "examples_C"],
size = "small",
srcs = ["examples/cartpole_lstm.py"],
args = ["--as-test", "--run=IMPALA", "--stop-reward=40", "--num-cpus=4"]
)
py_test(
name = "examples/cartpole_lstm_impala_torch",
main = "examples/cartpole_lstm.py",
tags = ["examples", "examples_C"],
size = "medium",
srcs = ["examples/cartpole_lstm.py"],
args = ["--run=IMPALA", "--stop=40", "--num-cpus=4"]
args = ["--as-test", "--torch", "--run=IMPALA", "--stop-reward=40", "--num-cpus=4"]
)
py_test(
name = "examples/cartpole_lstm_ppo", main="examples/cartpole_lstm.py",
name = "examples/cartpole_lstm_ppo_tf",
main = "examples/cartpole_lstm.py",
tags = ["examples", "examples_C"],
size = "medium",
srcs = ["examples/cartpole_lstm.py"],
args = ["--run=PPO", "--stop=40", "--num-cpus=4"]
args = ["--as-test", "--run=PPO", "--stop-reward=40", "--num-cpus=4"]
)
py_test(
name = "examples/cartpole_lstm_ppo_with_prev_a_and_r", main="examples/cartpole_lstm.py",
name = "examples/cartpole_lstm_ppo_torch",
main = "examples/cartpole_lstm.py",
tags = ["examples", "examples_C"],
size = "large",
size = "small",
srcs = ["examples/cartpole_lstm.py"],
args = ["--run=PPO", "--stop=40", "--use-prev-action-reward", "--num-cpus=4"]
args = ["--as-test", "--torch", "--run=PPO", "--stop-reward=40", "--num-cpus=4"]
)
py_test(
name = "examples/centralized_critic",
name = "examples/cartpole_lstm_ppo_tf_with_prev_a_and_r",
main = "examples/cartpole_lstm.py",
tags = ["examples", "examples_C"],
size = "medium",
srcs = ["examples/cartpole_lstm.py"],
args = ["--as-test", "--run=PPO", "--stop-reward=40", "--use-prev-action-reward", "--num-cpus=4"]
)
py_test(
name = "examples/centralized_critic_tf",
main = "examples/centralized_critic.py",
tags = ["examples", "examples_C"],
size = "medium",
srcs = ["examples/centralized_critic.py"],
args = ["--stop=2000"]
args = ["--as-test", "--stop-reward=7.2"]
)
py_test(
name = "examples/centralized_critic_2",
name = "examples/centralized_critic_torch",
main = "examples/centralized_critic.py",
tags = ["examples", "examples_C"],
size = "medium",
srcs = ["examples/centralized_critic.py"],
args = ["--as-test", "--torch", "--stop-reward=7.2"]
)
py_test(
name = "examples/centralized_critic_2_tf",
main = "examples/centralized_critic_2.py",
tags = ["examples", "examples_C"],
size = "medium",
srcs = ["examples/centralized_critic_2.py"],
args = ["--stop=2000"]
args = ["--as-test", "--stop-reward=6.0"]
)
py_test(
name = "examples/custom_eval", main = "examples/custom_eval.py",
name = "examples/centralized_critic_2_torch",
main = "examples/centralized_critic_2.py",
tags = ["examples", "examples_C"],
size = "medium",
srcs = ["examples/custom_eval.py"],
args = ["--custom-eval", "--num-cpus=4"]
srcs = ["examples/centralized_critic_2.py"],
args = ["--as-test", "--torch", "--stop-reward=6.0"]
)
py_test(
name = "examples/custom_keras_model_a2c", main="examples/custom_keras_model.py",
name = "examples/custom_env_tf",
main = "examples/custom_env.py",
tags = ["examples", "examples_C"],
size = "medium",
srcs = ["examples/custom_env.py"],
args = ["--as-test"]
)
py_test(
name = "examples/custom_env_torch",
main = "examples/custom_env.py",
tags = ["examples", "examples_C"],
size = "medium",
srcs = ["examples/custom_env.py"],
args = ["--as-test", "--torch"]
)
py_test(
name = "examples/custom_eval_tf",
main = "examples/custom_eval.py",
tags = ["examples", "examples_C"],
size = "small",
srcs = ["examples/custom_eval.py"],
args = ["--num-cpus=4"]
)
py_test(
name = "examples/custom_eval_torch",
main = "examples/custom_eval.py",
tags = ["examples", "examples_C"],
size = "small",
srcs = ["examples/custom_eval.py"],
args = ["--torch", "--num-cpus=4"]
)
py_test(
name = "examples/custom_fast_model_tf",
main = "examples/custom_fast_model.py",
tags = ["examples", "examples_C"],
size = "small",
srcs = ["examples/custom_fast_model.py"],
args = ["--stop-iters=1", "--num-cpus=4"]
)
py_test(
name = "examples/custom_fast_model_torch",
main = "examples/custom_fast_model.py",
tags = ["examples", "examples_C"],
size = "small",
srcs = ["examples/custom_fast_model.py"],
args = ["--torch", "--stop-iters=1", "--num-cpus=4"]
)
py_test(
name = "examples/custom_keras_model_a2c",
main = "examples/custom_keras_model.py",
tags = ["examples", "examples_C"],
size = "large",
srcs = ["examples/custom_keras_model.py"],
@@ -1549,7 +1669,8 @@ py_test(
)
py_test(
name = "examples/custom_keras_model_dqn", main="examples/custom_keras_model.py",
name = "examples/custom_keras_model_dqn",
main = "examples/custom_keras_model.py",
tags = ["examples", "examples_C"],
size = "medium",
srcs = ["examples/custom_keras_model.py"],
@@ -1557,7 +1678,8 @@ py_test(
)
py_test(
name = "examples/custom_keras_model_ppo", main="examples/custom_keras_model.py",
name = "examples/custom_keras_model_ppo",
main = "examples/custom_keras_model.py",
tags = ["examples", "examples_C"],
size = "medium",
srcs = ["examples/custom_keras_model.py"],
@@ -1565,46 +1687,79 @@ py_test(
)
py_test(
name = "examples/custom_keras_rnn_model_repeat_after_me", main = "examples/custom_keras_rnn_model.py",
tags = ["examples", "examples_C"],
size = "large",
srcs = ["examples/custom_keras_rnn_model.py"],
args = ["--run=PPO", "--stop=50", "--env=RepeatAfterMeEnv", "--num-cpus=4"]
)
py_test(
name = "examples/custom_keras_rnn_model_repeat_initial",
main = "examples/custom_keras_rnn_model.py",
tags = ["examples", "examples_C"],
size = "large",
srcs = ["examples/custom_keras_rnn_model.py"],
args = ["--run=PPO", "--stop=50", "--env=RepeatInitialObsEnv", "--num-cpus=4"]
)
py_test(
name = "examples/custom_loss",
name = "examples/custom_loss_tf",
main = "examples/custom_loss.py",
tags = ["examples", "examples_C"],
size = "small",
# Include the json data file.
data = glob(["tests/data/cartpole_small/**"]),
srcs = ["examples/custom_loss.py"],
args = ["--iters=2", "--input-files=tests/data/cartpole_small"]
args = ["--stop-iters=2", "--input-files=tests/data/cartpole_small"]
)
py_test(
name = "examples/custom_loss_torch",
main = "examples/custom_loss.py",
tags = ["examples", "examples_C"],
size = "small",
# Include the json data file.
data = glob(["tests/data/cartpole_small/**"]),
srcs = ["examples/custom_loss.py"],
args = ["--torch", "--stop-iters=2", "--input-files=tests/data/cartpole_small"]
)
py_test(
name = "examples/custom_metrics_and_callbacks",
main = "examples/custom_metrics_and_callbacks.py",
tags = ["examples", "examples_C"],
size = "small",
srcs = ["examples/custom_metrics_and_callbacks.py"],
args = ["--num-iters=2"]
args = ["--stop-iters=2"]
)
py_test(
name = "examples/custom_metrics_and_callbacks_legacy",
main = "examples/custom_metrics_and_callbacks_legacy.py",
tags = ["examples", "examples_C"],
size = "small",
srcs = ["examples/custom_metrics_and_callbacks_legacy.py"],
args = ["--num-iters=2"]
args = ["--stop-iters=2"]
)
py_test(
name = "examples/custom_rnn_model_repeat_after_me_tf",
main = "examples/custom_rnn_model.py",
tags = ["examples", "examples_C"],
size = "medium",
srcs = ["examples/custom_rnn_model.py"],
args = ["--as-test", "--run=PPO", "--stop-reward=40", "--env=RepeatAfterMeEnv", "--num-cpus=4"]
)
py_test(
name = "examples/custom_rnn_model_repeat_initial_obs_tf",
main = "examples/custom_rnn_model.py",
tags = ["examples", "examples_C"],
size = "medium",
srcs = ["examples/custom_rnn_model.py"],
args = ["--as-test", "--run=PPO", "--stop-reward=10", "--stop-timesteps=300000", "--env=RepeatInitialObsEnv", "--num-cpus=4"]
)
py_test(
name = "examples/custom_rnn_model_repeat_after_me_torch",
main = "examples/custom_rnn_model.py",
tags = ["examples", "examples_C"],
size = "medium",
srcs = ["examples/custom_rnn_model.py"],
args = ["--as-test", "--torch", "--run=PPO", "--stop-reward=40", "--env=RepeatAfterMeEnv", "--num-cpus=4"]
)
py_test(
name = "examples/custom_rnn_model_repeat_initial_obs_torch",
main = "examples/custom_rnn_model.py",
tags = ["examples", "examples_C"],
size = "medium",
srcs = ["examples/custom_rnn_model.py"],
args = ["--as-test", "--torch", "--run=PPO", "--stop-reward=10", "--stop-timesteps=300000", "--env=RepeatInitialObsEnv", "--num-cpus=4"]
)
py_test(
@@ -1612,16 +1767,7 @@ py_test(
tags = ["examples", "examples_C"],
size = "medium",
srcs = ["examples/custom_tf_policy.py"],
args = ["--iters=2", "--num-cpus=4"]
)
py_test(
name = "examples/custom_torch_rnn_model",
main = "examples/custom_torch_rnn_model.py",
tags = ["examples", "examples_C"],
size = "medium",
srcs = ["examples/custom_torch_rnn_model.py"],
args = ["--run=PPO", "--stop=90", "--num-cpus=4"]
args = ["--stop-iters=2", "--num-cpus=4"]
)
py_test(
@@ -1629,7 +1775,7 @@ py_test(
tags = ["examples", "examples_C"],
size = "small",
srcs = ["examples/custom_torch_policy.py"],
args = ["--iters=2", "--num-cpus=4"]
args = ["--stop-iters=2", "--num-cpus=4"]
)
py_test(
@@ -1637,90 +1783,151 @@ py_test(
tags = ["examples", "examples_E"],
size = "small",
srcs = ["examples/eager_execution.py"],
args = ["--iters=2"]
args = ["--stop-iters=2"]
)
py_test(
name = "examples/hierarchical_training_tf",
main = "examples/hierarchical_training.py",
tags = ["examples", "examples_H"],
size = "small",
size = "medium",
srcs = ["examples/hierarchical_training.py"],
args = ["--stop-reward=0.0"]
)
py_test(
name = "examples/hierarchical_training_torch",
main = "examples/hierarchical_training.py",
tags = ["examples", "examples_H"],
size = "small",
size = "medium",
srcs = ["examples/hierarchical_training.py"],
args = ["--torch", "--stop-reward=0.0"]
)
# Do not run this test (MobileNetV2 is gigantic and takes forever for 1 iter).
# py_test(
# name = "examples/mobilenet_v2_with_lstm_tf",
# main = "examples/mobilenet_v2_with_lstm.py",
# tags = ["examples", "examples_M"],
# size = "small",
# srcs = ["examples/mobilenet_v2_with_lstm.py"]
# )
py_test(
name = "examples/multi_agent_cartpole",
name = "examples/multi_agent_cartpole_tf",
main = "examples/multi_agent_cartpole.py",
tags = ["examples", "examples_M"],
size = "medium",
srcs = ["examples/multi_agent_cartpole.py"],
args = ["--num-iters=2", "--num-cpus=4"]
args = ["--as-test", "--stop-reward=70.0", "--num-cpus=4"]
)
py_test(
name = "examples/multi_agent_custom_policy",
name = "examples/multi_agent_cartpole_torch",
main = "examples/multi_agent_cartpole.py",
tags = ["examples", "examples_M"],
size = "medium",
srcs = ["examples/multi_agent_custom_policy.py"],
size = "small",
srcs = ["examples/multi_agent_cartpole.py"],
args = ["--as-test", "--torch", "--stop-reward=70.0", "--num-cpus=4"]
)
py_test(
name = "examples/multi_agent_two_trainers",
name = "examples/multi_agent_custom_policy_tf",
main = "examples/multi_agent_custom_policy.py",
tags = ["examples", "examples_M"],
size = "small",
srcs = ["examples/multi_agent_custom_policy.py"],
args = ["--as-test", "--stop-reward=80"]
)
py_test(
name = "examples/multi_agent_custom_policy_torch",
main = "examples/multi_agent_custom_policy.py",
tags = ["examples", "examples_M"],
size = "small",
srcs = ["examples/multi_agent_custom_policy.py"],
args = ["--as-test", "--torch", "--stop-reward=80"]
)
py_test(
name = "examples/multi_agent_two_trainers_tf",
main = "examples/multi_agent_two_trainers.py",
tags = ["examples", "examples_M"],
size = "medium",
srcs = ["examples/multi_agent_two_trainers.py"],
args = ["--num-iters=2"]
args = ["--as-test", "--stop-reward=70"]
)
py_test(
name = "examples/two_trainer_workflow",
tags = ["examples", "examples_T"],
name = "examples/multi_agent_two_trainers_torch",
main = "examples/multi_agent_two_trainers.py",
tags = ["examples", "examples_M"],
size = "medium",
srcs = ["examples/two_trainer_workflow.py"],
args = ["--num-iters=2"]
srcs = ["examples/multi_agent_two_trainers.py"],
args = ["--as-test", "--torch", "--stop-reward=70"]
)
py_test(
name = "examples/nested_action_spaces_ppo",
name = "examples/multi_agent_two_trainers_mixed_torch_tf",
main = "examples/multi_agent_two_trainers.py",
tags = ["examples", "examples_M"],
size = "small",
srcs = ["examples/multi_agent_two_trainers.py"],
args = ["--as-test", "--mixed-torch-tf", "--stop-reward=70"]
)
py_test(
name = "examples/nested_action_spaces_ppo_tf",
main = "examples/nested_action_spaces.py",
tags = ["examples", "examples_N"],
size = "medium",
srcs = ["examples/nested_action_spaces.py"],
args = ["--stop=-500", "--run=PPO"]
args = ["--as-test", "--stop-reward=-600", "--run=PPO"]
)
py_test(
name = "examples/parametric_actions_cartpole_pg",
name = "examples/nested_action_spaces_ppo_torch",
main = "examples/nested_action_spaces.py",
tags = ["examples", "examples_N"],
size = "medium",
srcs = ["examples/nested_action_spaces.py"],
args = ["--as-test", "--torch", "--stop-reward=-600", "--run=PPO"]
)
py_test(
name = "examples/parametric_actions_cartpole_pg_tf",
main = "examples/parametric_actions_cartpole.py",
tags = ["examples", "examples_P"],
size = "medium",
srcs = ["examples/parametric_actions_cartpole.py"],
args = ["--run=PG", "--stop=50"]
args = ["--as-test", "--stop-reward=60.0", "--run=PG"]
)
py_test(
name = "examples/parametric_actions_cartpole_ppo",
name = "examples/parametric_actions_cartpole_dqn_tf",
main = "examples/parametric_actions_cartpole.py",
tags = ["examples", "examples_P"],
size = "medium",
srcs = ["examples/parametric_actions_cartpole.py"],
args = ["--run=PPO", "--stop=50"]
args = ["--as-test", "--stop-reward=60.0", "--run=DQN"]
)
py_test(
name = "examples/parametric_actions_cartpole_dqn",
name = "examples/parametric_actions_cartpole_pg_torch",
main = "examples/parametric_actions_cartpole.py",
tags = ["examples", "examples_P"],
size = "small",
srcs = ["examples/parametric_actions_cartpole.py"],
args = ["--as-test", "--torch", "--stop-reward=60.0", "--run=PG"]
)
py_test(
name = "examples/parametric_actions_cartpole_dqn_torch",
main = "examples/parametric_actions_cartpole.py",
tags = ["examples", "examples_P"],
size = "medium",
srcs = ["examples/parametric_actions_cartpole.py"],
args = ["--run=DQN", "--stop=50"]
args = ["--as-test", "--torch", "--stop-reward=60.0", "--run=DQN"]
)
py_test(
@@ -1731,9 +1938,27 @@ py_test(
args = ["--num-cpus=4"]
)
py_test(
name = "examples/rock_paper_scissors_multiagent_tf",
main = "examples/rock_paper_scissors_multiagent.py",
tags = ["examples", "examples_R"],
size = "medium",
srcs = ["examples/rock_paper_scissors_multiagent.py"],
args = ["--as-test"],
)
py_test(
name = "examples/rock_paper_scissors_multiagent_torch",
main = "examples/rock_paper_scissors_multiagent.py",
tags = ["examples", "examples_R"],
size = "medium",
srcs = ["examples/rock_paper_scissors_multiagent.py"],
args = ["--as-test", "--torch"],
)
sh_test(
name = "examples/serving/test_local_inference",
tags = ["examples", "examples_L", "exclusive"],
tags = ["examples", "examples_S", "exclusive"],
size = "medium",
srcs = ["examples/serving/test_local_inference.sh"],
data = glob(["examples/serving/*.py"]),
@@ -1741,27 +1966,82 @@ sh_test(
sh_test(
name = "examples/serving/test_remote_inference",
tags = ["examples", "examples_R", "exclusive"],
tags = ["examples", "examples_S", "exclusive"],
size = "medium",
srcs = ["examples/serving/test_remote_inference.sh"],
data = glob(["examples/serving/*.py"]),
)
py_test(
name = "examples/rock_paper_scissors_multiagent",
main = "examples/rock_paper_scissors_multiagent.py",
tags = ["examples", "examples_R"],
size = "large",
srcs = ["examples/rock_paper_scissors_multiagent.py"],
args = ["--stop=200"],
name = "examples/two_trainer_workflow_tf",
main = "examples/two_trainer_workflow.py",
tags = ["examples", "examples_T"],
size = "small",
srcs = ["examples/two_trainer_workflow.py"],
args = ["--as-test", "--stop-reward=100.0"]
)
py_test(
name = "examples/twostep_game_maddpg", main = "examples/twostep_game.py",
name = "examples/two_trainer_workflow_torch",
main = "examples/two_trainer_workflow.py",
tags = ["examples", "examples_T"],
size = "small",
srcs = ["examples/two_trainer_workflow.py"],
args = ["--as-test", "--torch", "--stop-reward=100.0"]
)
py_test(
name = "examples/two_trainer_workflow_mixed_torch_tf",
main = "examples/two_trainer_workflow.py",
tags = ["examples", "examples_T"],
size = "small",
srcs = ["examples/two_trainer_workflow.py"],
args = ["--as-test", "--mixed-torch-tf", "--stop-reward=100.0"]
)
py_test(
name = "examples/twostep_game_maddpg",
main = "examples/twostep_game.py",
tags = ["examples", "examples_T"],
size = "large",
srcs = ["examples/twostep_game.py"],
args = ["--stop=2000", "--run=contrib/MADDPG"]
args = ["--stop-timesteps=2000", "--run=contrib/MADDPG"]
)
py_test(
name = "examples/twostep_game_pg_tf",
main = "examples/twostep_game.py",
tags = ["examples", "examples_T"],
size = "medium",
srcs = ["examples/twostep_game.py"],
args = ["--as-test", "--stop-reward=7", "--run=PG"]
)
py_test(
name = "examples/twostep_game_pg_torch",
main = "examples/twostep_game.py",
tags = ["examples", "examples_T"],
size = "medium",
srcs = ["examples/twostep_game.py"],
args = ["--as-test", "--torch", "--stop-reward=7", "--run=PG"]
)
py_test(
name = "examples/twostep_game_qmix",
main = "examples/twostep_game.py",
tags = ["examples", "examples_T"],
size = "medium",
srcs = ["examples/twostep_game.py"],
args = ["--stop-timesteps=2000", "--run=QMIX"]
)
py_test(
name = "examples/twostep_game_apex_qmix",
main = "examples/twostep_game.py",
tags = ["examples", "examples_T"],
size = "medium",
srcs = ["examples/twostep_game.py"],
args = ["--stop-timesteps=2000", "--run=APEX_QMIX", "--num-cpus=4"]
)
py_test(
@@ -1770,7 +2050,7 @@ py_test(
tags = ["examples", "examples_T"],
size = "small",
srcs = ["contrib/bandits/examples/simple_context_bandit.py"],
args = ["--stop-at-reward=10", "--run=contrib/LinTS"],
args = ["--as-test", "--stop-reward=10", "--run=contrib/LinTS"],
)
py_test(
@@ -1779,29 +2059,5 @@ py_test(
tags = ["examples", "examples_U"],
size = "small",
srcs = ["contrib/bandits/examples/simple_context_bandit.py"],
args = ["--stop-at-reward=10", "--run=contrib/LinUCB"],
)
py_test(
name = "examples/twostep_game_pg", main = "examples/twostep_game.py",
tags = ["examples", "examples_T"],
size = "medium",
srcs = ["examples/twostep_game.py"],
args = ["--stop=2000", "--run=PG"]
)
py_test(
name = "examples/twostep_game_qmix", main = "examples/twostep_game.py",
tags = ["examples", "examples_T"],
size = "medium",
srcs = ["examples/twostep_game.py"],
args = ["--stop=2000", "--run=QMIX"]
)
py_test(
name = "examples/twostep_game_apex_qmix", main = "examples/twostep_game.py",
tags = ["examples", "examples_T"],
size = "medium",
srcs = ["examples/twostep_game.py"],
args = ["--stop=2000", "--run=APEX_QMIX", "--num-cpus=4"]
args = ["--as-test", "--stop-reward=10", "--run=contrib/LinUCB"],
)
+7 -2
View File
@@ -59,8 +59,13 @@ def apply_grad_clipping(policy, optimizer, loss):
info = {}
if policy.config["grad_clip"]:
for param_group in optimizer.param_groups:
info["grad_gnorm"] = nn.utils.clip_grad_norm_(
param_group["params"], policy.config["grad_clip"])
# Make sure we only pass params with grad != None into torch
# clip_grad_norm_. Would fail otherwise.
params = list(
filter(lambda p: p.grad is not None, param_group["params"]))
if params:
info["grad_gnorm"] = nn.utils.clip_grad_norm_(
params, policy.config["grad_clip"])
return info
+3 -3
View File
@@ -45,9 +45,9 @@ class DDPGTorchModel(TorchModelV2, nn.Module):
only defines the layers for the output heads. Those layers for
forward() should be defined in subclasses of DDPGTorchModel.
"""
TorchModelV2.__init__(self, obs_space, action_space, num_outputs,
model_config, name)
nn.Module.__init__(self)
super(DDPGTorchModel, self).__init__(obs_space, action_space,
num_outputs, model_config, name)
self.bounded = np.logical_and(action_space.bounded_above,
action_space.bounded_below).any()
@@ -58,7 +58,7 @@ class DDPGTorchModel(TorchModelV2, nn.Module):
# Build the policy network.
self.policy_model = nn.Sequential()
ins = obs_space.shape[-1]
ins = num_outputs
self.obs_ins = ins
activation = get_activation_fn(
actor_hidden_activation, framework="torch")
@@ -1,16 +1,20 @@
"""A very simple contextual bandit example with 3 arms."""
import argparse
import random
import numpy as np
import gym
from gym.spaces import Discrete, Box
import numpy as np
import random
from ray import tune
from ray.rllib.utils.test_utils import check_learning_achieved
parser = argparse.ArgumentParser()
parser.add_argument("--stop-at-reward", type=float, default=10)
parser.add_argument("--run", type=str, default="contrib/LinUCB")
parser.add_argument("--as-test", action="store_true")
parser.add_argument("--stop-iters", type=int, default=200)
parser.add_argument("--stop-timesteps", type=int, default=100000)
parser.add_argument("--stop-reward", type=float, default=10.0)
class SimpleContextualBandit(gym.Env):
@@ -37,11 +41,18 @@ class SimpleContextualBandit(gym.Env):
if __name__ == "__main__":
args = parser.parse_args()
tune.run(
args.run,
stop={
"episode_reward_mean": args.stop_at_reward,
},
config={
"env": SimpleContextualBandit,
})
stop = {
"training_iteration": args.stop_iters,
"timesteps_total": args.stop_timesteps,
"episode_reward_mean": args.stop_reward,
}
config = {
"env": SimpleContextualBandit,
}
results = tune.run(args.run, config=config, stop=stop)
if args.as_test:
check_learning_achieved(results, args.stop_reward)
+39 -170
View File
@@ -10,187 +10,56 @@ pattern, and a custom action distribution class that leverages that model.
This examples shows both.
"""
from gym.spaces import Discrete, Tuple
import argparse
import ray
from ray import tune
from ray.rllib.examples.env.correlated_actions_env import CorrelatedActionsEnv
from ray.rllib.examples.models.autoregressive_action_model import \
AutoregressiveActionModel, TorchAutoregressiveActionModel
from ray.rllib.examples.models.autoregressive_action_dist import \
BinaryAutoregressiveDistribution, TorchBinaryAutoregressiveDistribution
from ray.rllib.models import ModelCatalog
from ray.rllib.models.tf.tf_action_dist import Categorical, ActionDistribution
from ray.rllib.models.tf.misc import normc_initializer
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.utils.framework import try_import_tf
tf = try_import_tf()
from ray.rllib.utils.test_utils import check_learning_achieved
parser = argparse.ArgumentParser()
parser.add_argument("--run", type=str, default="PPO") # try PG, PPO, IMPALA
parser.add_argument("--stop", type=int, default=200)
parser.add_argument("--torch", action="store_true")
parser.add_argument("--num-cpus", type=int, default=0)
class BinaryAutoregressiveOutput(ActionDistribution):
"""Action distribution P(a1, a2) = P(a1) * P(a2 | a1)"""
@staticmethod
def required_model_output_shape(self, model_config):
return 16 # controls model output feature vector size
def deterministic_sample(self):
# first, sample a1
a1_dist = self._a1_distribution()
a1 = a1_dist.deterministic_sample()
# sample a2 conditioned on a1
a2_dist = self._a2_distribution(a1)
a2 = a2_dist.deterministic_sample()
self._action_logp = a1_dist.logp(a1) + a2_dist.logp(a2)
# return the action tuple
return (a1, a2)
def sample(self):
# first, sample a1
a1_dist = self._a1_distribution()
a1 = a1_dist.sample()
# sample a2 conditioned on a1
a2_dist = self._a2_distribution(a1)
a2 = a2_dist.sample()
self._action_logp = a1_dist.logp(a1) + a2_dist.logp(a2)
# return the action tuple
return (a1, a2)
def logp(self, actions):
a1, a2 = actions[:, 0], actions[:, 1]
a1_vec = tf.expand_dims(tf.cast(a1, tf.float32), 1)
a1_logits, a2_logits = self.model.action_model([self.inputs, a1_vec])
return (
Categorical(a1_logits).logp(a1) + Categorical(a2_logits).logp(a2))
def sampled_action_logp(self):
return tf.exp(self._action_logp)
def entropy(self):
a1_dist = self._a1_distribution()
a2_dist = self._a2_distribution(a1_dist.sample())
return a1_dist.entropy() + a2_dist.entropy()
def kl(self, other):
a1_dist = self._a1_distribution()
a1_terms = a1_dist.kl(other._a1_distribution())
a1 = a1_dist.sample()
a2_terms = self._a2_distribution(a1).kl(other._a2_distribution(a1))
return a1_terms + a2_terms
def _a1_distribution(self):
BATCH = tf.shape(self.inputs)[0]
a1_logits, _ = self.model.action_model(
[self.inputs, tf.zeros((BATCH, 1))])
a1_dist = Categorical(a1_logits)
return a1_dist
def _a2_distribution(self, a1):
a1_vec = tf.expand_dims(tf.cast(a1, tf.float32), 1)
_, a2_logits = self.model.action_model([self.inputs, a1_vec])
a2_dist = Categorical(a2_logits)
return a2_dist
class AutoregressiveActionsModel(TFModelV2):
"""Implements the `.action_model` branch required above."""
def __init__(self, obs_space, action_space, num_outputs, model_config,
name):
super(AutoregressiveActionsModel, self).__init__(
obs_space, action_space, num_outputs, model_config, name)
if action_space != Tuple([Discrete(2), Discrete(2)]):
raise ValueError(
"This model only supports the [2, 2] action space")
# Inputs
obs_input = tf.keras.layers.Input(
shape=obs_space.shape, name="obs_input")
a1_input = tf.keras.layers.Input(shape=(1, ), name="a1_input")
ctx_input = tf.keras.layers.Input(
shape=(num_outputs, ), name="ctx_input")
# Output of the model (normally 'logits', but for an autoregressive
# dist this is more like a context/feature layer encoding the obs)
context = tf.keras.layers.Dense(
num_outputs,
name="hidden",
activation=tf.nn.tanh,
kernel_initializer=normc_initializer(1.0))(obs_input)
# V(s)
value_out = tf.keras.layers.Dense(
1,
name="value_out",
activation=None,
kernel_initializer=normc_initializer(0.01))(context)
# P(a1 | obs)
a1_logits = tf.keras.layers.Dense(
2,
name="a1_logits",
activation=None,
kernel_initializer=normc_initializer(0.01))(ctx_input)
# P(a2 | a1)
# --note: typically you'd want to implement P(a2 | a1, obs) as follows:
# a2_context = tf.keras.layers.Concatenate(axis=1)(
# [ctx_input, a1_input])
a2_context = a1_input
a2_hidden = tf.keras.layers.Dense(
16,
name="a2_hidden",
activation=tf.nn.tanh,
kernel_initializer=normc_initializer(1.0))(a2_context)
a2_logits = tf.keras.layers.Dense(
2,
name="a2_logits",
activation=None,
kernel_initializer=normc_initializer(0.01))(a2_hidden)
# Base layers
self.base_model = tf.keras.Model(obs_input, [context, value_out])
self.register_variables(self.base_model.variables)
self.base_model.summary()
# Autoregressive action sampler
self.action_model = tf.keras.Model([ctx_input, a1_input],
[a1_logits, a2_logits])
self.action_model.summary()
self.register_variables(self.action_model.variables)
def forward(self, input_dict, state, seq_lens):
context, self._value_out = self.base_model(input_dict["obs"])
return context, state
def value_function(self):
return tf.reshape(self._value_out, [-1])
parser.add_argument("--as-test", action="store_true")
parser.add_argument("--stop-iters", type=int, default=200)
parser.add_argument("--stop-timesteps", type=int, default=100000)
parser.add_argument("--stop-reward", type=float, default=200)
if __name__ == "__main__":
args = parser.parse_args()
ray.init(num_cpus=args.num_cpus or None)
ModelCatalog.register_custom_model("autoregressive_model",
AutoregressiveActionsModel)
ModelCatalog.register_custom_action_dist("binary_autoreg_output",
BinaryAutoregressiveOutput)
tune.run(
args.run,
stop={"episode_reward_mean": args.stop},
config={
"env": CorrelatedActionsEnv,
"gamma": 0.5,
"num_gpus": 0,
"model": {
"custom_model": "autoregressive_model",
"custom_action_dist": "binary_autoreg_output",
},
})
ModelCatalog.register_custom_model(
"autoregressive_model", TorchAutoregressiveActionModel
if args.torch else AutoregressiveActionModel)
ModelCatalog.register_custom_action_dist(
"binary_autoreg_dist", TorchBinaryAutoregressiveDistribution
if args.torch else BinaryAutoregressiveDistribution)
config = {
"env": CorrelatedActionsEnv,
"gamma": 0.5,
"num_gpus": 0,
"model": {
"custom_model": "autoregressive_model",
"custom_action_dist": "binary_autoreg_dist",
},
"use_pytorch": args.torch,
}
stop = {
"training_iteration": args.stop_iters,
"timesteps_total": args.stop_timesteps,
"episode_reward_mean": args.stop_reward,
}
results = tune.run(args.run, stop=stop, config=config)
if args.as_test:
check_learning_achieved(results, args.stop_reward)
ray.shutdown()
+24 -136
View File
@@ -4,148 +4,28 @@ import argparse
import ray
from ray import tune
from ray.rllib.examples.models.batch_norm_model import BatchNormModel, \
TorchBatchNormModel
from ray.rllib.models import ModelCatalog
from ray.rllib.models.modelv2 import ModelV2
from ray.rllib.models.tf.misc import normc_initializer
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.utils import try_import_tf
from ray.rllib.utils.annotations import override
from ray.rllib.utils.test_utils import check_learning_achieved
tf = try_import_tf()
parser = argparse.ArgumentParser()
parser.add_argument("--num-iters", type=int, default=200)
parser.add_argument("--run", type=str, default="PPO")
class BatchNormModel(TFModelV2):
"""Example of a TFModelV2 that is built w/o using tf.keras.
NOTE: This example does not work when using a keras-based TFModelV2 due
to a bug in keras related to missing values for input placeholders, even
though these input values have been provided in a forward pass through the
actual keras Model.
All Model logic (layers) is defined in the `forward` method (incl.
the batch_normalization layers). Also, all variables are registered
(only once) at the end of `forward`, so an optimizer knows which tensors
to train on. A standard `value_function` override is used.
"""
capture_index = 0
def __init__(self, obs_space, action_space, num_outputs, model_config,
name):
super().__init__(obs_space, action_space, num_outputs, model_config,
name)
# Have we registered our vars yet (see `forward`)?
self._registered = False
@override(ModelV2)
def forward(self, input_dict, state, seq_lens):
last_layer = input_dict["obs"]
hiddens = [256, 256]
with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
for i, size in enumerate(hiddens):
last_layer = tf.layers.dense(
last_layer,
size,
kernel_initializer=normc_initializer(1.0),
activation=tf.nn.tanh,
name="fc{}".format(i))
# Add a batch norm layer
last_layer = tf.layers.batch_normalization(
last_layer,
training=input_dict["is_training"],
name="bn_{}".format(i))
output = tf.layers.dense(
last_layer,
self.num_outputs,
kernel_initializer=normc_initializer(0.01),
activation=None,
name="out")
self._value_out = tf.layers.dense(
last_layer,
1,
kernel_initializer=normc_initializer(1.0),
activation=None,
name="vf")
if not self._registered:
self.register_variables(
tf.get_collection(
tf.GraphKeys.TRAINABLE_VARIABLES, scope=".+/model/.+"))
self._registered = True
return output, []
@override(ModelV2)
def value_function(self):
return tf.reshape(self._value_out, [-1])
class KerasBatchNormModel(TFModelV2):
"""Keras version of above BatchNormModel with exactly the same structure.
IMORTANT NOTE: This model will not work with PPO due to a bug in keras
that surfaces when having more than one input placeholder (here: `inputs`
and `is_training`) AND using the `make_tf_callable` helper (e.g. used by
PPO), in which auto-placeholders are generated, then passed through the
tf.keras. models.Model. In this last step, the connection between 1) the
provided value in the auto-placeholder and 2) the keras `is_training`
Input is broken and keras complains.
Use the above `BatchNormModel` (a non-keras based TFModelV2), instead.
"""
def __init__(self, obs_space, action_space, num_outputs, model_config,
name):
super().__init__(obs_space, action_space, num_outputs, model_config,
name)
inputs = tf.keras.layers.Input(shape=obs_space.shape, name="inputs")
is_training = tf.keras.layers.Input(
shape=(), dtype=tf.bool, batch_size=1, name="is_training")
last_layer = inputs
hiddens = [256, 256]
for i, size in enumerate(hiddens):
label = "fc{}".format(i)
last_layer = tf.keras.layers.Dense(
units=size,
kernel_initializer=normc_initializer(1.0),
activation=tf.nn.tanh,
name=label)(last_layer)
# Add a batch norm layer
last_layer = tf.keras.layers.BatchNormalization()(
last_layer, training=is_training[0])
output = tf.keras.layers.Dense(
units=self.num_outputs,
kernel_initializer=normc_initializer(0.01),
activation=None,
name="fc_out")(last_layer)
value_out = tf.keras.layers.Dense(
units=1,
kernel_initializer=normc_initializer(0.01),
activation=None,
name="value_out")(last_layer)
self.base_model = tf.keras.models.Model(
inputs=[inputs, is_training], outputs=[output, value_out])
self.register_variables(self.base_model.variables)
@override(ModelV2)
def forward(self, input_dict, state, seq_lens):
out, self._value_out = self.base_model(
[input_dict["obs"], input_dict["is_training"]])
return out, []
@override(ModelV2)
def value_function(self):
return tf.reshape(self._value_out, [-1])
parser.add_argument("--as-test", action="store_true")
parser.add_argument("--torch", action="store_true")
parser.add_argument("--stop-iters", type=int, default=200)
parser.add_argument("--stop-timesteps", type=int, default=100000)
parser.add_argument("--stop-reward", type=float, default=150)
if __name__ == "__main__":
args = parser.parse_args()
ray.init()
ray.init(local_mode=True)
ModelCatalog.register_custom_model("bn_model", BatchNormModel)
ModelCatalog.register_custom_model(
"bn_model", TorchBatchNormModel if args.torch else BatchNormModel)
config = {
"env": "Pendulum-v0" if args.run == "DDPG" else "CartPole-v0",
@@ -153,10 +33,18 @@ if __name__ == "__main__":
"custom_model": "bn_model",
},
"num_workers": 0,
"use_pytorch": args.torch,
}
tune.run(
args.run,
stop={"training_iteration": args.num_iters},
config=config,
)
stop = {
"training_iteration": args.stop_iters,
"timesteps_total": args.stop_timesteps,
"episode_reward_mean": args.stop_reward,
}
results = tune.run(args.run, stop=stop, config=config)
if args.as_test:
check_learning_achieved(results, args.stop_reward)
ray.shutdown()
+28 -16
View File
@@ -1,13 +1,17 @@
import argparse
from ray.rllib.examples.env.stateless_cartpole import StatelessCartPole
from ray.rllib.utils.test_utils import check_learning_achieved
parser = argparse.ArgumentParser()
parser.add_argument("--stop", type=int, default=200)
parser.add_argument("--torch", action="store_true")
parser.add_argument("--use-prev-action-reward", action="store_true")
parser.add_argument("--run", type=str, default="PPO")
parser.add_argument("--num-cpus", type=int, default=0)
parser.add_argument("--torch", action="store_true")
parser.add_argument("--as-test", action="store_true")
parser.add_argument("--use-prev-action-reward", action="store_true")
parser.add_argument("--stop-iters", type=int, default=200)
parser.add_argument("--stop-timesteps", type=int, default=100000)
parser.add_argument("--stop-reward", type=float, default=150.0)
if __name__ == "__main__":
import ray
@@ -30,16 +34,24 @@ if __name__ == "__main__":
},
}
tune.run(
args.run,
stop={"episode_reward_mean": args.stop},
config=dict(
configs[args.run], **{
"env": StatelessCartPole,
"model": {
"use_lstm": True,
"lstm_use_prev_action_reward": args.use_prev_action_reward,
},
"use_pytorch": args.torch,
}),
)
config = dict(
configs[args.run], **{
"env": StatelessCartPole,
"model": {
"use_lstm": True,
"lstm_use_prev_action_reward": args.use_prev_action_reward,
},
"use_pytorch": args.torch,
})
stop = {
"training_iteration": args.stop_iters,
"timesteps_total": args.stop_timesteps,
"episode_reward_mean": args.stop_reward,
}
results = tune.run(args.run, config=config, stop=stop)
if args.as_test:
check_learning_achieved(results, args.stop_reward)
ray.shutdown()
+103 -79
View File
@@ -16,77 +16,53 @@ import argparse
import numpy as np
from gym.spaces import Discrete
import ray
from ray import tune
from ray.rllib.agents.ppo.ppo import PPOTrainer
from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy, KLCoeffMixin, \
PPOLoss
PPOLoss as TFLoss
from ray.rllib.agents.ppo.ppo_torch_policy import PPOTorchPolicy, \
KLCoeffMixin as TorchKLCoeffMixin, PPOLoss as TorchLoss
from ray.rllib.evaluation.postprocessing import compute_advantages, \
Postprocessing
from ray.rllib.examples.twostep_game import TwoStepGame
from ray.rllib.examples.env.two_step_game import TwoStepGame
from ray.rllib.examples.models.centralized_critic_models import \
CentralizedCriticModel, TorchCentralizedCriticModel
from ray.rllib.models import ModelCatalog
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.policy.tf_policy import LearningRateSchedule, \
EntropyCoeffSchedule
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.models.tf.fcnet_v2 import FullyConnectedNetwork
from ray.rllib.policy.torch_policy import LearningRateSchedule as TorchLR, \
EntropyCoeffSchedule as TorchEntropyCoeffSchedule
from ray.rllib.utils.explained_variance import explained_variance
from ray.rllib.utils.framework import try_import_tf, try_import_torch
from ray.rllib.utils.test_utils import check_learning_achieved
from ray.rllib.utils.tf_ops import make_tf_callable
from ray.rllib.utils import try_import_tf
from ray.rllib.utils.torch_ops import convert_to_torch_tensor
tf = try_import_tf()
torch, nn = try_import_torch()
OPPONENT_OBS = "opponent_obs"
OPPONENT_ACTION = "opponent_action"
parser = argparse.ArgumentParser()
parser.add_argument("--stop", type=int, default=100000)
class CentralizedCriticModel(TFModelV2):
"""Multi-agent model that implements a centralized VF."""
def __init__(self, obs_space, action_space, num_outputs, model_config,
name):
super(CentralizedCriticModel, self).__init__(
obs_space, action_space, num_outputs, model_config, name)
# Base of the model
self.model = FullyConnectedNetwork(obs_space, action_space,
num_outputs, model_config, name)
self.register_variables(self.model.variables())
# Central VF maps (obs, opp_obs, opp_act) -> vf_pred
obs = tf.keras.layers.Input(shape=(6, ), name="obs")
opp_obs = tf.keras.layers.Input(shape=(6, ), name="opp_obs")
opp_act = tf.keras.layers.Input(shape=(2, ), name="opp_act")
concat_obs = tf.keras.layers.Concatenate(axis=1)(
[obs, opp_obs, opp_act])
central_vf_dense = tf.keras.layers.Dense(
16, activation=tf.nn.tanh, name="c_vf_dense")(concat_obs)
central_vf_out = tf.keras.layers.Dense(
1, activation=None, name="c_vf_out")(central_vf_dense)
self.central_vf = tf.keras.Model(
inputs=[obs, opp_obs, opp_act], outputs=central_vf_out)
self.register_variables(self.central_vf.variables)
def forward(self, input_dict, state, seq_lens):
return self.model.forward(input_dict, state, seq_lens)
def central_value_function(self, obs, opponent_obs, opponent_actions):
return tf.reshape(
self.central_vf(
[obs, opponent_obs,
tf.one_hot(opponent_actions, 2)]), [-1])
def value_function(self):
return self.model.value_function() # not used
parser.add_argument("--torch", action="store_true")
parser.add_argument("--as-test", action="store_true")
parser.add_argument("--stop-iters", type=int, default=100)
parser.add_argument("--stop-timesteps", type=int, default=100000)
parser.add_argument("--stop-reward", type=float, default=7.99)
class CentralizedValueMixin:
"""Add method to evaluate the central value function from the model."""
def __init__(self):
self.compute_central_vf = make_tf_callable(self.get_session())(
self.model.central_value_function)
if not self.config["use_pytorch"]:
self.compute_central_vf = make_tf_callable(self.get_session())(
self.model.central_value_function)
else:
self.compute_central_vf = self.model.central_value_function
# Grabs the opponent obs/act and includes it in the experience train_batch,
@@ -95,7 +71,9 @@ def centralized_critic_postprocessing(policy,
sample_batch,
other_agent_batches=None,
episode=None):
if policy.loss_initialized():
pytorch = policy.config["use_pytorch"]
if (pytorch and hasattr(policy, "compute_central_vf")) or \
(not pytorch and policy.loss_initialized()):
assert other_agent_batches is not None
[(_, opponent_batch)] = list(other_agent_batches.values())
@@ -104,11 +82,18 @@ def centralized_critic_postprocessing(policy,
sample_batch[OPPONENT_ACTION] = opponent_batch[SampleBatch.ACTIONS]
# overwrite default VF prediction with the central VF
sample_batch[SampleBatch.VF_PREDS] = policy.compute_central_vf(
sample_batch[SampleBatch.CUR_OBS], sample_batch[OPPONENT_OBS],
sample_batch[OPPONENT_ACTION])
if args.torch:
sample_batch[SampleBatch.VF_PREDS] = policy.compute_central_vf(
convert_to_torch_tensor(sample_batch[SampleBatch.CUR_OBS]),
convert_to_torch_tensor(sample_batch[OPPONENT_OBS]),
convert_to_torch_tensor(sample_batch[OPPONENT_ACTION])). \
detach().numpy()
else:
sample_batch[SampleBatch.VF_PREDS] = policy.compute_central_vf(
sample_batch[SampleBatch.CUR_OBS], sample_batch[OPPONENT_OBS],
sample_batch[OPPONENT_ACTION])
else:
# policy hasn't initialized yet, use zeros
# Policy hasn't been initialized yet, use zeros.
sample_batch[OPPONENT_OBS] = np.zeros_like(
sample_batch[SampleBatch.CUR_OBS])
sample_batch[OPPONENT_ACTION] = np.zeros_like(
@@ -141,7 +126,13 @@ def loss_with_central_critic(policy, model, dist_class, train_batch):
train_batch[SampleBatch.CUR_OBS], train_batch[OPPONENT_OBS],
train_batch[OPPONENT_ACTION])
policy.loss_obj = PPOLoss(
func = TFLoss if not policy.config["use_pytorch"] else TorchLoss
adv = tf.ones_like(train_batch[Postprocessing.ADVANTAGES], dtype=tf.bool) \
if not policy.config["use_pytorch"] else \
torch.ones_like(train_batch[Postprocessing.ADVANTAGES],
dtype=torch.bool)
policy.loss_obj = func(
dist_class,
model,
train_batch[Postprocessing.VALUE_TARGETS],
@@ -153,7 +144,7 @@ def loss_with_central_critic(policy, model, dist_class, train_batch):
action_dist,
policy.central_value_out,
policy.kl_coeff,
tf.ones_like(train_batch[Postprocessing.ADVANTAGES], dtype=tf.bool),
adv,
entropy_coeff=policy.entropy_coeff,
clip_param=policy.config["clip_param"],
vf_clip_param=policy.config["vf_clip_param"],
@@ -180,8 +171,8 @@ def central_vf_stats(policy, train_batch, grads):
}
CCPPO = PPOTFPolicy.with_updates(
name="CCPPO",
CCPPOTFPolicy = PPOTFPolicy.with_updates(
name="CCPPOTFPolicy",
postprocess_fn=centralized_critic_postprocessing,
loss_fn=loss_with_central_critic,
before_loss_init=setup_mixins,
@@ -191,31 +182,64 @@ CCPPO = PPOTFPolicy.with_updates(
CentralizedValueMixin
])
CCPPOTorchPolicy = PPOTorchPolicy.with_updates(
name="CCPPOTorchPolicy",
postprocess_fn=centralized_critic_postprocessing,
loss_fn=loss_with_central_critic,
before_init=setup_mixins,
mixins=[
TorchLR, TorchEntropyCoeffSchedule, TorchKLCoeffMixin,
CentralizedValueMixin
])
def get_policy_class(config):
return CCPPOTorchPolicy if config["use_pytorch"] else CCPPOTFPolicy
CCTrainer = PPOTrainer.with_updates(
name="CCPPOTrainer", default_policy=CCPPO, get_policy_class=None)
name="CCPPOTrainer",
default_policy=CCPPOTFPolicy,
get_policy_class=get_policy_class,
)
if __name__ == "__main__":
ray.init(local_mode=True)
args = parser.parse_args()
ModelCatalog.register_custom_model("cc_model", CentralizedCriticModel)
tune.run(
CCTrainer,
stop={
"timesteps_total": args.stop,
"episode_reward_mean": 7.99,
ModelCatalog.register_custom_model(
"cc_model", TorchCentralizedCriticModel
if args.torch else CentralizedCriticModel)
config = {
"env": TwoStepGame,
"batch_mode": "complete_episodes",
"eager": False,
"num_workers": 0,
"multiagent": {
"policies": {
"pol1": (None, Discrete(6), TwoStepGame.action_space, {
"use_pytorch": args.torch
}),
"pol2": (None, Discrete(6), TwoStepGame.action_space, {
"use_pytorch": args.torch
}),
},
"policy_mapping_fn": lambda x: "pol1" if x == 0 else "pol2",
},
config={
"env": TwoStepGame,
"batch_mode": "complete_episodes",
"eager": False,
"num_workers": 0,
"multiagent": {
"policies": {
"pol1": (None, Discrete(6), TwoStepGame.action_space, {}),
"pol2": (None, Discrete(6), TwoStepGame.action_space, {}),
},
"policy_mapping_fn": lambda x: "pol1" if x == 0 else "pol2",
},
"model": {
"custom_model": "cc_model",
},
})
"model": {
"custom_model": "cc_model",
},
"use_pytorch": args.torch,
}
stop = {
"training_iteration": args.stop_iters,
"timesteps_total": args.stop_timesteps,
"episode_reward_mean": args.stop_reward,
}
results = tune.run(CCTrainer, config=config, stop=stop)
if args.as_test:
check_learning_achieved(results, args.stop_reward)
+44 -73
View File
@@ -10,64 +10,24 @@ modifies the policy to add a centralized value function.
"""
import numpy as np
from gym.spaces import Box, Dict, Discrete
from gym.spaces import Dict, Discrete
import argparse
from ray import tune
from ray.rllib.agents.callbacks import DefaultCallbacks
from ray.rllib.examples.twostep_game import TwoStepGame
from ray.rllib.examples.models.centralized_critic_models import \
YetAnotherCentralizedCriticModel, YetAnotherTorchCentralizedCriticModel
from ray.rllib.examples.env.two_step_game import TwoStepGame
from ray.rllib.models import ModelCatalog
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.models.tf.fcnet_v2 import FullyConnectedNetwork
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.utils import try_import_tf
tf = try_import_tf()
from ray.rllib.utils.test_utils import check_learning_achieved
parser = argparse.ArgumentParser()
parser.add_argument("--stop", type=int, default=100000)
class CentralizedCriticModel(TFModelV2):
"""Multi-agent model that implements a centralized VF.
It assumes the observation is a dict with 'own_obs' and 'opponent_obs', the
former of which can be used for computing actions (i.e., decentralized
execution), and the latter for optimization (i.e., centralized learning).
This model has two parts:
- An action model that looks at just 'own_obs' to compute actions
- A value model that also looks at the 'opponent_obs' / 'opponent_action'
to compute the value (it does this by using the 'obs_flat' tensor).
"""
def __init__(self, obs_space, action_space, num_outputs, model_config,
name):
super(CentralizedCriticModel, self).__init__(
obs_space, action_space, num_outputs, model_config, name)
self.action_model = FullyConnectedNetwork(
Box(low=0, high=1, shape=(6, )), # one-hot encoded Discrete(6)
action_space,
num_outputs,
model_config,
name + "_action")
self.register_variables(self.action_model.variables())
self.value_model = FullyConnectedNetwork(obs_space, action_space, 1,
model_config, name + "_vf")
self.register_variables(self.value_model.variables())
def forward(self, input_dict, state, seq_lens):
self._value_out, _ = self.value_model({
"obs": input_dict["obs_flat"]
}, state, seq_lens)
return self.action_model({
"obs": input_dict["obs"]["own_obs"]
}, state, seq_lens)
def value_function(self):
return tf.reshape(self._value_out, [-1])
parser.add_argument("--torch", action="store_true")
parser.add_argument("--as-test", action="store_true")
parser.add_argument("--stop-iters", type=int, default=100)
parser.add_argument("--stop-timesteps", type=int, default=100000)
parser.add_argument("--stop-reward", type=float, default=7.99)
class FillInActions(DefaultCallbacks):
@@ -109,7 +69,11 @@ def central_critic_observer(agent_obs, **kw):
if __name__ == "__main__":
args = parser.parse_args()
ModelCatalog.register_custom_model("cc_model", CentralizedCriticModel)
ModelCatalog.register_custom_model(
"cc_model", YetAnotherTorchCentralizedCriticModel
if args.torch else YetAnotherCentralizedCriticModel)
action_space = Discrete(2)
observer_space = Dict({
"own_obs": Discrete(6),
@@ -118,26 +82,33 @@ if __name__ == "__main__":
"opponent_obs": Discrete(6),
"opponent_action": Discrete(2),
})
tune.run(
"PPO",
stop={
"timesteps_total": args.stop,
"episode_reward_mean": 7.99,
config = {
"env": TwoStepGame,
"batch_mode": "complete_episodes",
"callbacks": FillInActions,
"num_workers": 0,
"multiagent": {
"policies": {
"pol1": (None, observer_space, action_space, {}),
"pol2": (None, observer_space, action_space, {}),
},
"policy_mapping_fn": lambda x: "pol1" if x == 0 else "pol2",
"observation_fn": central_critic_observer,
},
config={
"env": TwoStepGame,
"batch_mode": "complete_episodes",
"callbacks": FillInActions,
"num_workers": 0,
"multiagent": {
"policies": {
"pol1": (None, observer_space, action_space, {}),
"pol2": (None, observer_space, action_space, {}),
},
"policy_mapping_fn": lambda x: "pol1" if x == 0 else "pol2",
"observation_fn": central_critic_observer,
},
"model": {
"custom_model": "cc_model",
},
})
"model": {
"custom_model": "cc_model",
},
"use_pytorch": args.torch,
}
stop = {
"training_iteration": args.stop_iters,
"timesteps_total": args.stop_timesteps,
"episode_reward_mean": args.stop_reward,
}
results = tune.run("PPO", config=config, stop=stop)
if args.as_test:
check_learning_achieved(results, args.stop_reward)
+69 -25
View File
@@ -7,20 +7,32 @@ This example shows:
You can visualize experiment results in ~/ray_results using TensorBoard.
"""
import argparse
import numpy as np
import gym
from ray.rllib.models import ModelCatalog
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.models.tf.fcnet_v2 import FullyConnectedNetwork
from gym.spaces import Discrete, Box
import ray
from ray import tune
from ray.rllib.utils import try_import_tf
from ray.tune import grid_search
from ray.rllib.models import ModelCatalog
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.models.tf.fcnet_v2 import FullyConnectedNetwork
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC
from ray.rllib.utils.framework import try_import_tf, try_import_torch
from ray.rllib.utils.test_utils import check_learning_achieved
tf = try_import_tf()
torch, nn = try_import_torch()
parser = argparse.ArgumentParser()
parser.add_argument("--run", type=str, default="PPO")
parser.add_argument("--torch", action="store_true")
parser.add_argument("--as-test", action="store_true")
parser.add_argument("--stop-iters", type=int, default=50)
parser.add_argument("--stop-timesteps", type=int, default=100000)
parser.add_argument("--stop-reward", type=float, default=0.1)
class SimpleCorridor(gym.Env):
@@ -46,11 +58,11 @@ class SimpleCorridor(gym.Env):
elif action == 1:
self.cur_pos += 1
done = self.cur_pos >= self.end_pos
return [self.cur_pos], 1 if done else 0, done, {}
return [self.cur_pos], 1.0 if done else -0.1, done, {}
class CustomModel(TFModelV2):
"""Example of a custom model that just delegates to a fc-net."""
"""Example of a keras custom model that just delegates to an fc-net."""
def __init__(self, obs_space, action_space, num_outputs, model_config,
name):
@@ -67,26 +79,58 @@ class CustomModel(TFModelV2):
return self.model.value_function()
class TorchCustomModel(TorchModelV2, nn.Module):
"""Example of a PyTorch custom model that just delegates to a fc-net."""
def __init__(self, obs_space, action_space, num_outputs, model_config,
name):
TorchModelV2.__init__(self, obs_space, action_space, num_outputs,
model_config, name)
nn.Module.__init__(self)
self.torch_sub_model = TorchFC(obs_space, action_space, num_outputs,
model_config, name)
def forward(self, input_dict, state, seq_lens):
input_dict["obs"] = input_dict["obs"].float()
fc_out, _ = self.torch_sub_model(input_dict, state, seq_lens)
return fc_out, []
def value_function(self):
return torch.reshape(self.torch_sub_model.value_function(), [-1])
if __name__ == "__main__":
args = parser.parse_args()
ray.init()
# Can also register the env creator function explicitly with:
# register_env("corridor", lambda config: SimpleCorridor(config))
ray.init()
ModelCatalog.register_custom_model("my_model", CustomModel)
tune.run(
"PPO",
stop={
"timesteps_total": 10000,
ModelCatalog.register_custom_model(
"my_model", TorchCustomModel if args.torch else CustomModel)
config = {
"env": SimpleCorridor, # or "corridor" if registered above
"env_config": {
"corridor_length": 5,
},
config={
"env": SimpleCorridor, # or "corridor" if registered above
"model": {
"custom_model": "my_model",
},
"vf_share_layers": True,
"lr": grid_search([1e-2, 1e-4, 1e-6]), # try different lrs
"num_workers": 1, # parallelism
"env_config": {
"corridor_length": 5,
},
"model": {
"custom_model": "my_model",
},
)
"vf_share_layers": True,
"lr": grid_search([1e-2, 1e-4, 1e-6]), # try different lrs
"num_workers": 1, # parallelism
"use_pytorch": args.torch
}
stop = {
"training_iteration": args.stop_iters,
"timesteps_total": args.stop_timesteps,
"episode_reward_mean": args.stop_reward,
}
results = tune.run(args.run, config=config, stop=stop)
if args.as_test:
check_learning_achieved(results, args.stop_reward)
ray.shutdown()
+39 -36
View File
@@ -74,9 +74,9 @@ from ray.rllib.evaluation.metrics import collect_episodes, summarize_episodes
from ray.rllib.examples.env.simple_corridor import SimpleCorridor
parser = argparse.ArgumentParser()
parser.add_argument("--custom-eval", action="store_true")
parser.add_argument("--num-cpus", type=int, default=0)
parser.add_argument("--torch", action="store_true")
parser.add_argument("--no-custom-eval", action="store_true")
def custom_eval_function(trainer, eval_workers):
@@ -124,48 +124,51 @@ def custom_eval_function(trainer, eval_workers):
if __name__ == "__main__":
args = parser.parse_args()
if args.custom_eval:
eval_fn = custom_eval_function
else:
if args.no_custom_eval:
eval_fn = None
else:
eval_fn = custom_eval_function
ray.init(num_cpus=args.num_cpus or None)
tune.run(
"PG",
stop={
"training_iteration": 10,
config = {
"env": SimpleCorridor,
"env_config": {
"corridor_length": 10,
},
config={
"env": SimpleCorridor,
"horizon": 20,
"log_level": "INFO",
# Training rollouts will be collected using just the learner
# process, but evaluation will be done in parallel with two
# workers. Hence, this run will use 3 CPUs total (1 for the
# learner + 2 more for evaluation workers).
"num_workers": 0,
"evaluation_num_workers": 2,
# Optional custom eval function.
"custom_eval_function": eval_fn,
# Enable evaluation, once per training iteration.
"evaluation_interval": 1,
# Run 10 episodes each time evaluation runs.
"evaluation_num_episodes": 10,
# Override the env config for evaluation.
"evaluation_config": {
"env_config": {
"corridor_length": 10,
# Evaluate using LONGER corridor than trained on.
"corridor_length": 5,
},
"horizon": 20,
"log_level": "INFO",
},
"use_pytorch": args.torch,
}
# Training rollouts will be collected using just the learner
# process, but evaluation will be done in parallel with two
# workers. Hence, this run will use 3 CPUs total (1 for the
# learner + 2 more for evaluation workers).
"num_workers": 0,
"evaluation_num_workers": 2,
stop = {
"training_iteration": 10,
}
# Optional custom eval function.
"custom_eval_function": eval_fn,
tune.run("PG", config=config, stop=stop)
# Enable evaluation, once per training iteration.
"evaluation_interval": 1,
# Run 10 episodes each time evaluation runs.
"evaluation_num_episodes": 10,
# Override the env config for evaluation.
"evaluation_config": {
"env_config": {
# Evaluate using LONGER corridor than trained on.
"corridor_length": 5,
},
},
"use_pytorch": args.torch,
})
ray.shutdown()
+43 -39
View File
@@ -4,48 +4,52 @@ Both the model and env are trivial (and super-fast), so they are useful
for running perf microbenchmarks.
"""
import argparse
import ray
import ray.tune as tune
from ray.tune import sample_from
from ray.rllib.examples.env.fast_image_env import FastImageEnv
from ray.rllib.models import Model, ModelCatalog
from ray.tune import run_experiments, sample_from
from ray.rllib.utils import try_import_tf
tf = try_import_tf()
class FastModel(Model):
def _build_layers_v2(self, input_dict, num_outputs, options):
bias = tf.get_variable(
dtype=tf.float32,
name="bias",
initializer=tf.zeros_initializer,
shape=())
output = bias + tf.zeros([tf.shape(input_dict["obs"])[0], num_outputs])
return output, output
from ray.rllib.examples.models.fast_model import FastModel, TorchFastModel
from ray.rllib.models import ModelCatalog
parser = argparse.ArgumentParser()
parser.add_argument("--num-cpus", type=int, default=2)
parser.add_argument("--torch", action="store_true")
parser.add_argument("--stop-iters", type=int, default=200)
parser.add_argument("--stop-timesteps", type=int, default=100000)
if __name__ == "__main__":
ray.init()
ModelCatalog.register_custom_model("fast_model", FastModel)
run_experiments({
"demo": {
"run": "IMPALA",
"env": FastImageEnv,
"config": {
"compress_observations": True,
"model": {
"custom_model": "fast_model"
},
"num_gpus": 0,
"num_workers": 2,
"num_envs_per_worker": 10,
"num_data_loader_buffers": 1,
"num_aggregation_workers": 1,
"broadcast_interval": 50,
"rollout_fragment_length": 100,
"train_batch_size": sample_from(
lambda spec: 1000 * max(1, spec.config.num_gpus)),
"fake_sampler": True,
},
args = parser.parse_args()
ray.init(num_cpus=args.num_cpus or None)
ModelCatalog.register_custom_model(
"fast_model", TorchFastModel if args.torch else FastModel)
config = {
"env": FastImageEnv,
"compress_observations": True,
"model": {
"custom_model": "fast_model"
},
})
"num_gpus": 0,
"num_workers": 2,
"num_envs_per_worker": 10,
"num_data_loader_buffers": 1,
"num_aggregation_workers": 1,
"broadcast_interval": 50,
"rollout_fragment_length": 100,
"train_batch_size": sample_from(
lambda spec: 1000 * max(1, spec.config.num_gpus)),
"fake_sampler": True,
"use_pytorch": args.torch,
}
stop = {
"training_iteration": args.stop_iters,
"timesteps_total": args.stop_timesteps,
}
tune.run("IMPALA", config=config, stop=stop)
ray.shutdown()
@@ -1,115 +0,0 @@
# Explains/tests Issues:
# https://github.com/ray-project/ray/issues/6928
# https://github.com/ray-project/ray/issues/6732
from gym.spaces import Discrete, Box
import numpy as np
from ray.rllib.agents.ppo import PPOTrainer
from ray.rllib.examples.random_env import RandomEnv
from ray.rllib.models import ModelCatalog
from ray.rllib.models.modelv2 import ModelV2
from ray.rllib.models.tf.recurrent_tf_modelv2 import RecurrentTFModelV2
from ray.rllib.utils import try_import_tf
from ray.rllib.utils.annotations import override
tf = try_import_tf()
cnn_shape = (4, 4, 3)
class CustomModel(RecurrentTFModelV2):
def __init__(self, obs_space, action_space, num_outputs, model_config,
name):
super(CustomModel, self).__init__(obs_space, action_space, num_outputs,
model_config, name)
self.cell_size = 16
visual_size = cnn_shape[0] * cnn_shape[1] * cnn_shape[2]
state_in_h = tf.keras.layers.Input(shape=(self.cell_size, ), name="h")
state_in_c = tf.keras.layers.Input(shape=(self.cell_size, ), name="c")
seq_in = tf.keras.layers.Input(shape=(), name="seq_in", dtype=tf.int32)
inputs = tf.keras.layers.Input(
shape=(None, visual_size), name="visual_inputs")
input_visual = inputs
input_visual = tf.reshape(
input_visual, [-1, cnn_shape[0], cnn_shape[1], cnn_shape[2]])
cnn_input = tf.keras.layers.Input(shape=cnn_shape, name="cnn_input")
cnn_model = tf.keras.applications.mobilenet_v2.MobileNetV2(
alpha=1.0,
include_top=True,
weights=None,
input_tensor=cnn_input,
pooling=None)
vision_out = cnn_model(input_visual)
vision_out = tf.reshape(
vision_out,
[-1, tf.shape(inputs)[1],
vision_out.shape.as_list()[-1]])
lstm_out, state_h, state_c = tf.keras.layers.LSTM(
self.cell_size,
return_sequences=True,
return_state=True,
name="lstm")(
inputs=vision_out,
mask=tf.sequence_mask(seq_in),
initial_state=[state_in_h, state_in_c])
# Postprocess LSTM output with another hidden layer and compute values.
logits = tf.keras.layers.Dense(
self.num_outputs,
activation=tf.keras.activations.linear,
name="logits")(lstm_out)
values = tf.keras.layers.Dense(
1, activation=None, name="values")(lstm_out)
# Create the RNN model
self.rnn_model = tf.keras.Model(
inputs=[inputs, seq_in, state_in_h, state_in_c],
outputs=[logits, values, state_h, state_c])
self.register_variables(self.rnn_model.variables)
self.rnn_model.summary()
@override(RecurrentTFModelV2)
def forward_rnn(self, inputs, state, seq_lens):
model_out, self._value_out, h, c = self.rnn_model([inputs, seq_lens] +
state)
return model_out, [h, c]
@override(ModelV2)
def get_initial_state(self):
return [
np.zeros(self.cell_size, np.float32),
np.zeros(self.cell_size, np.float32),
]
@override(ModelV2)
def value_function(self):
return tf.reshape(self._value_out, [-1])
if __name__ == "__main__":
ModelCatalog.register_custom_model("my_model", CustomModel)
trainer = PPOTrainer(
env=RandomEnv,
config={
# "eager": True, # <- should work for both eager or not
"model": {
"custom_model": "my_model",
"max_seq_len": 20,
},
"vf_share_layers": True,
"num_workers": 0, # no parallelism
"env_config": {
"action_space": Discrete(2),
# Test a simple Tuple observation space.
"observation_space": Box(
0.0, 1.0, shape=cnn_shape, dtype=np.float32)
}
})
trainer.train()
+1 -1
View File
@@ -17,7 +17,7 @@ tf = try_import_tf()
parser = argparse.ArgumentParser()
parser.add_argument("--run", type=str, default="DQN") # Try PG, PPO, DQN
parser.add_argument("--stop", type=int, default=200)
parser.add_argument("--use_vision_network", action="store_true")
parser.add_argument("--use-vision-network", action="store_true")
parser.add_argument("--num-cpus", type=int, default=0)
-117
View File
@@ -1,117 +0,0 @@
"""Example of using a custom RNN keras model."""
import argparse
import numpy as np
import ray
from ray import tune
from ray.tune.registry import register_env
from ray.rllib.examples.env.repeat_after_me_env import RepeatAfterMeEnv
from ray.rllib.examples.env.repeat_initial_obs_env import RepeatInitialObsEnv
from ray.rllib.models import ModelCatalog
from ray.rllib.models.modelv2 import ModelV2
from ray.rllib.models.tf.recurrent_tf_modelv2 import RecurrentTFModelV2
from ray.rllib.utils.annotations import override
from ray.rllib.utils import try_import_tf
tf = try_import_tf()
parser = argparse.ArgumentParser()
parser.add_argument("--run", type=str, default="PPO")
parser.add_argument("--env", type=str, default="RepeatAfterMeEnv")
parser.add_argument("--stop", type=int, default=90)
parser.add_argument("--num-cpus", type=int, default=0)
class MyKerasRNN(RecurrentTFModelV2):
"""Example of using the Keras functional API to define a RNN model."""
def __init__(self,
obs_space,
action_space,
num_outputs,
model_config,
name,
hiddens_size=256,
cell_size=64):
super(MyKerasRNN, self).__init__(obs_space, action_space, num_outputs,
model_config, name)
self.cell_size = cell_size
# Define input layers
input_layer = tf.keras.layers.Input(
shape=(None, obs_space.shape[0]), name="inputs")
state_in_h = tf.keras.layers.Input(shape=(cell_size, ), name="h")
state_in_c = tf.keras.layers.Input(shape=(cell_size, ), name="c")
seq_in = tf.keras.layers.Input(shape=(), name="seq_in", dtype=tf.int32)
# Preprocess observation with a hidden layer and send to LSTM cell
dense1 = tf.keras.layers.Dense(
hiddens_size, activation=tf.nn.relu, name="dense1")(input_layer)
lstm_out, state_h, state_c = tf.keras.layers.LSTM(
cell_size, return_sequences=True, return_state=True, name="lstm")(
inputs=dense1,
mask=tf.sequence_mask(seq_in),
initial_state=[state_in_h, state_in_c])
# Postprocess LSTM output with another hidden layer and compute values
logits = tf.keras.layers.Dense(
self.num_outputs,
activation=tf.keras.activations.linear,
name="logits")(lstm_out)
values = tf.keras.layers.Dense(
1, activation=None, name="values")(lstm_out)
# Create the RNN model
self.rnn_model = tf.keras.Model(
inputs=[input_layer, seq_in, state_in_h, state_in_c],
outputs=[logits, values, state_h, state_c])
self.register_variables(self.rnn_model.variables)
self.rnn_model.summary()
@override(RecurrentTFModelV2)
def forward_rnn(self, inputs, state, seq_lens):
model_out, self._value_out, h, c = self.rnn_model([inputs, seq_lens] +
state)
return model_out, [h, c]
@override(ModelV2)
def get_initial_state(self):
return [
np.zeros(self.cell_size, np.float32),
np.zeros(self.cell_size, np.float32),
]
@override(ModelV2)
def value_function(self):
return tf.reshape(self._value_out, [-1])
if __name__ == "__main__":
args = parser.parse_args()
ray.init(num_cpus=args.num_cpus or None)
ModelCatalog.register_custom_model("rnn", MyKerasRNN)
register_env("RepeatAfterMeEnv", lambda c: RepeatAfterMeEnv(c))
register_env("RepeatInitialObsEnv", lambda _: RepeatInitialObsEnv())
config = {
"env": args.env,
"env_config": {
"repeat_delay": 2,
},
"gamma": 0.9,
"num_workers": 0,
"num_envs_per_worker": 20,
"entropy_coeff": 0.001,
"num_sgd_iter": 5,
"vf_loss_coeff": 1e-5,
"model": {
"custom_model": "rnn",
"max_seq_len": 20,
},
}
tune.run(
args.run,
config=config,
stop={"episode_reward_mean": args.stop},
)
+23 -65
View File
@@ -16,17 +16,16 @@ import os
import ray
from ray import tune
from ray.rllib.models import Model, ModelCatalog
from ray.rllib.models.tf.tf_action_dist import Categorical
from ray.rllib.models.tf.fcnet_v1 import FullyConnectedNetwork
from ray.rllib.models.model import restore_original_dimensions
from ray.rllib.offline import JsonReader
from ray.rllib.examples.models.custom_loss_model import CustomLossModel, \
TorchCustomLossModel
from ray.rllib.models import ModelCatalog
from ray.rllib.utils import try_import_tf
tf = try_import_tf()
parser = argparse.ArgumentParser()
parser.add_argument("--iters", type=int, default=200)
parser.add_argument("--torch", action="store_true")
parser.add_argument("--stop-iters", type=int, default=200)
parser.add_argument(
"--input-files",
type=str,
@@ -34,50 +33,6 @@ parser.add_argument(
os.path.dirname(os.path.abspath(__file__)),
"../tests/data/cartpole_small"))
class CustomLossModel(Model):
"""Custom model that adds an imitation loss on top of the policy loss."""
def _build_layers_v2(self, input_dict, num_outputs, options):
self.obs_in = input_dict["obs"]
with tf.variable_scope("shared", reuse=tf.AUTO_REUSE):
self.fcnet = FullyConnectedNetwork(input_dict, self.obs_space,
self.action_space, num_outputs,
options)
return self.fcnet.outputs, self.fcnet.last_layer
def custom_loss(self, policy_loss, loss_inputs):
# create a new input reader per worker
reader = JsonReader(self.options["custom_options"]["input_files"])
input_ops = reader.tf_input_ops()
# define a secondary loss by building a graph copy with weight sharing
obs = tf.cast(input_ops["obs"], tf.float32)
logits, _ = self._build_layers_v2({
"obs": restore_original_dimensions(obs, self.obs_space)
}, self.num_outputs, self.options)
# You can also add self-supervised losses easily by referencing tensors
# created during _build_layers_v2(). For example, an autoencoder-style
# loss can be added as follows:
# ae_loss = squared_diff(
# loss_inputs["obs"], Decoder(self.fcnet.last_layer))
print("FYI: You can also use these tensors: {}, ".format(loss_inputs))
# compute the IL loss
action_dist = Categorical(logits, self.options)
self.policy_loss = policy_loss
self.imitation_loss = tf.reduce_mean(
-action_dist.logp(input_ops["actions"]))
return policy_loss + 10 * self.imitation_loss
def custom_stats(self):
return {
"policy_loss": self.policy_loss,
"imitation_loss": self.imitation_loss,
}
if __name__ == "__main__":
ray.init()
args = parser.parse_args()
@@ -90,20 +45,23 @@ if __name__ == "__main__":
input_dir = rllib_dir.absolute().joinpath(args.input_files)
args.input_files = str(input_dir)
ModelCatalog.register_custom_model("custom_loss", CustomLossModel)
tune.run(
"PG",
stop={
"training_iteration": args.iters,
},
config={
"env": "CartPole-v0",
"num_workers": 0,
"model": {
"custom_model": "custom_loss",
"custom_options": {
"input_files": args.input_files,
},
ModelCatalog.register_custom_model(
"custom_loss", TorchCustomLossModel if args.torch else CustomLossModel)
config = {
"env": "CartPole-v0",
"num_workers": 0,
"model": {
"custom_model": "custom_loss",
"custom_options": {
"input_files": args.input_files,
},
},
)
"use_pytorch": args.torch,
}
stop = {
"training_iteration": args.stop_iters,
}
tune.run("PG", config=config, stop=stop)
@@ -64,14 +64,14 @@ class MyCallbacks(DefaultCallbacks):
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--num-iters", type=int, default=2000)
parser.add_argument("--stop-iters", type=int, default=2000)
args = parser.parse_args()
ray.init()
trials = tune.run(
"PG",
stop={
"training_iteration": args.num_iters,
"training_iteration": args.stop_iters,
},
config={
"env": "CartPole-v0",
@@ -53,14 +53,14 @@ def on_postprocess_traj(info):
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--num-iters", type=int, default=2000)
parser.add_argument("--stop-iters", type=int, default=2000)
args = parser.parse_args()
ray.init()
trials = tune.run(
"PG",
stop={
"training_iteration": args.num_iters,
"training_iteration": args.stop_iters,
},
config={
"env": "CartPole-v0",
+62
View File
@@ -0,0 +1,62 @@
"""Example of using a custom RNN keras model."""
import argparse
import ray
from ray import tune
from ray.tune.registry import register_env
from ray.rllib.examples.env.repeat_after_me_env import RepeatAfterMeEnv
from ray.rllib.examples.env.repeat_initial_obs_env import RepeatInitialObsEnv
from ray.rllib.examples.models.rnn_model import RNNModel, TorchRNNModel
from ray.rllib.models import ModelCatalog
from ray.rllib.utils.test_utils import check_learning_achieved
parser = argparse.ArgumentParser()
parser.add_argument("--run", type=str, default="PPO")
parser.add_argument("--env", type=str, default="RepeatAfterMeEnv")
parser.add_argument("--num-cpus", type=int, default=0)
parser.add_argument("--as-test", action="store_true")
parser.add_argument("--torch", action="store_true")
parser.add_argument("--stop-reward", type=float, default=90)
parser.add_argument("--stop-iters", type=int, default=100)
parser.add_argument("--stop-timesteps", type=int, default=100000)
if __name__ == "__main__":
args = parser.parse_args()
ray.init(num_cpus=args.num_cpus or None)
ModelCatalog.register_custom_model(
"rnn", TorchRNNModel if args.torch else RNNModel)
register_env("RepeatAfterMeEnv", lambda c: RepeatAfterMeEnv(c))
register_env("RepeatInitialObsEnv", lambda _: RepeatInitialObsEnv())
config = {
"env": args.env,
"env_config": {
"repeat_delay": 2,
},
"gamma": 0.9,
"num_workers": 0,
"num_envs_per_worker": 20,
"entropy_coeff": 0.001,
"num_sgd_iter": 5,
"vf_loss_coeff": 1e-5,
"model": {
"custom_model": "rnn",
"max_seq_len": 20,
},
"use_pytorch": args.torch,
}
stop = {
"training_iteration": args.stop_iters,
"timesteps_total": args.stop_timesteps,
"episode_reward_mean": args.stop_reward,
}
results = tune.run(args.run, config=config, stop=stop)
if args.as_test:
check_learning_achieved(results, args.stop_reward)
ray.shutdown()
+2 -2
View File
@@ -10,7 +10,7 @@ from ray.rllib.utils import try_import_tf
tf = try_import_tf()
parser = argparse.ArgumentParser()
parser.add_argument("--iters", type=int, default=200)
parser.add_argument("--stop-iters", type=int, default=200)
parser.add_argument("--num-cpus", type=int, default=0)
@@ -47,7 +47,7 @@ if __name__ == "__main__":
ray.init(num_cpus=args.num_cpus or None)
tune.run(
MyTrainer,
stop={"training_iteration": args.iters},
stop={"training_iteration": args.stop_iters},
config={
"env": "CartPole-v0",
"num_workers": 2,
+2 -2
View File
@@ -7,7 +7,7 @@ from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.policy.torch_policy_template import build_torch_policy
parser = argparse.ArgumentParser()
parser.add_argument("--iters", type=int, default=200)
parser.add_argument("--stop-iters", type=int, default=200)
parser.add_argument("--num-cpus", type=int, default=0)
@@ -33,7 +33,7 @@ if __name__ == "__main__":
ray.init(num_cpus=args.num_cpus or None)
tune.run(
MyTrainer,
stop={"training_iteration": args.iters},
stop={"training_iteration": args.stop_iters},
config={
"env": "CartPole-v0",
"num_workers": 2,
-128
View File
@@ -1,128 +0,0 @@
import argparse
import ray
from ray.rllib.examples.env.repeat_initial_obs_env import RepeatInitialObsEnv
from ray.rllib.examples.env.repeat_after_me_env import RepeatAfterMeEnv
from ray.rllib.examples.env.stateless_cartpole import StatelessCartPole
from ray.rllib.models.preprocessors import get_preprocessor
from ray.rllib.models.torch.recurrent_torch_model import RecurrentTorchModel
from ray.rllib.models.modelv2 import ModelV2
from ray.rllib.utils.annotations import override
from ray.rllib.utils import try_import_torch
from ray.rllib.models import ModelCatalog
import ray.tune as tune
torch, nn = try_import_torch()
parser = argparse.ArgumentParser()
parser.add_argument("--run", type=str, default="PPO")
parser.add_argument("--env", type=str, default="repeat_initial")
parser.add_argument("--stop", type=int, default=90)
parser.add_argument("--num-cpus", type=int, default=0)
parser.add_argument("--fc-size", type=int, default=64)
parser.add_argument("--lstm-cell-size", type=int, default=256)
class RNNModel(RecurrentTorchModel):
def __init__(self,
obs_space,
action_space,
num_outputs,
model_config,
name,
fc_size=64,
lstm_state_size=256):
super().__init__(obs_space, action_space, num_outputs, model_config,
name)
self.obs_size = get_preprocessor(obs_space)(obs_space).size
self.fc_size = fc_size
self.lstm_state_size = lstm_state_size
# Build the Module from fc + LSTM + 2xfc (action + value outs).
self.fc1 = nn.Linear(self.obs_size, self.fc_size)
self.lstm = nn.LSTM(
self.fc_size, self.lstm_state_size, batch_first=True)
self.action_branch = nn.Linear(self.lstm_state_size, num_outputs)
self.value_branch = nn.Linear(self.lstm_state_size, 1)
# Store the value output to save an extra forward pass.
self._cur_value = None
@override(ModelV2)
def get_initial_state(self):
# make hidden states on same device as model
h = [
self.fc1.weight.new(1, self.lstm_state_size).zero_().squeeze(0),
self.fc1.weight.new(1, self.lstm_state_size).zero_().squeeze(0)
]
return h
@override(ModelV2)
def value_function(self):
assert self._cur_value is not None, "must call forward() first"
return self._cur_value
@override(RecurrentTorchModel)
def forward_rnn(self, inputs, state, seq_lens):
"""Feeds `inputs` (B x T x ..) through the Gru Unit.
Returns the resulting outputs as a sequence (B x T x ...).
Values are stored in self._cur_value in simple (B) shape (where B
contains both the B and T dims!).
Returns:
NN Outputs (B x T x ...) as sequence.
The state batches as a List of two items (c- and h-states).
"""
x = nn.functional.relu(self.fc1(inputs))
lstm_out = self.lstm(
x, [torch.unsqueeze(state[0], 0),
torch.unsqueeze(state[1], 0)])
action_out = self.action_branch(lstm_out[0])
self._cur_value = torch.reshape(self.value_branch(lstm_out[0]), [-1])
return action_out, [
torch.squeeze(lstm_out[1][0], 0),
torch.squeeze(lstm_out[1][1], 0)
]
if __name__ == "__main__":
args = parser.parse_args()
ray.init(num_cpus=args.num_cpus or None)
ModelCatalog.register_custom_model("rnn", RNNModel)
tune.register_env(
"repeat_initial", lambda _: RepeatInitialObsEnv(episode_len=100))
tune.register_env(
"repeat_after_me", lambda _: RepeatAfterMeEnv({"repeat_delay": 1}))
tune.register_env("stateless_cartpole", lambda _: StatelessCartPole())
config = {
"env": args.env,
"use_pytorch": True,
"num_workers": 0,
"num_envs_per_worker": 20,
"gamma": 0.9,
"entropy_coeff": 0.0001,
"model": {
"custom_model": "rnn",
"max_seq_len": 20,
"lstm_use_prev_action_reward": "store_true",
"custom_options": {
"fc_size": args.fc_size,
"lstm_state_size": args.lstm_cell_size,
}
},
"lr": 3e-4,
"num_sgd_iter": 5,
"vf_loss_coeff": 0.0003,
}
tune.run(
args.run,
stop={
"episode_reward_mean": args.stop,
"timesteps_total": 100000
},
config=config,
)
+17 -58
View File
@@ -4,70 +4,20 @@ import random
import ray
from ray import tune
from ray.rllib.agents.trainer_template import build_trainer
from ray.rllib.examples.models.eager_model import EagerModel
from ray.rllib.models import ModelCatalog
from ray.rllib.models.modelv2 import ModelV2
from ray.rllib.models.tf.fcnet_v2 import FullyConnectedNetwork
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.policy.tf_policy_template import build_tf_policy
from ray.rllib.utils import try_import_tf
from ray.rllib.utils.annotations import override
from ray.rllib.utils.framework import try_import_tf
from ray.rllib.utils.test_utils import check_learning_achieved
tf = try_import_tf()
parser = argparse.ArgumentParser()
parser.add_argument("--iters", type=int, default=200)
class EagerModel(TFModelV2):
"""Example of using embedded eager execution in a custom model.
This shows how to use tf.py_function() to execute a snippet of TF code
in eager mode. Here the `self.forward_eager` method just prints out
the intermediate tensor for debug purposes, but you can in general
perform any TF eager operation in tf.py_function().
"""
def __init__(self, observation_space, action_space, num_outputs,
model_config, name):
super().__init__(observation_space, action_space, num_outputs,
model_config, name)
inputs = tf.keras.layers.Input(shape=observation_space.shape)
self.fcnet = FullyConnectedNetwork(
obs_space=self.obs_space,
action_space=self.action_space,
num_outputs=self.num_outputs,
model_config=self.model_config,
name="fc1")
out, value_out = self.fcnet.base_model(inputs)
def lambda_(x):
eager_out = tf.py_function(self.forward_eager, [x], tf.float32)
with tf.control_dependencies([eager_out]):
eager_out.set_shape(x.shape)
return eager_out
out = tf.keras.layers.Lambda(lambda_)(out)
self.base_model = tf.keras.models.Model(inputs, [out, value_out])
self.register_variables(self.base_model.variables)
@override(ModelV2)
def forward(self, input_dict, state, seq_lens):
out, self._value_out = self.base_model(input_dict["obs"], state,
seq_lens)
return out, []
@override(ModelV2)
def value_function(self):
return tf.reshape(self._value_out, [-1])
def forward_eager(self, feature_layer):
assert tf.executing_eagerly()
if random.random() > 0.99:
print("Eagerly printing the feature layer mean value",
tf.reduce_mean(feature_layer))
return feature_layer
parser.add_argument("--stop-iters", type=int, default=200)
parser.add_argument("--stop-timesteps", type=int, default=100000)
parser.add_argument("--stop-reward", type=float, default=150)
parser.add_argument("--as-test", action="store_true")
def policy_gradient_loss(policy, model, dist_class, train_batch):
@@ -119,5 +69,14 @@ if __name__ == "__main__":
"custom_model": "eager_model"
},
}
stop = {
"timesteps_total": args.stop_timesteps,
"training_iteration": args.stop_iters,
"episode_reward_mean": args.stop_reward,
}
tune.run(MyTrainer, stop={"training_iteration": args.iters}, config=config)
results = tune.run(MyTrainer, stop=stop, config=config)
if args.as_test:
check_learning_achieved(results, args.stop_reward)
ray.shutdown()
+2 -2
View File
@@ -15,9 +15,9 @@ class RockPaperScissors(MultiAgentEnv):
SPOCK = 4
def __init__(self, config):
self.action_space = Discrete(3)
self.observation_space = Discrete(3)
self.sheldon_cooper = config.get("sheldon_cooper", False)
self.action_space = Discrete(5 if self.sheldon_cooper else 3)
self.observation_space = Discrete(5 if self.sheldon_cooper else 3)
self.player1 = "player1"
self.player2 = "player2"
self.last_move = None
+11 -13
View File
@@ -28,13 +28,16 @@ import logging
import ray
from ray import tune
from ray.tune import function
from ray.rllib.examples.env.windy_maze_env import WindyMazeEnv, \
HierarchicalWindyMazeEnv
from ray.tune import function
from ray.rllib.utils.test_utils import check_learning_achieved
parser = argparse.ArgumentParser()
parser.add_argument("--flat", action="store_true")
parser.add_argument("--as-test", action="store_true")
parser.add_argument("--torch", action="store_true")
parser.add_argument("--stop-iters", type=int, default=200)
parser.add_argument("--stop-reward", type=float, default=0.0)
parser.add_argument("--stop-timesteps", type=int, default=100000)
@@ -45,8 +48,9 @@ if __name__ == "__main__":
ray.init()
stop = {
"episode_reward_mean": args.stop_reward,
"training_iteration": args.stop_iters,
"timesteps_total": args.stop_timesteps,
"episode_reward_mean": args.stop_reward,
}
if args.flat:
@@ -92,15 +96,9 @@ if __name__ == "__main__":
"use_pytorch": args.torch,
}
results = tune.run(
"PPO",
stop=stop,
config=config,
)
results = tune.run("PPO", stop=stop, config=config)
# Error if stop-reward not reached.
if results.trials[0].last_result["episode_reward_mean"] < \
args.stop_reward:
raise ValueError("`stop-reward` of {} not reached!".format(
args.stop_reward))
print("ok")
if args.as_test:
check_learning_achieved(results, args.stop_reward)
ray.shutdown()
+58
View File
@@ -0,0 +1,58 @@
# Explains/tests Issues:
# https://github.com/ray-project/ray/issues/6928
# https://github.com/ray-project/ray/issues/6732
import argparse
from gym.spaces import Discrete, Box
import numpy as np
from ray.rllib.agents.ppo import PPOTrainer
from ray.rllib.examples.env.random_env import RandomEnv
from ray.rllib.examples.models.mobilenet_v2_with_lstm_models import \
MobileV2PlusRNNModel, TorchMobileV2PlusRNNModel
from ray.rllib.models import ModelCatalog
from ray.rllib.utils import try_import_tf
tf = try_import_tf()
cnn_shape = (4, 4, 3)
# The torch version of MobileNetV2 does channels first.
cnn_shape_torch = (3, 224, 224)
parser = argparse.ArgumentParser()
parser.add_argument("--torch", action="store_true")
if __name__ == "__main__":
args = parser.parse_args()
# Register our custom model.
ModelCatalog.register_custom_model(
"my_model", TorchMobileV2PlusRNNModel
if args.torch else MobileV2PlusRNNModel)
# Configure our Trainer.
config = {
"use_pytorch": args.torch,
"model": {
"custom_model": "my_model",
# Extra config passed to the custom model's c'tor as kwargs.
"custom_options": {
"cnn_shape": cnn_shape_torch if args.torch else cnn_shape,
},
"max_seq_len": 20,
},
"vf_share_layers": True,
"num_workers": 0, # no parallelism
"env_config": {
"action_space": Discrete(2),
# Test a simple Image observation space.
"observation_space": Box(
0.0,
1.0,
shape=cnn_shape_torch if args.torch else cnn_shape,
dtype=np.float32)
},
}
trainer = PPOTrainer(config=config, env=RandomEnv)
print(trainer.train())
@@ -111,8 +111,10 @@ class TorchCentralizedCriticModel(TorchModelV2, nn.Module):
# Central VF maps (obs, opp_obs, opp_act) -> vf_pred
input_size = 6 + 6 + 2 # obs + opp_obs + opp_act
self.central_vf_dense = SlimFC(input_size, 16, activation_fn=nn.Tanh)
self.central_vf_out = SlimFC(16, 1)
self.central_vf = nn.Sequential(
SlimFC(input_size, 16, activation_fn=nn.Tanh),
SlimFC(16, 1),
)
@override(ModelV2)
def forward(self, input_dict, state, seq_lens):
@@ -122,10 +124,9 @@ class TorchCentralizedCriticModel(TorchModelV2, nn.Module):
def central_value_function(self, obs, opponent_obs, opponent_actions):
input_ = torch.cat([
obs, opponent_obs,
torch.nn.functional.one_hot(opponent_actions, 2)
torch.nn.functional.one_hot(opponent_actions, 2).float()
], 1)
return torch.reshape(
self.central_vf_out(self.central_vf_dense(input_)), [-1])
return torch.reshape(self.central_vf(input_), [-1])
@override(ModelV2)
def value_function(self):
+164
View File
@@ -0,0 +1,164 @@
from ray.rllib.models.model import Model, restore_original_dimensions
from ray.rllib.models.modelv2 import ModelV2
from ray.rllib.models.tf.tf_action_dist import Categorical
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.models.tf.fcnet_v2 import FullyConnectedNetwork
from ray.rllib.models.torch.torch_action_dist import TorchCategorical
from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC
from ray.rllib.utils.annotations import override
from ray.rllib.utils.framework import try_import_tf, try_import_torch
from ray.rllib.offline import JsonReader
tf = try_import_tf()
torch, nn = try_import_torch()
class CustomLossModel(TFModelV2):
"""Custom model that adds an imitation loss on top of the policy loss."""
def __init__(self, obs_space, action_space, num_outputs, model_config,
name):
super().__init__(obs_space, action_space, num_outputs, model_config,
name)
self.fcnet = FullyConnectedNetwork(
self.obs_space,
self.action_space,
num_outputs,
model_config,
name="fcnet")
self.register_variables(self.fcnet.variables())
@override(ModelV2)
def forward(self, input_dict, state, seq_lens):
# Delegate to our FCNet.
return self.fcnet(input_dict, state, seq_lens)
@override(ModelV2)
def custom_loss(self, policy_loss, loss_inputs):
# Create a new input reader per worker.
reader = JsonReader(self.model_config["custom_options"]["input_files"])
input_ops = reader.tf_input_ops()
# Define a secondary loss by building a graph copy with weight sharing.
obs = restore_original_dimensions(
tf.cast(input_ops["obs"], tf.float32), self.obs_space)
logits, _ = self.forward({"obs": obs}, [], None)
# You can also add self-supervised losses easily by referencing tensors
# created during _build_layers_v2(). For example, an autoencoder-style
# loss can be added as follows:
# ae_loss = squared_diff(
# loss_inputs["obs"], Decoder(self.fcnet.last_layer))
print("FYI: You can also use these tensors: {}, ".format(loss_inputs))
# Compute the IL loss.
action_dist = Categorical(logits, self.model_config)
self.policy_loss = policy_loss
self.imitation_loss = tf.reduce_mean(
-action_dist.logp(input_ops["actions"]))
return policy_loss + 10 * self.imitation_loss
def custom_stats(self):
return {
"policy_loss": self.policy_loss,
"imitation_loss": self.imitation_loss,
}
class DeprecatedCustomLossModelV1(Model):
"""Model(V1) version of above custom-loss model."""
def _build_layers_v2(self, input_dict, num_outputs, options):
self.obs_in = input_dict["obs"]
with tf.variable_scope("shared", reuse=tf.AUTO_REUSE):
self.fcnet = FullyConnectedNetwork(input_dict, self.obs_space,
self.action_space, num_outputs,
options)
return self.fcnet.outputs, self.fcnet.last_layer
def custom_loss(self, policy_loss, loss_inputs):
# create a new input reader per worker
reader = JsonReader(self.options["custom_options"]["input_files"])
input_ops = reader.tf_input_ops()
# define a secondary loss by building a graph copy with weight sharing
obs = tf.cast(input_ops["obs"], tf.float32)
logits, _ = self._build_layers_v2({
"obs": restore_original_dimensions(obs, self.obs_space)
}, self.num_outputs, self.options)
# You can also add self-supervised losses easily by referencing tensors
# created during _build_layers_v2(). For example, an autoencoder-style
# loss can be added as follows:
# ae_loss = squared_diff(
# loss_inputs["obs"], Decoder(self.fcnet.last_layer))
print("FYI: You can also use these tensors: {}, ".format(loss_inputs))
# compute the IL loss
action_dist = Categorical(logits, self.options)
self.policy_loss = policy_loss
self.imitation_loss = tf.reduce_mean(
-action_dist.logp(input_ops["actions"]))
return policy_loss + 10 * self.imitation_loss
def custom_stats(self):
return {
"policy_loss": self.policy_loss,
"imitation_loss": self.imitation_loss,
}
class TorchCustomLossModel(TorchModelV2, nn.Module):
"""PyTorch version of the CustomLossModel above."""
def __init__(self, obs_space, action_space, num_outputs, model_config,
name, input_files):
super().__init__(obs_space, action_space, num_outputs, model_config,
name)
nn.Module.__init__(self)
self.input_files = input_files
self.fcnet = TorchFC(
self.obs_space,
self.action_space,
num_outputs,
model_config,
name="fcnet")
@override(ModelV2)
def forward(self, input_dict, state, seq_lens):
# Delegate to our FCNet.
return self.fcnet(input_dict, state, seq_lens)
@override(ModelV2)
def custom_loss(self, policy_loss, loss_inputs):
# Create a new input reader per worker.
reader = JsonReader(self.input_files)
input_ops = reader.tf_input_ops()
# Define a secondary loss by building a graph copy with weight sharing.
obs = restore_original_dimensions(
tf.cast(input_ops["obs"], tf.float32), self.obs_space)
logits, _ = self.forward({"obs": obs}, [], None)
# You can also add self-supervised losses easily by referencing tensors
# created during _build_layers_v2(). For example, an autoencoder-style
# loss can be added as follows:
# ae_loss = squared_diff(
# loss_inputs["obs"], Decoder(self.fcnet.last_layer))
print("FYI: You can also use these tensors: {}, ".format(loss_inputs))
# Compute the IL loss.
action_dist = TorchCategorical(logits, self.model_config)
self.policy_loss = policy_loss
self.imitation_loss = torch.mean(
-action_dist.logp(input_ops["actions"]))
return policy_loss + 10 * self.imitation_loss
def custom_stats(self):
return {
"policy_loss": self.policy_loss,
"imitation_loss": self.imitation_loss,
}
+43 -94
View File
@@ -16,11 +16,11 @@ import random
import ray
from ray import tune
from ray.rllib.examples.env.multi_agent import MultiAgentCartPole
from ray.rllib.examples.models.shared_weights_model import \
SharedWeightsModel1, SharedWeightsModel2, TorchSharedWeightsModel
from ray.rllib.models import ModelCatalog
from ray.rllib.models.modelv2 import ModelV2
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.rllib.utils import try_import_tf
from ray.rllib.utils.annotations import override
from ray.rllib.utils.framework import try_import_tf
from ray.rllib.utils.test_utils import check_learning_achieved
tf = try_import_tf()
@@ -28,89 +28,31 @@ parser = argparse.ArgumentParser()
parser.add_argument("--num-agents", type=int, default=4)
parser.add_argument("--num-policies", type=int, default=2)
parser.add_argument("--num-iters", type=int, default=20)
parser.add_argument("--stop-iters", type=int, default=20)
parser.add_argument("--stop-reward", type=float, default=150)
parser.add_argument("--stop-timesteps", type=int, default=100000)
parser.add_argument("--simple", action="store_true")
parser.add_argument("--num-cpus", type=int, default=0)
class CustomModel1(TFModelV2):
def __init__(self, observation_space, action_space, num_outputs,
model_config, name):
super().__init__(observation_space, action_space, num_outputs,
model_config, name)
inputs = tf.keras.layers.Input(observation_space.shape)
# Example of (optional) weight sharing between two different policies.
# Here, we share the variables defined in the 'shared' variable scope
# by entering it explicitly with tf.AUTO_REUSE. This creates the
# variables for the 'fc1' layer in a global scope called 'shared'
# outside of the policy's normal variable scope.
with tf.variable_scope(
tf.VariableScope(tf.AUTO_REUSE, "shared"),
reuse=tf.AUTO_REUSE,
auxiliary_name_scope=False):
last_layer = tf.keras.layers.Dense(
units=64, activation=tf.nn.relu, name="fc1")(inputs)
output = tf.keras.layers.Dense(
units=num_outputs, activation=None, name="fc_out")(last_layer)
vf = tf.keras.layers.Dense(
units=1, activation=None, name="value_out")(last_layer)
self.base_model = tf.keras.models.Model(inputs, [output, vf])
self.register_variables(self.base_model.variables)
@override(ModelV2)
def forward(self, input_dict, state, seq_lens):
out, self._value_out = self.base_model(input_dict["obs"])
return out, []
@override(ModelV2)
def value_function(self):
return tf.reshape(self._value_out, [-1])
class CustomModel2(TFModelV2):
def __init__(self, observation_space, action_space, num_outputs,
model_config, name):
super().__init__(observation_space, action_space, num_outputs,
model_config, name)
inputs = tf.keras.layers.Input(observation_space.shape)
# Weights shared with CustomModel1.
with tf.variable_scope(
tf.VariableScope(tf.AUTO_REUSE, "shared"),
reuse=tf.AUTO_REUSE,
auxiliary_name_scope=False):
last_layer = tf.keras.layers.Dense(
units=64, activation=tf.nn.relu, name="fc1")(inputs)
output = tf.keras.layers.Dense(
units=num_outputs, activation=None, name="fc_out")(last_layer)
vf = tf.keras.layers.Dense(
units=1, activation=None, name="value_out")(last_layer)
self.base_model = tf.keras.models.Model(inputs, [output, vf])
self.register_variables(self.base_model.variables)
@override(ModelV2)
def forward(self, input_dict, state, seq_lens):
out, self._value_out = self.base_model(input_dict["obs"])
return out, []
@override(ModelV2)
def value_function(self):
return tf.reshape(self._value_out, [-1])
parser.add_argument("--as-test", action="store_true")
parser.add_argument("--torch", action="store_true")
if __name__ == "__main__":
args = parser.parse_args()
ray.init(num_cpus=args.num_cpus or None)
ModelCatalog.register_custom_model("model1", CustomModel1)
ModelCatalog.register_custom_model("model2", CustomModel2)
# Register the models to use.
mod1 = TorchSharedWeightsModel if args.torch else SharedWeightsModel1
mod2 = TorchSharedWeightsModel if args.torch else SharedWeightsModel2
ModelCatalog.register_custom_model("model1", mod1)
ModelCatalog.register_custom_model("model2", mod2)
# Get obs- and action Spaces.
single_env = gym.make("CartPole-v0")
obs_space = single_env.observation_space
act_space = single_env.action_space
# Each policy can have a different configuration (including custom model)
# Each policy can have a different configuration (including custom model).
def gen_policy(i):
config = {
"model": {
@@ -120,28 +62,35 @@ if __name__ == "__main__":
}
return (None, obs_space, act_space, config)
# Setup PPO with an ensemble of `num_policies` different policies
# Setup PPO with an ensemble of `num_policies` different policies.
policies = {
"policy_{}".format(i): gen_policy(i)
for i in range(args.num_policies)
}
policy_ids = list(policies.keys())
tune.run(
"PPO",
stop={"training_iteration": args.num_iters},
config={
"env": MultiAgentCartPole,
"env_config": {
"num_agents": args.num_agents,
},
"log_level": "DEBUG",
"simple_optimizer": args.simple,
"num_sgd_iter": 10,
"multiagent": {
"policies": policies,
"policy_mapping_fn": (
lambda agent_id: random.choice(policy_ids)),
},
config = {
"env": MultiAgentCartPole,
"env_config": {
"num_agents": args.num_agents,
},
)
"log_level": "DEBUG",
"simple_optimizer": args.simple,
"num_sgd_iter": 10,
"multiagent": {
"policies": policies,
"policy_mapping_fn": (lambda agent_id: random.choice(policy_ids)),
},
"use_pytorch": args.torch,
}
stop = {
"episode_reward_mean": args.stop_reward,
"timesteps_total": args.stop_timesteps,
"training_iteration": args.stop_iters,
}
results = tune.run("PPO", stop=stop, config=config)
if args.as_test:
check_learning_achieved(results, args.stop_reward)
ray.shutdown()
+25 -26
View File
@@ -18,32 +18,17 @@ import gym
import ray
from ray import tune
from ray.rllib.examples.env.multi_agent import MultiAgentCartPole
from ray.rllib.policy import Policy
from ray.tune.registry import register_env
from ray.rllib.examples.env.multi_agent import MultiAgentCartPole
from ray.rllib.examples.policy.random_policy import RandomPolicy
from ray.rllib.utils.test_utils import check_learning_achieved
parser = argparse.ArgumentParser()
parser.add_argument("--num-iters", type=int, default=20)
class RandomPolicy(Policy):
"""Hand-coded policy that returns random actions."""
def compute_actions(self,
obs_batch,
state_batches=None,
prev_action_batch=None,
prev_reward_batch=None,
info_batch=None,
episodes=None,
**kwargs):
"""Compute actions on a batch of observations."""
return [self.action_space.sample() for _ in obs_batch], [], {}
def learn_on_batch(self, samples):
"""No learning."""
return {}
parser.add_argument("--torch", action="store_true")
parser.add_argument("--as-test", action="store_true")
parser.add_argument("--stop-iters", type=int, default=20)
parser.add_argument("--stop-reward", type=float, default=150)
parser.add_argument("--stop-timesteps", type=int, default=100000)
if __name__ == "__main__":
args = parser.parse_args()
@@ -56,18 +41,32 @@ if __name__ == "__main__":
obs_space = single_env.observation_space
act_space = single_env.action_space
tune.run(
stop = {
"training_iteration": args.stop_iters,
"episode_reward_mean": args.stop_reward,
"timesteps_total": args.stop_timesteps,
}
results = tune.run(
"PG",
stop={"training_iteration": args.num_iters},
stop=stop,
config={
"env": "multi_agent_cartpole",
"multiagent": {
"policies": {
"pg_policy": (None, obs_space, act_space, {}),
"pg_policy": (None, obs_space, act_space, {
"use_pytorch": args.torch
}),
"random": (RandomPolicy, obs_space, act_space, {}),
},
"policy_mapping_fn": (
lambda agent_id: ["pg_policy", "random"][agent_id % 2]),
},
"use_pytorch": args.torch,
},
)
if args.as_test:
check_learning_achieved(results, args.stop_reward)
ray.shutdown()
+36 -10
View File
@@ -12,19 +12,27 @@ import argparse
import gym
import ray
from ray.rllib.agents.dqn.dqn import DQNTrainer
from ray.rllib.agents.dqn.dqn_tf_policy import DQNTFPolicy
from ray.rllib.agents.ppo.ppo import PPOTrainer
from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy
from ray.rllib.agents.dqn import DQNTrainer, DQNTFPolicy, DQNTorchPolicy
from ray.rllib.agents.ppo import PPOTrainer, PPOTFPolicy, PPOTorchPolicy
from ray.rllib.examples.env.multi_agent import MultiAgentCartPole
from ray.tune.logger import pretty_print
from ray.tune.registry import register_env
parser = argparse.ArgumentParser()
parser.add_argument("--num-iters", type=int, default=20)
# Use torch for both policies.
parser.add_argument("--torch", action="store_true")
# Mix PPO=tf and DQN=torch if set.
parser.add_argument("--mixed-torch-tf", action="store_true")
parser.add_argument("--as-test", action="store_true")
parser.add_argument("--stop-iters", type=int, default=20)
parser.add_argument("--stop-reward", type=float, default=50)
parser.add_argument("--stop-timesteps", type=int, default=100000)
if __name__ == "__main__":
args = parser.parse_args()
assert not (args.torch and args.mixed_torch_tf),\
"Use either --torch or --mixed-torch-tf, not both!"
ray.init()
# Simple environment with 4 independent cartpole entities
@@ -37,8 +45,10 @@ if __name__ == "__main__":
# You can also have multiple policies per trainer, but here we just
# show one each for PPO and DQN.
policies = {
"ppo_policy": (PPOTFPolicy, obs_space, act_space, {}),
"dqn_policy": (DQNTFPolicy, obs_space, act_space, {}),
"ppo_policy": (PPOTorchPolicy if args.torch else PPOTFPolicy,
obs_space, act_space, {}),
"dqn_policy": (DQNTorchPolicy if args.torch or args.mixed_torch_tf else
DQNTFPolicy, obs_space, act_space, {}),
}
def policy_mapping_fn(agent_id):
@@ -59,6 +69,7 @@ if __name__ == "__main__":
# disable filters, otherwise we would need to synchronize those
# as well to the DQN agent
"observation_filter": "NoFilter",
"use_pytorch": args.torch,
})
dqn_trainer = DQNTrainer(
@@ -71,6 +82,7 @@ if __name__ == "__main__":
},
"gamma": 0.95,
"n_step": 3,
"use_pytorch": args.torch or args.mixed_torch_tf,
})
# You should see both the printed X and Y approach 200 as this trains:
@@ -78,17 +90,31 @@ if __name__ == "__main__":
# policy_reward_mean:
# dqn_policy: X
# ppo_policy: Y
for i in range(args.num_iters):
for i in range(args.stop_iters):
print("== Iteration", i, "==")
# improve the DQN policy
print("-- DQN --")
print(pretty_print(dqn_trainer.train()))
result_dqn = dqn_trainer.train()
print(pretty_print(result_dqn))
# improve the PPO policy
print("-- PPO --")
print(pretty_print(ppo_trainer.train()))
result_ppo = ppo_trainer.train()
print(pretty_print(result_ppo))
# Test passed gracefully.
if args.as_test and \
result_dqn["episode_reward_mean"] > args.stop_reward and \
result_ppo["episode_reward_mean"] > args.stop_reward:
print("test passed (both agents above requested reward)")
quit(0)
# swap weights to synchronize
dqn_trainer.set_weights(ppo_trainer.get_weights(["ppo_policy"]))
ppo_trainer.set_weights(dqn_trainer.get_weights(["dqn_policy"]))
# Desired reward not reached.
if args.as_test:
raise ValueError("Desired reward ({}) not reached!".format(
args.stop_reward))
+21 -19
View File
@@ -1,22 +1,20 @@
import argparse
from gym.spaces import Dict, Tuple, Box, Discrete
import sys
import ray
import ray.tune as tune
from ray.tune.registry import register_env
from ray.rllib.examples.env.nested_space_repeat_after_me_env import \
NestedSpaceRepeatAfterMeEnv
from ray.rllib.utils import try_import_tree
from ray.rllib.utils.framework import try_import_tf
tf = try_import_tf()
tree = try_import_tree()
from ray.rllib.utils.test_utils import check_learning_achieved
parser = argparse.ArgumentParser()
parser.add_argument("--run", type=str, default="PPO")
parser.add_argument("--torch", action="store_true")
parser.add_argument("--stop", type=int, default=90)
parser.add_argument("--max-trainstop", type=int, default=90)
parser.add_argument("--as-test", action="store_true")
parser.add_argument("--stop-reward", type=float, default=0.0)
parser.add_argument("--stop-iters", type=int, default=100)
parser.add_argument("--stop-timesteps", type=int, default=100000)
parser.add_argument("--num-cpus", type=int, default=0)
if __name__ == "__main__":
@@ -40,19 +38,23 @@ if __name__ == "__main__":
},
"entropy_coeff": 0.00005, # We don't want high entropy in this Env.
"gamma": 0.0, # No history in Env (bandit problem).
"lr": 0.0003,
"lr": 0.0005,
"num_envs_per_worker": 20,
"num_sgd_iter": 20,
"num_sgd_iter": 4,
"num_workers": 0,
"use_pytorch": args.torch,
"vf_loss_coeff": 0.01,
"use_pytorch": args.torch,
}
import ray.rllib.agents.ppo as ppo
trainer = ppo.PPOTrainer(config=config)
for _ in range(100):
results = trainer.train()
print(results)
if results["episode_reward_mean"] > args.stop:
sys.exit(0) # Learnt, exit gracefully.
sys.exit(1) # Done, but did not learn, exit with error.
stop = {
"training_iteration": args.stop_iters,
"episode_reward_mean": args.stop_reward,
"timesteps_total": args.stop_timesteps,
}
results = tune.run(args.run, config=config, stop=stop)
if args.as_test:
check_learning_achieved(results, args.stop_reward)
ray.shutdown()
+34 -74
View File
@@ -15,83 +15,34 @@ Working configurations are given below.
"""
import argparse
from gym.spaces import Box
import ray
from ray import tune
from ray.rllib.agents.dqn.distributional_q_tf_model import \
DistributionalQTFModel
from ray.tune.registry import register_env
from ray.rllib.examples.env.parametric_actions_cartpole import \
ParametricActionsCartPole
from ray.rllib.examples.models.parametric_actions_model import \
ParametricActionsModel, TorchParametricActionsModel
from ray.rllib.models import ModelCatalog
from ray.rllib.models.tf.fcnet_v2 import FullyConnectedNetwork
from ray.rllib.models.tf.tf_modelv2 import TFModelV2
from ray.tune.registry import register_env
from ray.rllib.utils import try_import_tf
tf = try_import_tf()
from ray.rllib.utils.test_utils import check_learning_achieved
parser = argparse.ArgumentParser()
parser.add_argument("--stop", type=int, default=200)
parser.add_argument("--run", type=str, default="PPO")
class ParametricActionsModel(DistributionalQTFModel, TFModelV2):
"""Parametric action model that handles the dot product and masking.
This assumes the outputs are logits for a single Categorical action dist.
Getting this to work with a more complex output (e.g., if the action space
is a tuple of several distributions) is also possible but left as an
exercise to the reader.
"""
def __init__(self,
obs_space,
action_space,
num_outputs,
model_config,
name,
true_obs_shape=(4, ),
action_embed_size=2,
**kw):
super(ParametricActionsModel, self).__init__(
obs_space, action_space, num_outputs, model_config, name, **kw)
self.action_embed_model = FullyConnectedNetwork(
Box(-1, 1, shape=true_obs_shape), action_space, action_embed_size,
model_config, name + "_action_embed")
self.register_variables(self.action_embed_model.variables())
def forward(self, input_dict, state, seq_lens):
# Extract the available actions tensor from the observation.
avail_actions = input_dict["obs"]["avail_actions"]
action_mask = input_dict["obs"]["action_mask"]
# Compute the predicted action embedding
action_embed, _ = self.action_embed_model({
"obs": input_dict["obs"]["cart"]
})
# Expand the model output to [BATCH, 1, EMBED_SIZE]. Note that the
# avail actions tensor is of shape [BATCH, MAX_ACTIONS, EMBED_SIZE].
intent_vector = tf.expand_dims(action_embed, 1)
# Batch dot product => shape of logits is [BATCH, MAX_ACTIONS].
action_logits = tf.reduce_sum(avail_actions * intent_vector, axis=2)
# Mask out invalid actions (use tf.float32.min for stability)
inf_mask = tf.maximum(tf.log(action_mask), tf.float32.min)
return action_logits + inf_mask, state
def value_function(self):
return self.action_embed_model.value_function()
parser.add_argument("--torch", action="store_true")
parser.add_argument("--as-test", action="store_true")
parser.add_argument("--stop-iters", type=int, default=200)
parser.add_argument("--stop-reward", type=float, default=150.0)
parser.add_argument("--stop-timesteps", type=int, default=100000)
if __name__ == "__main__":
args = parser.parse_args()
ray.init()
ModelCatalog.register_custom_model("pa_model", ParametricActionsModel)
register_env("pa_cartpole", lambda _: ParametricActionsCartPole(10))
ModelCatalog.register_custom_model(
"pa_model", TorchParametricActionsModel
if args.torch else ParametricActionsModel)
if args.run == "DQN":
cfg = {
# TODO(ekl) we need to set these to prevent the masked values
@@ -103,16 +54,25 @@ if __name__ == "__main__":
}
else:
cfg = {}
tune.run(
args.run,
stop={
"episode_reward_mean": args.stop,
config = dict({
"env": "pa_cartpole",
"model": {
"custom_model": "pa_model",
},
config=dict({
"env": "pa_cartpole",
"model": {
"custom_model": "pa_model",
},
"num_workers": 0,
}, **cfg),
)
"num_workers": 0,
"use_pytorch": args.torch,
}, **cfg)
stop = {
"training_iteration": args.stop_iters,
"timesteps_total": args.stop_timesteps,
"episode_reward_mean": args.stop_reward,
}
results = tune.run(args.run, stop=stop, config=config)
if args.as_test:
check_learning_achieved(results, args.stop_reward)
ray.shutdown()
@@ -8,99 +8,41 @@ This demonstrates running the following policies in competition:
"""
import argparse
import random
from gym.spaces import Discrete
import random
from ray import tune
from ray.rllib.agents.pg.pg import PGTrainer
from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy
from ray.rllib.agents.pg import PGTrainer, PGTFPolicy, PGTorchPolicy
from ray.rllib.agents.registry import get_agent_class
from ray.rllib.examples.env.rock_paper_scissors import RockPaperScissors
from ray.rllib.policy.policy import Policy
from ray.rllib.utils import try_import_tf
parser = argparse.ArgumentParser()
parser.add_argument("--stop", type=int, default=1000)
from ray.rllib.examples.policy.rock_paper_scissors_dummies import \
BeatLastHeuristic, AlwaysSameHeuristic
from ray.rllib.utils.framework import try_import_tf, try_import_torch
from ray.rllib.utils.test_utils import check_learning_achieved
tf = try_import_tf()
torch, _ = try_import_torch()
parser = argparse.ArgumentParser()
parser.add_argument("--torch", action="store_true")
parser.add_argument("--as-test", action="store_true")
parser.add_argument("--stop-iters", type=int, default=150)
parser.add_argument("--stop-reward", type=float, default=1000.0)
parser.add_argument("--stop-timesteps", type=int, default=100000)
class AlwaysSameHeuristic(Policy):
"""Pick a random move and stick with it for the entire episode."""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.exploration = self._create_exploration()
def get_initial_state(self):
return [
random.choice([
RockPaperScissors.ROCK, RockPaperScissors.PAPER,
RockPaperScissors.SCISSORS
])
]
def compute_actions(self,
obs_batch,
state_batches=None,
prev_action_batch=None,
prev_reward_batch=None,
info_batch=None,
episodes=None,
**kwargs):
return state_batches[0], state_batches, {}
def learn_on_batch(self, samples):
pass
def get_weights(self):
pass
def set_weights(self, weights):
pass
class BeatLastHeuristic(Policy):
"""Play the move that would beat the last move of the opponent."""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.exploration = self._create_exploration()
def compute_actions(self,
obs_batch,
state_batches=None,
prev_action_batch=None,
prev_reward_batch=None,
info_batch=None,
episodes=None,
**kwargs):
def successor(x):
if x[RockPaperScissors.ROCK] == 1:
return RockPaperScissors.PAPER
elif x[RockPaperScissors.PAPER] == 1:
return RockPaperScissors.SCISSORS
elif x[RockPaperScissors.SCISSORS] == 1:
return RockPaperScissors.ROCK
return [successor(x) for x in obs_batch], [], {}
def learn_on_batch(self, samples):
pass
def get_weights(self):
pass
def set_weights(self, weights):
pass
def run_same_policy(args):
def run_same_policy(args, stop):
"""Use the same policy for both agents (trivial case)."""
config = {
"env": RockPaperScissors,
"use_pytorch": args.torch,
}
tune.run(
"PG",
stop={"timesteps_total": args.stop},
config={"env": RockPaperScissors})
results = tune.run("PG", config=config, stop=stop)
if args.as_test:
# Check vs 0.0 as we are playing a zero-sum game.
check_learning_achieved(results, 0.0)
def run_heuristic_vs_learned(args, use_lstm=False, trainer="PG"):
@@ -134,16 +76,35 @@ def run_heuristic_vs_learned(args, use_lstm=False, trainer="PG"):
"learned": (None, Discrete(3), Discrete(3), {
"model": {
"use_lstm": use_lstm
}
},
"use_pytorch": args.torch,
}),
},
"policy_mapping_fn": select_policy,
},
"use_pytorch": args.torch,
}
tune.run(trainer, stop={"timesteps_total": args.stop}, config=config)
cls = get_agent_class(trainer) if isinstance(trainer, str) else trainer
trainer_obj = cls(config=config)
env = trainer_obj.workers.local_worker().env
for _ in range(args.stop_iters):
results = trainer_obj.train()
print(results)
# Timesteps reached.
if results["timesteps_total"] > args.stop_timesteps:
break
# Reward (difference) reached -> all good, return.
elif env.player1_score - env.player2_score > args.stop_reward:
return
# Reward (difference) not reached: Error if `as_test`.
if args.as_test:
raise ValueError(
"Desired reward difference ({}) not reached! Only got to {}.".
format(args.stop_reward, env.player1_score - env.player2_score))
def run_with_custom_entropy_loss(args):
def run_with_custom_entropy_loss(args, stop):
"""Example of customizing the loss function of an existing policy.
This performs about the same as the default loss does."""
@@ -151,24 +112,44 @@ def run_with_custom_entropy_loss(args):
def entropy_policy_gradient_loss(policy, model, dist_class, train_batch):
logits, _ = model.from_batch(train_batch)
action_dist = dist_class(logits, model)
return (-0.1 * action_dist.entropy() - tf.reduce_mean(
action_dist.logp(train_batch["actions"]) *
train_batch["advantages"]))
if args.torch:
# required by PGTorchPolicy's stats fn.
policy.pi_err = torch.tensor([0.0])
return torch.mean(-0.1 * action_dist.entropy() -
(action_dist.logp(train_batch["actions"]) *
train_batch["advantages"]))
else:
return (-0.1 * action_dist.entropy() - tf.reduce_mean(
action_dist.logp(train_batch["actions"]) *
train_batch["advantages"]))
EntropyPolicy = PGTFPolicy.with_updates(
policy_cls = PGTorchPolicy if args.torch else PGTFPolicy
EntropyPolicy = policy_cls.with_updates(
loss_fn=entropy_policy_gradient_loss)
EntropyLossPG = PGTrainer.with_updates(
name="EntropyPG", get_policy_class=lambda _: EntropyPolicy)
run_heuristic_vs_learned(args, use_lstm=True, trainer=EntropyLossPG)
if __name__ == "__main__":
args = parser.parse_args()
stop = {
"training_iteration": args.stop_iters,
"timesteps_total": args.stop_timesteps,
"episode_reward_mean": args.stop_reward,
}
run_same_policy(args, stop=stop)
print("run_same_policy: ok.")
run_heuristic_vs_learned(args, use_lstm=False)
print("run_heuristic_vs_learned(w/o lstm): ok.")
run_same_policy(args)
print("run_same_policy: ok.")
run_heuristic_vs_learned(args, use_lstm=True)
print("run_heuristic_vs_learned (w/ lstm): ok.")
run_with_custom_entropy_loss(args)
run_with_custom_entropy_loss(args, stop=stop)
print("run_with_custom_entropy_loss: ok.")
@@ -13,8 +13,8 @@ import ray
from ray import tune
from ray.rllib.evaluation import RolloutWorker
from ray.rllib.evaluation.metrics import collect_metrics
from ray.rllib.policy.policy import Policy
from ray.rllib.policy.sample_batch import SampleBatch
from ray.rllib.policy.tests.test_policy import TestPolicy
parser = argparse.ArgumentParser()
parser.add_argument("--gpu", action="store_true")
@@ -23,7 +23,7 @@ parser.add_argument("--num-workers", type=int, default=2)
parser.add_argument("--num-cpus", type=int, default=0)
class CustomPolicy(TestPolicy):
class CustomPolicy(Policy):
"""Example of a custom policy written from scratch.
You might find it more convenient to extend TF/TorchPolicy instead
+40 -16
View File
@@ -13,8 +13,10 @@ from ray import tune
from ray.rllib.agents.trainer_template import build_trainer
from ray.rllib.agents.dqn.dqn import DEFAULT_CONFIG as DQN_CONFIG
from ray.rllib.agents.dqn.dqn_tf_policy import DQNTFPolicy
from ray.rllib.agents.dqn.dqn_torch_policy import DQNTorchPolicy
from ray.rllib.agents.ppo.ppo import DEFAULT_CONFIG as PPO_CONFIG
from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy
from ray.rllib.agents.ppo.ppo_torch_policy import PPOTorchPolicy
from ray.rllib.evaluation.worker_set import WorkerSet
from ray.rllib.execution.common import _get_shared_metrics
from ray.rllib.execution.concurrency_ops import Concurrently
@@ -25,10 +27,16 @@ from ray.rllib.execution.replay_ops import StoreToReplayBuffer, Replay
from ray.rllib.execution.train_ops import TrainOneStep, UpdateTargetNetwork
from ray.rllib.examples.env.multi_agent import MultiAgentCartPole
from ray.rllib.optimizers.async_replay_optimizer import LocalReplayBuffer
from ray.rllib.utils.test_utils import check_learning_achieved
from ray.tune.registry import register_env
parser = argparse.ArgumentParser()
parser.add_argument("--num-iters", type=int, default=20)
parser.add_argument("--as-test", action="store_true")
parser.add_argument("--torch", action="store_true")
parser.add_argument("--mixed-torch-tf", action="store_true")
parser.add_argument("--stop-iters", type=int, default=20)
parser.add_argument("--stop-reward", type=float, default=150.0)
parser.add_argument("--stop-timesteps", type=int, default=100000)
def custom_training_workflow(workers: WorkerSet, config: dict):
@@ -90,6 +98,9 @@ def custom_training_workflow(workers: WorkerSet, config: dict):
if __name__ == "__main__":
args = parser.parse_args()
assert not (args.torch and args.mixed_torch_tf),\
"Use either --torch or --mixed-torch-tf, not both!"
ray.init()
# Simple environment with 4 independent cartpole entities
@@ -102,8 +113,10 @@ if __name__ == "__main__":
# Note that since the trainer below does not include a default policy or
# policy configs, we have to explicitly set it in the multiagent config:
policies = {
"ppo_policy": (PPOTFPolicy, obs_space, act_space, PPO_CONFIG),
"dqn_policy": (DQNTFPolicy, obs_space, act_space, DQN_CONFIG),
"ppo_policy": (PPOTorchPolicy if args.torch or args.mixed_torch_tf else
PPOTFPolicy, obs_space, act_space, PPO_CONFIG),
"dqn_policy": (DQNTorchPolicy if args.torch else DQNTFPolicy,
obs_space, act_space, DQN_CONFIG),
}
def policy_mapping_fn(agent_id):
@@ -117,16 +130,27 @@ if __name__ == "__main__":
default_policy=None,
execution_plan=custom_training_workflow)
tune.run(
MyTrainer,
stop={"training_iteration": args.num_iters},
config={
"rollout_fragment_length": 50,
"num_workers": 0,
"env": "multi_agent_cartpole",
"multiagent": {
"policies": policies,
"policy_mapping_fn": policy_mapping_fn,
"policies_to_train": ["dqn_policy", "ppo_policy"],
},
})
config = {
"rollout_fragment_length": 50,
"num_workers": 0,
"env": "multi_agent_cartpole",
"multiagent": {
"policies": policies,
"policy_mapping_fn": policy_mapping_fn,
"policies_to_train": ["dqn_policy", "ppo_policy"],
},
"use_pytorch": args.torch,
}
stop = {
"training_iteration": args.stop_iters,
"timesteps_total": args.stop_timesteps,
"episode_reward_mean": args.stop_reward,
}
results = tune.run(MyTrainer, config=config, stop=stop)
if args.as_test:
check_learning_achieved(results, args.stop_reward)
ray.shutdown()
+24 -10
View File
@@ -17,11 +17,15 @@ from ray import tune
from ray.tune import register_env, grid_search
from ray.rllib.env.multi_agent_env import ENV_STATE
from ray.rllib.examples.env.two_step_game import TwoStepGame
from ray.rllib.utils.test_utils import check_learning_achieved
parser = argparse.ArgumentParser()
parser.add_argument("--stop", type=int, default=50000)
parser.add_argument("--run", type=str, default="PG")
parser.add_argument("--num-cpus", type=int, default=0)
parser.add_argument("--as-test", action="store_true")
parser.add_argument("--torch", action="store_true")
parser.add_argument("--stop-reward", type=float, default=7.0)
parser.add_argument("--stop-timesteps", type=int, default=50000)
if __name__ == "__main__":
args = parser.parse_args()
@@ -73,6 +77,7 @@ if __name__ == "__main__":
},
"policy_mapping_fn": lambda x: "pol1" if x == 0 else "pol2",
},
"use_pytorch": args.torch,
}
group = False
elif args.run == "QMIX":
@@ -87,6 +92,7 @@ if __name__ == "__main__":
"separate_state_space": True,
"one_hot_state_encoding": True
},
"use_pytorch": args.torch,
}
group = True
elif args.run == "APEX_QMIX":
@@ -107,6 +113,7 @@ if __name__ == "__main__":
"separate_state_space": True,
"one_hot_state_encoding": True
},
"use_pytorch": args.torch,
}
group = True
else:
@@ -114,12 +121,19 @@ if __name__ == "__main__":
group = False
ray.init(num_cpus=args.num_cpus or None)
tune.run(
args.run,
stop={
"timesteps_total": args.stop,
},
config=dict(config, **{
"env": "grouped_twostep" if group else TwoStepGame,
}),
)
stop = {
"episode_reward_mean": args.stop_reward,
"timesteps_total": args.stop_timesteps,
}
config = dict(config, **{
"env": "grouped_twostep" if group else TwoStepGame,
})
results = tune.run(args.run, stop=stop, config=config)
if args.as_test:
check_learning_achieved(results, args.stop_reward)
ray.shutdown()
+6 -6
View File
@@ -7,16 +7,16 @@ from ray.tune.registry import register_env
from ray.rllib.agents.pg import PGTrainer
from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy
from ray.rllib.agents.dqn.dqn_tf_policy import DQNTFPolicy
from ray.rllib.env.base_env import _MultiAgentEnvToBaseEnv
from ray.rllib.evaluation.rollout_worker import RolloutWorker
from ray.rllib.evaluation.metrics import collect_metrics
from ray.rllib.evaluation.worker_set import WorkerSet
from ray.rllib.examples.env.multi_agent import MultiAgentCartPole, \
BasicMultiAgent, EarlyDoneMultiAgent, RoundRobinMultiAgent
from ray.rllib.examples.policy.random_policy import RandomPolicy
from ray.rllib.optimizers import (SyncSamplesOptimizer, SyncReplayOptimizer,
AsyncGradientsOptimizer)
from ray.rllib.tests.test_rollout_worker import MockPolicy
from ray.rllib.evaluation.rollout_worker import RolloutWorker
from ray.rllib.policy.tests.test_policy import TestPolicy
from ray.rllib.evaluation.metrics import collect_metrics
from ray.rllib.evaluation.worker_set import WorkerSet
from ray.rllib.env.base_env import _MultiAgentEnvToBaseEnv
def one_hot(i, n):
@@ -297,7 +297,7 @@ class TestMultiAgentEnv(unittest.TestCase):
def test_custom_rnn_state_values(self):
h = {"some": {"arbitrary": "structure", "here": [1, 2, 3]}}
class StatefulPolicy(TestPolicy):
class StatefulPolicy(RandomPolicy):
def compute_actions(self,
obs_batch,
state_batches=None,
+3 -3
View File
@@ -13,13 +13,13 @@ from ray.rllib.env.vector_env import VectorEnv
from ray.rllib.evaluation.rollout_worker import RolloutWorker
from ray.rllib.evaluation.metrics import collect_metrics
from ray.rllib.evaluation.postprocessing import compute_advantages
from ray.rllib.policy.tests.test_policy import TestPolicy
from ray.rllib.examples.policy.random_policy import RandomPolicy
from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, SampleBatch
from ray.rllib.utils.test_utils import check
from ray.tune.registry import register_env
class MockPolicy(TestPolicy):
class MockPolicy(RandomPolicy):
def compute_actions(self,
obs_batch,
state_batches=None,
@@ -40,7 +40,7 @@ class MockPolicy(TestPolicy):
batch, 100.0, 0.9, use_gae=False, use_critic=False)
class BadPolicy(MockPolicy):
class BadPolicy(RandomPolicy):
def compute_actions(self,
obs_batch,
state_batches=None,