From 57544b1ff9f97d4da9f64d25c8ea5a3d8d247ffc Mon Sep 17 00:00:00 2001 From: Sven Mika Date: Tue, 12 May 2020 08:23:10 +0200 Subject: [PATCH] [RLlib] Examples folder restructuring (Model examples; final part). (#8278) - This PR completes any previously missing PyTorch Model counterparts to TFModels in examples/models. - It also makes sure, all example scripts in the rllib/examples folder are tested for both frameworks and learn the given task (this is often currently not checked) using a --as-test flag in connection with a --stop-reward. --- .travis.yml | 14 +- rllib/BUILD | 500 +++++++++++++----- rllib/agents/a3c/a3c_torch_policy.py | 9 +- rllib/agents/ddpg/ddpg_torch_model.py | 6 +- .../bandits/examples/simple_context_bandit.py | 33 +- rllib/examples/autoregressive_action_dist.py | 209 ++------ rllib/examples/batch_norm_model.py | 160 +----- rllib/examples/cartpole_lstm.py | 44 +- rllib/examples/centralized_critic.py | 182 ++++--- rllib/examples/centralized_critic_2.py | 117 ++-- rllib/examples/custom_env.py | 94 +++- rllib/examples/custom_eval.py | 75 +-- rllib/examples/custom_fast_model.py | 82 +-- .../custom_keras_cnn_plus_rnn_model.py | 115 ---- rllib/examples/custom_keras_model.py | 2 +- rllib/examples/custom_keras_rnn_model.py | 117 ---- rllib/examples/custom_loss.py | 88 +-- .../examples/custom_metrics_and_callbacks.py | 4 +- .../custom_metrics_and_callbacks_legacy.py | 4 +- rllib/examples/custom_rnn_model.py | 62 +++ rllib/examples/custom_tf_policy.py | 4 +- rllib/examples/custom_torch_policy.py | 4 +- rllib/examples/custom_torch_rnn_model.py | 128 ----- rllib/examples/eager_execution.py | 75 +-- rllib/examples/env/rock_paper_scissors.py | 4 +- rllib/examples/hierarchical_training.py | 24 +- rllib/examples/mobilenet_v2_with_lstm.py | 58 ++ .../models/centralized_critic_models.py | 11 +- rllib/examples/models/custom_loss_model.py | 164 ++++++ rllib/examples/multi_agent_cartpole.py | 137 ++--- rllib/examples/multi_agent_custom_policy.py | 51 +- rllib/examples/multi_agent_two_trainers.py | 46 +- rllib/examples/nested_action_spaces.py | 40 +- rllib/examples/parametric_actions_cartpole.py | 108 ++-- .../rock_paper_scissors_multiagent.py | 167 +++--- .../rollout_worker_custom_workflow.py | 4 +- rllib/examples/two_trainer_workflow.py | 56 +- rllib/examples/twostep_game.py | 34 +- ...test_policy.py => OBSOLETE_test_policy.py} | 0 rllib/tests/test_multi_agent_env.py | 12 +- rllib/tests/test_rollout_worker.py | 6 +- 41 files changed, 1466 insertions(+), 1584 deletions(-) delete mode 100644 rllib/examples/custom_keras_cnn_plus_rnn_model.py delete mode 100644 rllib/examples/custom_keras_rnn_model.py create mode 100644 rllib/examples/custom_rnn_model.py delete mode 100644 rllib/examples/custom_torch_rnn_model.py create mode 100644 rllib/examples/mobilenet_v2_with_lstm.py create mode 100644 rllib/examples/models/custom_loss_model.py rename rllib/policy/tests/{test_policy.py => OBSOLETE_test_policy.py} (100%) diff --git a/.travis.yml b/.travis.yml index b8fdedb40..cd1a4a891 100644 --- a/.travis.yml +++ b/.travis.yml @@ -219,14 +219,14 @@ matrix: - . ./ci/travis/ci.sh build script: - ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=examples_A,examples_B rllib/... - - ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=examples_C rllib/... - - ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=examples_E,examples_L,examples_M,examples_N,examples_P rllib/... - - ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=examples_U,examples_R,examples_S,examples_T rllib/... + - ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=examples_C,examples_D rllib/... + - ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=examples_E,examples_F,examples_G,examples_H,examples_I,examples_J,examples_K,examples_L,examples_M,examples_N,examples_O,examples_P rllib/... + - ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=examples_Q,examples_R,examples_S,examples_T,examples_U,examples_V,examples_W,examples_X,examples_Y,examples_Z rllib/... # RLlib: tests_dir: Everything in rllib/tests/ directory (A-I). - os: linux env: - - RLLIB_TESTING=1 RLLIB_TESTS_DIR_TESTS_A_TO_I=1 + - RLLIB_TESTING=1 RLLIB_TESTS_DIR_TESTS_A_TO_L=1 - PYTHON=3.6 - TF_VERSION=2.0.0b1 - TFP_VERSION=0.8 @@ -237,12 +237,12 @@ matrix: before_script: - . ./ci/travis/ci.sh build script: - - ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=tests_dir_A,tests_dir_B,tests_dir_C,tests_dir_D,tests_dir_E,tests_dir_F,tests_dir_G,tests_dir_H,tests_dir_I rllib/... + - ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=tests_dir_A,tests_dir_B,tests_dir_C,tests_dir_D,tests_dir_E,tests_dir_F,tests_dir_G,tests_dir_H,tests_dir_I,tests_dir_J,tests_dir_K,tests_dir_L rllib/... # RLlib: tests_dir: Everything in rllib/tests/ directory (J-Z). - os: linux env: - - RLLIB_TESTING=1 RLLIB_TESTS_DIR_TESTS_J_TO_Z=1 + - RLLIB_TESTING=1 RLLIB_TESTS_DIR_TESTS_M_TO_Z=1 - PYTHON=3.6 - TF_VERSION=2.0.0b1 - TFP_VERSION=0.8 @@ -253,7 +253,7 @@ matrix: before_script: - . ./ci/travis/ci.sh build script: - - ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=tests_dir_J,tests_dir_K,tests_dir_L,tests_dir_M,tests_dir_N,tests_dir_O,tests_dir_P,tests_dir_Q,tests_dir_R,tests_dir_S,tests_dir_T,tests_dir_U,tests_dir_V,tests_dir_W,tests_dir_X,tests_dir_Y,tests_dir_Z rllib/... + - ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=tests_dir_M,tests_dir_N,tests_dir_O,tests_dir_P,tests_dir_Q,tests_dir_R,tests_dir_S,tests_dir_T,tests_dir_U,tests_dir_V,tests_dir_W,tests_dir_X,tests_dir_Y,tests_dir_Z rllib/... # Cpp worker test - os: linux diff --git a/rllib/BUILD b/rllib/BUILD index a01ff851f..20c7d0460 100644 --- a/rllib/BUILD +++ b/rllib/BUILD @@ -27,8 +27,8 @@ # 2) everything in a) using tf1.x # 3) everything in b) c) d) and e) # 4) everything in g) -# 5) f), BUT only those tagged `tests_dir_A` to `tests_dir_I` -# 6) f), BUT only those tagged `tests_dir_J` to `tests_dir_Z` +# 5) f), BUT only those tagged `tests_dir_A` to `tests_dir_L` +# 6) f), BUT only those tagged `tests_dir_M` to `tests_dir_Z` # -------------------------------------------------------------------- @@ -1453,95 +1453,215 @@ py_test( py_test( - name = "examples/autoregressive_action_dist", main = "examples/autoregressive_action_dist.py", + name = "examples/autoregressive_action_dist_tf", + main = "examples/autoregressive_action_dist.py", tags = ["examples", "examples_A"], - size = "large", + size = "medium", srcs = ["examples/autoregressive_action_dist.py"], - args = ["--stop=150", "--num-cpus=4"] + args = ["--as-test", "--stop-reward=150", "--num-cpus=4"] ) py_test( - name = "examples/batch_norm_model_ppo", main="examples/batch_norm_model.py", + name = "examples/autoregressive_action_dist_torch", + main = "examples/autoregressive_action_dist.py", + tags = ["examples", "examples_A"], + size = "medium", + srcs = ["examples/autoregressive_action_dist.py"], + args = ["--as-test", "--torch", "--stop-reward=150", "--num-cpus=4"] +) + +py_test( + name = "examples/batch_norm_model_ppo_tf", + main = "examples/batch_norm_model.py", tags = ["examples", "examples_B"], size = "medium", srcs = ["examples/batch_norm_model.py"], - args = ["--run=PPO", "--num-iters=1"] + args = ["--as-test", "--run=PPO", "--stop-reward=80"] ) py_test( - name = "examples/batch_norm_model_pg", main="examples/batch_norm_model.py", + name = "examples/batch_norm_model_ppo_torch", + main = "examples/batch_norm_model.py", tags = ["examples", "examples_B"], size = "medium", srcs = ["examples/batch_norm_model.py"], - args = ["--run=PG", "--num-iters=1"] + args = ["--as-test", "--torch", "--run=PPO", "--stop-reward=80"] ) py_test( - name = "examples/batch_norm_model_dqn", main="examples/batch_norm_model.py", + name = "examples/batch_norm_model_dqn_tf", + main = "examples/batch_norm_model.py", tags = ["examples", "examples_B"], - size = "medium", + size = "medium", # DQN learns much slower with BatchNorm. srcs = ["examples/batch_norm_model.py"], - args = ["--run=DQN", "--num-iters=1"] + args = ["--as-test", "--run=DQN", "--stop-reward=70"] ) py_test( - name = "examples/batch_norm_model_ddpg", main="examples/batch_norm_model.py", + name = "examples/batch_norm_model_dqn_torch", + main = "examples/batch_norm_model.py", tags = ["examples", "examples_B"], - size = "medium", + size = "medium", # DQN learns much slower with BatchNorm. srcs = ["examples/batch_norm_model.py"], - args = ["--run=DDPG", "--num-iters=1"] + args = ["--as-test", "--torch", "--run=DQN", "--stop-reward=70"] ) py_test( - name = "examples/cartpole_lstm_impala", main="examples/cartpole_lstm.py", + name = "examples/batch_norm_model_ddpg_tf", + main = "examples/batch_norm_model.py", + tags = ["examples", "examples_B"], + size = "small", + srcs = ["examples/batch_norm_model.py"], + args = ["--run=DDPG", "--stop-iters=1"] +) + +py_test( + name = "examples/batch_norm_model_ddpg_torch", + main = "examples/batch_norm_model.py", + tags = ["examples", "examples_B"], + size = "small", + srcs = ["examples/batch_norm_model.py"], + args = ["--torch", "--run=DDPG", "--stop-iters=1"] +) + +py_test( + name = "examples/cartpole_lstm_impala_tf", + main = "examples/cartpole_lstm.py", + tags = ["examples", "examples_C"], + size = "small", + srcs = ["examples/cartpole_lstm.py"], + args = ["--as-test", "--run=IMPALA", "--stop-reward=40", "--num-cpus=4"] +) + +py_test( + name = "examples/cartpole_lstm_impala_torch", + main = "examples/cartpole_lstm.py", tags = ["examples", "examples_C"], size = "medium", srcs = ["examples/cartpole_lstm.py"], - args = ["--run=IMPALA", "--stop=40", "--num-cpus=4"] + args = ["--as-test", "--torch", "--run=IMPALA", "--stop-reward=40", "--num-cpus=4"] ) py_test( - name = "examples/cartpole_lstm_ppo", main="examples/cartpole_lstm.py", + name = "examples/cartpole_lstm_ppo_tf", + main = "examples/cartpole_lstm.py", tags = ["examples", "examples_C"], size = "medium", srcs = ["examples/cartpole_lstm.py"], - args = ["--run=PPO", "--stop=40", "--num-cpus=4"] + args = ["--as-test", "--run=PPO", "--stop-reward=40", "--num-cpus=4"] ) py_test( - name = "examples/cartpole_lstm_ppo_with_prev_a_and_r", main="examples/cartpole_lstm.py", + name = "examples/cartpole_lstm_ppo_torch", + main = "examples/cartpole_lstm.py", tags = ["examples", "examples_C"], - size = "large", + size = "small", srcs = ["examples/cartpole_lstm.py"], - args = ["--run=PPO", "--stop=40", "--use-prev-action-reward", "--num-cpus=4"] + args = ["--as-test", "--torch", "--run=PPO", "--stop-reward=40", "--num-cpus=4"] ) py_test( - name = "examples/centralized_critic", + name = "examples/cartpole_lstm_ppo_tf_with_prev_a_and_r", + main = "examples/cartpole_lstm.py", + tags = ["examples", "examples_C"], + size = "medium", + srcs = ["examples/cartpole_lstm.py"], + args = ["--as-test", "--run=PPO", "--stop-reward=40", "--use-prev-action-reward", "--num-cpus=4"] +) + +py_test( + name = "examples/centralized_critic_tf", + main = "examples/centralized_critic.py", tags = ["examples", "examples_C"], size = "medium", srcs = ["examples/centralized_critic.py"], - args = ["--stop=2000"] + args = ["--as-test", "--stop-reward=7.2"] ) py_test( - name = "examples/centralized_critic_2", + name = "examples/centralized_critic_torch", + main = "examples/centralized_critic.py", + tags = ["examples", "examples_C"], + size = "medium", + srcs = ["examples/centralized_critic.py"], + args = ["--as-test", "--torch", "--stop-reward=7.2"] +) + +py_test( + name = "examples/centralized_critic_2_tf", + main = "examples/centralized_critic_2.py", tags = ["examples", "examples_C"], size = "medium", srcs = ["examples/centralized_critic_2.py"], - args = ["--stop=2000"] + args = ["--as-test", "--stop-reward=6.0"] ) py_test( - name = "examples/custom_eval", main = "examples/custom_eval.py", + name = "examples/centralized_critic_2_torch", + main = "examples/centralized_critic_2.py", tags = ["examples", "examples_C"], size = "medium", - srcs = ["examples/custom_eval.py"], - args = ["--custom-eval", "--num-cpus=4"] + srcs = ["examples/centralized_critic_2.py"], + args = ["--as-test", "--torch", "--stop-reward=6.0"] ) py_test( - name = "examples/custom_keras_model_a2c", main="examples/custom_keras_model.py", + name = "examples/custom_env_tf", + main = "examples/custom_env.py", + tags = ["examples", "examples_C"], + size = "medium", + srcs = ["examples/custom_env.py"], + args = ["--as-test"] +) + +py_test( + name = "examples/custom_env_torch", + main = "examples/custom_env.py", + tags = ["examples", "examples_C"], + size = "medium", + srcs = ["examples/custom_env.py"], + args = ["--as-test", "--torch"] +) + +py_test( + name = "examples/custom_eval_tf", + main = "examples/custom_eval.py", + tags = ["examples", "examples_C"], + size = "small", + srcs = ["examples/custom_eval.py"], + args = ["--num-cpus=4"] +) + +py_test( + name = "examples/custom_eval_torch", + main = "examples/custom_eval.py", + tags = ["examples", "examples_C"], + size = "small", + srcs = ["examples/custom_eval.py"], + args = ["--torch", "--num-cpus=4"] +) + +py_test( + name = "examples/custom_fast_model_tf", + main = "examples/custom_fast_model.py", + tags = ["examples", "examples_C"], + size = "small", + srcs = ["examples/custom_fast_model.py"], + args = ["--stop-iters=1", "--num-cpus=4"] +) + +py_test( + name = "examples/custom_fast_model_torch", + main = "examples/custom_fast_model.py", + tags = ["examples", "examples_C"], + size = "small", + srcs = ["examples/custom_fast_model.py"], + args = ["--torch", "--stop-iters=1", "--num-cpus=4"] +) + +py_test( + name = "examples/custom_keras_model_a2c", + main = "examples/custom_keras_model.py", tags = ["examples", "examples_C"], size = "large", srcs = ["examples/custom_keras_model.py"], @@ -1549,7 +1669,8 @@ py_test( ) py_test( - name = "examples/custom_keras_model_dqn", main="examples/custom_keras_model.py", + name = "examples/custom_keras_model_dqn", + main = "examples/custom_keras_model.py", tags = ["examples", "examples_C"], size = "medium", srcs = ["examples/custom_keras_model.py"], @@ -1557,7 +1678,8 @@ py_test( ) py_test( - name = "examples/custom_keras_model_ppo", main="examples/custom_keras_model.py", + name = "examples/custom_keras_model_ppo", + main = "examples/custom_keras_model.py", tags = ["examples", "examples_C"], size = "medium", srcs = ["examples/custom_keras_model.py"], @@ -1565,46 +1687,79 @@ py_test( ) py_test( - name = "examples/custom_keras_rnn_model_repeat_after_me", main = "examples/custom_keras_rnn_model.py", - tags = ["examples", "examples_C"], - size = "large", - srcs = ["examples/custom_keras_rnn_model.py"], - args = ["--run=PPO", "--stop=50", "--env=RepeatAfterMeEnv", "--num-cpus=4"] -) - -py_test( - name = "examples/custom_keras_rnn_model_repeat_initial", - main = "examples/custom_keras_rnn_model.py", - tags = ["examples", "examples_C"], - size = "large", - srcs = ["examples/custom_keras_rnn_model.py"], - args = ["--run=PPO", "--stop=50", "--env=RepeatInitialObsEnv", "--num-cpus=4"] -) - -py_test( - name = "examples/custom_loss", + name = "examples/custom_loss_tf", + main = "examples/custom_loss.py", tags = ["examples", "examples_C"], size = "small", # Include the json data file. data = glob(["tests/data/cartpole_small/**"]), srcs = ["examples/custom_loss.py"], - args = ["--iters=2", "--input-files=tests/data/cartpole_small"] + args = ["--stop-iters=2", "--input-files=tests/data/cartpole_small"] +) + +py_test( + name = "examples/custom_loss_torch", + main = "examples/custom_loss.py", + tags = ["examples", "examples_C"], + size = "small", + # Include the json data file. + data = glob(["tests/data/cartpole_small/**"]), + srcs = ["examples/custom_loss.py"], + args = ["--torch", "--stop-iters=2", "--input-files=tests/data/cartpole_small"] ) py_test( name = "examples/custom_metrics_and_callbacks", + main = "examples/custom_metrics_and_callbacks.py", tags = ["examples", "examples_C"], size = "small", srcs = ["examples/custom_metrics_and_callbacks.py"], - args = ["--num-iters=2"] + args = ["--stop-iters=2"] ) py_test( name = "examples/custom_metrics_and_callbacks_legacy", + main = "examples/custom_metrics_and_callbacks_legacy.py", tags = ["examples", "examples_C"], size = "small", srcs = ["examples/custom_metrics_and_callbacks_legacy.py"], - args = ["--num-iters=2"] + args = ["--stop-iters=2"] +) + +py_test( + name = "examples/custom_rnn_model_repeat_after_me_tf", + main = "examples/custom_rnn_model.py", + tags = ["examples", "examples_C"], + size = "medium", + srcs = ["examples/custom_rnn_model.py"], + args = ["--as-test", "--run=PPO", "--stop-reward=40", "--env=RepeatAfterMeEnv", "--num-cpus=4"] +) + +py_test( + name = "examples/custom_rnn_model_repeat_initial_obs_tf", + main = "examples/custom_rnn_model.py", + tags = ["examples", "examples_C"], + size = "medium", + srcs = ["examples/custom_rnn_model.py"], + args = ["--as-test", "--run=PPO", "--stop-reward=10", "--stop-timesteps=300000", "--env=RepeatInitialObsEnv", "--num-cpus=4"] +) + +py_test( + name = "examples/custom_rnn_model_repeat_after_me_torch", + main = "examples/custom_rnn_model.py", + tags = ["examples", "examples_C"], + size = "medium", + srcs = ["examples/custom_rnn_model.py"], + args = ["--as-test", "--torch", "--run=PPO", "--stop-reward=40", "--env=RepeatAfterMeEnv", "--num-cpus=4"] +) + +py_test( + name = "examples/custom_rnn_model_repeat_initial_obs_torch", + main = "examples/custom_rnn_model.py", + tags = ["examples", "examples_C"], + size = "medium", + srcs = ["examples/custom_rnn_model.py"], + args = ["--as-test", "--torch", "--run=PPO", "--stop-reward=10", "--stop-timesteps=300000", "--env=RepeatInitialObsEnv", "--num-cpus=4"] ) py_test( @@ -1612,16 +1767,7 @@ py_test( tags = ["examples", "examples_C"], size = "medium", srcs = ["examples/custom_tf_policy.py"], - args = ["--iters=2", "--num-cpus=4"] -) - -py_test( - name = "examples/custom_torch_rnn_model", - main = "examples/custom_torch_rnn_model.py", - tags = ["examples", "examples_C"], - size = "medium", - srcs = ["examples/custom_torch_rnn_model.py"], - args = ["--run=PPO", "--stop=90", "--num-cpus=4"] + args = ["--stop-iters=2", "--num-cpus=4"] ) py_test( @@ -1629,7 +1775,7 @@ py_test( tags = ["examples", "examples_C"], size = "small", srcs = ["examples/custom_torch_policy.py"], - args = ["--iters=2", "--num-cpus=4"] + args = ["--stop-iters=2", "--num-cpus=4"] ) py_test( @@ -1637,90 +1783,151 @@ py_test( tags = ["examples", "examples_E"], size = "small", srcs = ["examples/eager_execution.py"], - args = ["--iters=2"] + args = ["--stop-iters=2"] ) py_test( name = "examples/hierarchical_training_tf", + main = "examples/hierarchical_training.py", tags = ["examples", "examples_H"], - size = "small", + size = "medium", srcs = ["examples/hierarchical_training.py"], args = ["--stop-reward=0.0"] ) py_test( name = "examples/hierarchical_training_torch", + main = "examples/hierarchical_training.py", tags = ["examples", "examples_H"], - size = "small", + size = "medium", srcs = ["examples/hierarchical_training.py"], args = ["--torch", "--stop-reward=0.0"] ) +# Do not run this test (MobileNetV2 is gigantic and takes forever for 1 iter). +# py_test( +# name = "examples/mobilenet_v2_with_lstm_tf", +# main = "examples/mobilenet_v2_with_lstm.py", +# tags = ["examples", "examples_M"], +# size = "small", +# srcs = ["examples/mobilenet_v2_with_lstm.py"] +# ) + py_test( - name = "examples/multi_agent_cartpole", + name = "examples/multi_agent_cartpole_tf", + main = "examples/multi_agent_cartpole.py", tags = ["examples", "examples_M"], size = "medium", srcs = ["examples/multi_agent_cartpole.py"], - args = ["--num-iters=2", "--num-cpus=4"] + args = ["--as-test", "--stop-reward=70.0", "--num-cpus=4"] ) py_test( - name = "examples/multi_agent_custom_policy", + name = "examples/multi_agent_cartpole_torch", + main = "examples/multi_agent_cartpole.py", tags = ["examples", "examples_M"], - size = "medium", - srcs = ["examples/multi_agent_custom_policy.py"], + size = "small", + srcs = ["examples/multi_agent_cartpole.py"], + args = ["--as-test", "--torch", "--stop-reward=70.0", "--num-cpus=4"] ) py_test( - name = "examples/multi_agent_two_trainers", + name = "examples/multi_agent_custom_policy_tf", + main = "examples/multi_agent_custom_policy.py", + tags = ["examples", "examples_M"], + size = "small", + srcs = ["examples/multi_agent_custom_policy.py"], + args = ["--as-test", "--stop-reward=80"] +) + +py_test( + name = "examples/multi_agent_custom_policy_torch", + main = "examples/multi_agent_custom_policy.py", + tags = ["examples", "examples_M"], + size = "small", + srcs = ["examples/multi_agent_custom_policy.py"], + args = ["--as-test", "--torch", "--stop-reward=80"] +) + +py_test( + name = "examples/multi_agent_two_trainers_tf", + main = "examples/multi_agent_two_trainers.py", tags = ["examples", "examples_M"], size = "medium", srcs = ["examples/multi_agent_two_trainers.py"], - args = ["--num-iters=2"] + args = ["--as-test", "--stop-reward=70"] ) py_test( - name = "examples/two_trainer_workflow", - tags = ["examples", "examples_T"], + name = "examples/multi_agent_two_trainers_torch", + main = "examples/multi_agent_two_trainers.py", + tags = ["examples", "examples_M"], size = "medium", - srcs = ["examples/two_trainer_workflow.py"], - args = ["--num-iters=2"] + srcs = ["examples/multi_agent_two_trainers.py"], + args = ["--as-test", "--torch", "--stop-reward=70"] ) py_test( - name = "examples/nested_action_spaces_ppo", + name = "examples/multi_agent_two_trainers_mixed_torch_tf", + main = "examples/multi_agent_two_trainers.py", + tags = ["examples", "examples_M"], + size = "small", + srcs = ["examples/multi_agent_two_trainers.py"], + args = ["--as-test", "--mixed-torch-tf", "--stop-reward=70"] +) + +py_test( + name = "examples/nested_action_spaces_ppo_tf", main = "examples/nested_action_spaces.py", tags = ["examples", "examples_N"], size = "medium", srcs = ["examples/nested_action_spaces.py"], - args = ["--stop=-500", "--run=PPO"] + args = ["--as-test", "--stop-reward=-600", "--run=PPO"] ) py_test( - name = "examples/parametric_actions_cartpole_pg", + name = "examples/nested_action_spaces_ppo_torch", + main = "examples/nested_action_spaces.py", + tags = ["examples", "examples_N"], + size = "medium", + srcs = ["examples/nested_action_spaces.py"], + args = ["--as-test", "--torch", "--stop-reward=-600", "--run=PPO"] +) + +py_test( + name = "examples/parametric_actions_cartpole_pg_tf", main = "examples/parametric_actions_cartpole.py", tags = ["examples", "examples_P"], size = "medium", srcs = ["examples/parametric_actions_cartpole.py"], - args = ["--run=PG", "--stop=50"] + args = ["--as-test", "--stop-reward=60.0", "--run=PG"] ) py_test( - name = "examples/parametric_actions_cartpole_ppo", + name = "examples/parametric_actions_cartpole_dqn_tf", main = "examples/parametric_actions_cartpole.py", tags = ["examples", "examples_P"], size = "medium", srcs = ["examples/parametric_actions_cartpole.py"], - args = ["--run=PPO", "--stop=50"] + args = ["--as-test", "--stop-reward=60.0", "--run=DQN"] ) py_test( - name = "examples/parametric_actions_cartpole_dqn", + name = "examples/parametric_actions_cartpole_pg_torch", + main = "examples/parametric_actions_cartpole.py", + tags = ["examples", "examples_P"], + size = "small", + srcs = ["examples/parametric_actions_cartpole.py"], + args = ["--as-test", "--torch", "--stop-reward=60.0", "--run=PG"] +) + +py_test( + name = "examples/parametric_actions_cartpole_dqn_torch", main = "examples/parametric_actions_cartpole.py", tags = ["examples", "examples_P"], size = "medium", srcs = ["examples/parametric_actions_cartpole.py"], - args = ["--run=DQN", "--stop=50"] + args = ["--as-test", "--torch", "--stop-reward=60.0", "--run=DQN"] ) py_test( @@ -1731,9 +1938,27 @@ py_test( args = ["--num-cpus=4"] ) +py_test( + name = "examples/rock_paper_scissors_multiagent_tf", + main = "examples/rock_paper_scissors_multiagent.py", + tags = ["examples", "examples_R"], + size = "medium", + srcs = ["examples/rock_paper_scissors_multiagent.py"], + args = ["--as-test"], +) + +py_test( + name = "examples/rock_paper_scissors_multiagent_torch", + main = "examples/rock_paper_scissors_multiagent.py", + tags = ["examples", "examples_R"], + size = "medium", + srcs = ["examples/rock_paper_scissors_multiagent.py"], + args = ["--as-test", "--torch"], +) + sh_test( name = "examples/serving/test_local_inference", - tags = ["examples", "examples_L", "exclusive"], + tags = ["examples", "examples_S", "exclusive"], size = "medium", srcs = ["examples/serving/test_local_inference.sh"], data = glob(["examples/serving/*.py"]), @@ -1741,27 +1966,82 @@ sh_test( sh_test( name = "examples/serving/test_remote_inference", - tags = ["examples", "examples_R", "exclusive"], + tags = ["examples", "examples_S", "exclusive"], size = "medium", srcs = ["examples/serving/test_remote_inference.sh"], data = glob(["examples/serving/*.py"]), ) py_test( - name = "examples/rock_paper_scissors_multiagent", - main = "examples/rock_paper_scissors_multiagent.py", - tags = ["examples", "examples_R"], - size = "large", - srcs = ["examples/rock_paper_scissors_multiagent.py"], - args = ["--stop=200"], + name = "examples/two_trainer_workflow_tf", + main = "examples/two_trainer_workflow.py", + tags = ["examples", "examples_T"], + size = "small", + srcs = ["examples/two_trainer_workflow.py"], + args = ["--as-test", "--stop-reward=100.0"] ) py_test( - name = "examples/twostep_game_maddpg", main = "examples/twostep_game.py", + name = "examples/two_trainer_workflow_torch", + main = "examples/two_trainer_workflow.py", + tags = ["examples", "examples_T"], + size = "small", + srcs = ["examples/two_trainer_workflow.py"], + args = ["--as-test", "--torch", "--stop-reward=100.0"] +) + +py_test( + name = "examples/two_trainer_workflow_mixed_torch_tf", + main = "examples/two_trainer_workflow.py", + tags = ["examples", "examples_T"], + size = "small", + srcs = ["examples/two_trainer_workflow.py"], + args = ["--as-test", "--mixed-torch-tf", "--stop-reward=100.0"] +) + +py_test( + name = "examples/twostep_game_maddpg", + main = "examples/twostep_game.py", tags = ["examples", "examples_T"], size = "large", srcs = ["examples/twostep_game.py"], - args = ["--stop=2000", "--run=contrib/MADDPG"] + args = ["--stop-timesteps=2000", "--run=contrib/MADDPG"] +) + +py_test( + name = "examples/twostep_game_pg_tf", + main = "examples/twostep_game.py", + tags = ["examples", "examples_T"], + size = "medium", + srcs = ["examples/twostep_game.py"], + args = ["--as-test", "--stop-reward=7", "--run=PG"] +) + +py_test( + name = "examples/twostep_game_pg_torch", + main = "examples/twostep_game.py", + tags = ["examples", "examples_T"], + size = "medium", + srcs = ["examples/twostep_game.py"], + args = ["--as-test", "--torch", "--stop-reward=7", "--run=PG"] +) + +py_test( + name = "examples/twostep_game_qmix", + main = "examples/twostep_game.py", + tags = ["examples", "examples_T"], + size = "medium", + srcs = ["examples/twostep_game.py"], + args = ["--stop-timesteps=2000", "--run=QMIX"] +) + +py_test( + name = "examples/twostep_game_apex_qmix", + main = "examples/twostep_game.py", + tags = ["examples", "examples_T"], + size = "medium", + srcs = ["examples/twostep_game.py"], + args = ["--stop-timesteps=2000", "--run=APEX_QMIX", "--num-cpus=4"] ) py_test( @@ -1770,7 +2050,7 @@ py_test( tags = ["examples", "examples_T"], size = "small", srcs = ["contrib/bandits/examples/simple_context_bandit.py"], - args = ["--stop-at-reward=10", "--run=contrib/LinTS"], + args = ["--as-test", "--stop-reward=10", "--run=contrib/LinTS"], ) py_test( @@ -1779,29 +2059,5 @@ py_test( tags = ["examples", "examples_U"], size = "small", srcs = ["contrib/bandits/examples/simple_context_bandit.py"], - args = ["--stop-at-reward=10", "--run=contrib/LinUCB"], -) - -py_test( - name = "examples/twostep_game_pg", main = "examples/twostep_game.py", - tags = ["examples", "examples_T"], - size = "medium", - srcs = ["examples/twostep_game.py"], - args = ["--stop=2000", "--run=PG"] -) - -py_test( - name = "examples/twostep_game_qmix", main = "examples/twostep_game.py", - tags = ["examples", "examples_T"], - size = "medium", - srcs = ["examples/twostep_game.py"], - args = ["--stop=2000", "--run=QMIX"] -) - -py_test( - name = "examples/twostep_game_apex_qmix", main = "examples/twostep_game.py", - tags = ["examples", "examples_T"], - size = "medium", - srcs = ["examples/twostep_game.py"], - args = ["--stop=2000", "--run=APEX_QMIX", "--num-cpus=4"] + args = ["--as-test", "--stop-reward=10", "--run=contrib/LinUCB"], ) diff --git a/rllib/agents/a3c/a3c_torch_policy.py b/rllib/agents/a3c/a3c_torch_policy.py index 5babc3e8b..3b3439aa1 100644 --- a/rllib/agents/a3c/a3c_torch_policy.py +++ b/rllib/agents/a3c/a3c_torch_policy.py @@ -59,8 +59,13 @@ def apply_grad_clipping(policy, optimizer, loss): info = {} if policy.config["grad_clip"]: for param_group in optimizer.param_groups: - info["grad_gnorm"] = nn.utils.clip_grad_norm_( - param_group["params"], policy.config["grad_clip"]) + # Make sure we only pass params with grad != None into torch + # clip_grad_norm_. Would fail otherwise. + params = list( + filter(lambda p: p.grad is not None, param_group["params"])) + if params: + info["grad_gnorm"] = nn.utils.clip_grad_norm_( + params, policy.config["grad_clip"]) return info diff --git a/rllib/agents/ddpg/ddpg_torch_model.py b/rllib/agents/ddpg/ddpg_torch_model.py index aeceb6ef0..94ff781eb 100644 --- a/rllib/agents/ddpg/ddpg_torch_model.py +++ b/rllib/agents/ddpg/ddpg_torch_model.py @@ -45,9 +45,9 @@ class DDPGTorchModel(TorchModelV2, nn.Module): only defines the layers for the output heads. Those layers for forward() should be defined in subclasses of DDPGTorchModel. """ - TorchModelV2.__init__(self, obs_space, action_space, num_outputs, - model_config, name) nn.Module.__init__(self) + super(DDPGTorchModel, self).__init__(obs_space, action_space, + num_outputs, model_config, name) self.bounded = np.logical_and(action_space.bounded_above, action_space.bounded_below).any() @@ -58,7 +58,7 @@ class DDPGTorchModel(TorchModelV2, nn.Module): # Build the policy network. self.policy_model = nn.Sequential() - ins = obs_space.shape[-1] + ins = num_outputs self.obs_ins = ins activation = get_activation_fn( actor_hidden_activation, framework="torch") diff --git a/rllib/contrib/bandits/examples/simple_context_bandit.py b/rllib/contrib/bandits/examples/simple_context_bandit.py index 9ea5357f9..35f4fb01d 100644 --- a/rllib/contrib/bandits/examples/simple_context_bandit.py +++ b/rllib/contrib/bandits/examples/simple_context_bandit.py @@ -1,16 +1,20 @@ """A very simple contextual bandit example with 3 arms.""" import argparse -import random -import numpy as np import gym from gym.spaces import Discrete, Box +import numpy as np +import random from ray import tune +from ray.rllib.utils.test_utils import check_learning_achieved parser = argparse.ArgumentParser() -parser.add_argument("--stop-at-reward", type=float, default=10) parser.add_argument("--run", type=str, default="contrib/LinUCB") +parser.add_argument("--as-test", action="store_true") +parser.add_argument("--stop-iters", type=int, default=200) +parser.add_argument("--stop-timesteps", type=int, default=100000) +parser.add_argument("--stop-reward", type=float, default=10.0) class SimpleContextualBandit(gym.Env): @@ -37,11 +41,18 @@ class SimpleContextualBandit(gym.Env): if __name__ == "__main__": args = parser.parse_args() - tune.run( - args.run, - stop={ - "episode_reward_mean": args.stop_at_reward, - }, - config={ - "env": SimpleContextualBandit, - }) + + stop = { + "training_iteration": args.stop_iters, + "timesteps_total": args.stop_timesteps, + "episode_reward_mean": args.stop_reward, + } + + config = { + "env": SimpleContextualBandit, + } + + results = tune.run(args.run, config=config, stop=stop) + + if args.as_test: + check_learning_achieved(results, args.stop_reward) diff --git a/rllib/examples/autoregressive_action_dist.py b/rllib/examples/autoregressive_action_dist.py index bc91f147f..83c222d44 100644 --- a/rllib/examples/autoregressive_action_dist.py +++ b/rllib/examples/autoregressive_action_dist.py @@ -10,187 +10,56 @@ pattern, and a custom action distribution class that leverages that model. This examples shows both. """ -from gym.spaces import Discrete, Tuple import argparse import ray from ray import tune from ray.rllib.examples.env.correlated_actions_env import CorrelatedActionsEnv +from ray.rllib.examples.models.autoregressive_action_model import \ + AutoregressiveActionModel, TorchAutoregressiveActionModel +from ray.rllib.examples.models.autoregressive_action_dist import \ + BinaryAutoregressiveDistribution, TorchBinaryAutoregressiveDistribution from ray.rllib.models import ModelCatalog -from ray.rllib.models.tf.tf_action_dist import Categorical, ActionDistribution -from ray.rllib.models.tf.misc import normc_initializer -from ray.rllib.models.tf.tf_modelv2 import TFModelV2 -from ray.rllib.utils.framework import try_import_tf - -tf = try_import_tf() +from ray.rllib.utils.test_utils import check_learning_achieved parser = argparse.ArgumentParser() parser.add_argument("--run", type=str, default="PPO") # try PG, PPO, IMPALA -parser.add_argument("--stop", type=int, default=200) +parser.add_argument("--torch", action="store_true") parser.add_argument("--num-cpus", type=int, default=0) - - -class BinaryAutoregressiveOutput(ActionDistribution): - """Action distribution P(a1, a2) = P(a1) * P(a2 | a1)""" - - @staticmethod - def required_model_output_shape(self, model_config): - return 16 # controls model output feature vector size - - def deterministic_sample(self): - # first, sample a1 - a1_dist = self._a1_distribution() - a1 = a1_dist.deterministic_sample() - - # sample a2 conditioned on a1 - a2_dist = self._a2_distribution(a1) - a2 = a2_dist.deterministic_sample() - self._action_logp = a1_dist.logp(a1) + a2_dist.logp(a2) - - # return the action tuple - return (a1, a2) - - def sample(self): - # first, sample a1 - a1_dist = self._a1_distribution() - a1 = a1_dist.sample() - - # sample a2 conditioned on a1 - a2_dist = self._a2_distribution(a1) - a2 = a2_dist.sample() - self._action_logp = a1_dist.logp(a1) + a2_dist.logp(a2) - - # return the action tuple - return (a1, a2) - - def logp(self, actions): - a1, a2 = actions[:, 0], actions[:, 1] - a1_vec = tf.expand_dims(tf.cast(a1, tf.float32), 1) - a1_logits, a2_logits = self.model.action_model([self.inputs, a1_vec]) - return ( - Categorical(a1_logits).logp(a1) + Categorical(a2_logits).logp(a2)) - - def sampled_action_logp(self): - return tf.exp(self._action_logp) - - def entropy(self): - a1_dist = self._a1_distribution() - a2_dist = self._a2_distribution(a1_dist.sample()) - return a1_dist.entropy() + a2_dist.entropy() - - def kl(self, other): - a1_dist = self._a1_distribution() - a1_terms = a1_dist.kl(other._a1_distribution()) - - a1 = a1_dist.sample() - a2_terms = self._a2_distribution(a1).kl(other._a2_distribution(a1)) - return a1_terms + a2_terms - - def _a1_distribution(self): - BATCH = tf.shape(self.inputs)[0] - a1_logits, _ = self.model.action_model( - [self.inputs, tf.zeros((BATCH, 1))]) - a1_dist = Categorical(a1_logits) - return a1_dist - - def _a2_distribution(self, a1): - a1_vec = tf.expand_dims(tf.cast(a1, tf.float32), 1) - _, a2_logits = self.model.action_model([self.inputs, a1_vec]) - a2_dist = Categorical(a2_logits) - return a2_dist - - -class AutoregressiveActionsModel(TFModelV2): - """Implements the `.action_model` branch required above.""" - - def __init__(self, obs_space, action_space, num_outputs, model_config, - name): - super(AutoregressiveActionsModel, self).__init__( - obs_space, action_space, num_outputs, model_config, name) - if action_space != Tuple([Discrete(2), Discrete(2)]): - raise ValueError( - "This model only supports the [2, 2] action space") - - # Inputs - obs_input = tf.keras.layers.Input( - shape=obs_space.shape, name="obs_input") - a1_input = tf.keras.layers.Input(shape=(1, ), name="a1_input") - ctx_input = tf.keras.layers.Input( - shape=(num_outputs, ), name="ctx_input") - - # Output of the model (normally 'logits', but for an autoregressive - # dist this is more like a context/feature layer encoding the obs) - context = tf.keras.layers.Dense( - num_outputs, - name="hidden", - activation=tf.nn.tanh, - kernel_initializer=normc_initializer(1.0))(obs_input) - - # V(s) - value_out = tf.keras.layers.Dense( - 1, - name="value_out", - activation=None, - kernel_initializer=normc_initializer(0.01))(context) - - # P(a1 | obs) - a1_logits = tf.keras.layers.Dense( - 2, - name="a1_logits", - activation=None, - kernel_initializer=normc_initializer(0.01))(ctx_input) - - # P(a2 | a1) - # --note: typically you'd want to implement P(a2 | a1, obs) as follows: - # a2_context = tf.keras.layers.Concatenate(axis=1)( - # [ctx_input, a1_input]) - a2_context = a1_input - a2_hidden = tf.keras.layers.Dense( - 16, - name="a2_hidden", - activation=tf.nn.tanh, - kernel_initializer=normc_initializer(1.0))(a2_context) - a2_logits = tf.keras.layers.Dense( - 2, - name="a2_logits", - activation=None, - kernel_initializer=normc_initializer(0.01))(a2_hidden) - - # Base layers - self.base_model = tf.keras.Model(obs_input, [context, value_out]) - self.register_variables(self.base_model.variables) - self.base_model.summary() - - # Autoregressive action sampler - self.action_model = tf.keras.Model([ctx_input, a1_input], - [a1_logits, a2_logits]) - self.action_model.summary() - self.register_variables(self.action_model.variables) - - def forward(self, input_dict, state, seq_lens): - context, self._value_out = self.base_model(input_dict["obs"]) - return context, state - - def value_function(self): - return tf.reshape(self._value_out, [-1]) - +parser.add_argument("--as-test", action="store_true") +parser.add_argument("--stop-iters", type=int, default=200) +parser.add_argument("--stop-timesteps", type=int, default=100000) +parser.add_argument("--stop-reward", type=float, default=200) if __name__ == "__main__": args = parser.parse_args() ray.init(num_cpus=args.num_cpus or None) - ModelCatalog.register_custom_model("autoregressive_model", - AutoregressiveActionsModel) - ModelCatalog.register_custom_action_dist("binary_autoreg_output", - BinaryAutoregressiveOutput) - tune.run( - args.run, - stop={"episode_reward_mean": args.stop}, - config={ - "env": CorrelatedActionsEnv, - "gamma": 0.5, - "num_gpus": 0, - "model": { - "custom_model": "autoregressive_model", - "custom_action_dist": "binary_autoreg_output", - }, - }) + ModelCatalog.register_custom_model( + "autoregressive_model", TorchAutoregressiveActionModel + if args.torch else AutoregressiveActionModel) + ModelCatalog.register_custom_action_dist( + "binary_autoreg_dist", TorchBinaryAutoregressiveDistribution + if args.torch else BinaryAutoregressiveDistribution) + + config = { + "env": CorrelatedActionsEnv, + "gamma": 0.5, + "num_gpus": 0, + "model": { + "custom_model": "autoregressive_model", + "custom_action_dist": "binary_autoreg_dist", + }, + "use_pytorch": args.torch, + } + + stop = { + "training_iteration": args.stop_iters, + "timesteps_total": args.stop_timesteps, + "episode_reward_mean": args.stop_reward, + } + + results = tune.run(args.run, stop=stop, config=config) + + if args.as_test: + check_learning_achieved(results, args.stop_reward) + ray.shutdown() diff --git a/rllib/examples/batch_norm_model.py b/rllib/examples/batch_norm_model.py index 22a7af8e6..cd15513a6 100644 --- a/rllib/examples/batch_norm_model.py +++ b/rllib/examples/batch_norm_model.py @@ -4,148 +4,28 @@ import argparse import ray from ray import tune +from ray.rllib.examples.models.batch_norm_model import BatchNormModel, \ + TorchBatchNormModel from ray.rllib.models import ModelCatalog -from ray.rllib.models.modelv2 import ModelV2 -from ray.rllib.models.tf.misc import normc_initializer -from ray.rllib.models.tf.tf_modelv2 import TFModelV2 from ray.rllib.utils import try_import_tf -from ray.rllib.utils.annotations import override +from ray.rllib.utils.test_utils import check_learning_achieved tf = try_import_tf() parser = argparse.ArgumentParser() -parser.add_argument("--num-iters", type=int, default=200) parser.add_argument("--run", type=str, default="PPO") - - -class BatchNormModel(TFModelV2): - """Example of a TFModelV2 that is built w/o using tf.keras. - - NOTE: This example does not work when using a keras-based TFModelV2 due - to a bug in keras related to missing values for input placeholders, even - though these input values have been provided in a forward pass through the - actual keras Model. - - All Model logic (layers) is defined in the `forward` method (incl. - the batch_normalization layers). Also, all variables are registered - (only once) at the end of `forward`, so an optimizer knows which tensors - to train on. A standard `value_function` override is used. - """ - capture_index = 0 - - def __init__(self, obs_space, action_space, num_outputs, model_config, - name): - super().__init__(obs_space, action_space, num_outputs, model_config, - name) - # Have we registered our vars yet (see `forward`)? - self._registered = False - - @override(ModelV2) - def forward(self, input_dict, state, seq_lens): - last_layer = input_dict["obs"] - hiddens = [256, 256] - with tf.variable_scope("model", reuse=tf.AUTO_REUSE): - for i, size in enumerate(hiddens): - last_layer = tf.layers.dense( - last_layer, - size, - kernel_initializer=normc_initializer(1.0), - activation=tf.nn.tanh, - name="fc{}".format(i)) - # Add a batch norm layer - last_layer = tf.layers.batch_normalization( - last_layer, - training=input_dict["is_training"], - name="bn_{}".format(i)) - - output = tf.layers.dense( - last_layer, - self.num_outputs, - kernel_initializer=normc_initializer(0.01), - activation=None, - name="out") - self._value_out = tf.layers.dense( - last_layer, - 1, - kernel_initializer=normc_initializer(1.0), - activation=None, - name="vf") - if not self._registered: - self.register_variables( - tf.get_collection( - tf.GraphKeys.TRAINABLE_VARIABLES, scope=".+/model/.+")) - self._registered = True - - return output, [] - - @override(ModelV2) - def value_function(self): - return tf.reshape(self._value_out, [-1]) - - -class KerasBatchNormModel(TFModelV2): - """Keras version of above BatchNormModel with exactly the same structure. - - IMORTANT NOTE: This model will not work with PPO due to a bug in keras - that surfaces when having more than one input placeholder (here: `inputs` - and `is_training`) AND using the `make_tf_callable` helper (e.g. used by - PPO), in which auto-placeholders are generated, then passed through the - tf.keras. models.Model. In this last step, the connection between 1) the - provided value in the auto-placeholder and 2) the keras `is_training` - Input is broken and keras complains. - Use the above `BatchNormModel` (a non-keras based TFModelV2), instead. - """ - - def __init__(self, obs_space, action_space, num_outputs, model_config, - name): - super().__init__(obs_space, action_space, num_outputs, model_config, - name) - inputs = tf.keras.layers.Input(shape=obs_space.shape, name="inputs") - is_training = tf.keras.layers.Input( - shape=(), dtype=tf.bool, batch_size=1, name="is_training") - last_layer = inputs - hiddens = [256, 256] - for i, size in enumerate(hiddens): - label = "fc{}".format(i) - last_layer = tf.keras.layers.Dense( - units=size, - kernel_initializer=normc_initializer(1.0), - activation=tf.nn.tanh, - name=label)(last_layer) - # Add a batch norm layer - last_layer = tf.keras.layers.BatchNormalization()( - last_layer, training=is_training[0]) - output = tf.keras.layers.Dense( - units=self.num_outputs, - kernel_initializer=normc_initializer(0.01), - activation=None, - name="fc_out")(last_layer) - value_out = tf.keras.layers.Dense( - units=1, - kernel_initializer=normc_initializer(0.01), - activation=None, - name="value_out")(last_layer) - - self.base_model = tf.keras.models.Model( - inputs=[inputs, is_training], outputs=[output, value_out]) - self.register_variables(self.base_model.variables) - - @override(ModelV2) - def forward(self, input_dict, state, seq_lens): - out, self._value_out = self.base_model( - [input_dict["obs"], input_dict["is_training"]]) - return out, [] - - @override(ModelV2) - def value_function(self): - return tf.reshape(self._value_out, [-1]) - +parser.add_argument("--as-test", action="store_true") +parser.add_argument("--torch", action="store_true") +parser.add_argument("--stop-iters", type=int, default=200) +parser.add_argument("--stop-timesteps", type=int, default=100000) +parser.add_argument("--stop-reward", type=float, default=150) if __name__ == "__main__": args = parser.parse_args() - ray.init() + ray.init(local_mode=True) - ModelCatalog.register_custom_model("bn_model", BatchNormModel) + ModelCatalog.register_custom_model( + "bn_model", TorchBatchNormModel if args.torch else BatchNormModel) config = { "env": "Pendulum-v0" if args.run == "DDPG" else "CartPole-v0", @@ -153,10 +33,18 @@ if __name__ == "__main__": "custom_model": "bn_model", }, "num_workers": 0, + "use_pytorch": args.torch, } - tune.run( - args.run, - stop={"training_iteration": args.num_iters}, - config=config, - ) + stop = { + "training_iteration": args.stop_iters, + "timesteps_total": args.stop_timesteps, + "episode_reward_mean": args.stop_reward, + } + + results = tune.run(args.run, stop=stop, config=config) + + if args.as_test: + check_learning_achieved(results, args.stop_reward) + + ray.shutdown() diff --git a/rllib/examples/cartpole_lstm.py b/rllib/examples/cartpole_lstm.py index 56ab3178d..48912dae9 100644 --- a/rllib/examples/cartpole_lstm.py +++ b/rllib/examples/cartpole_lstm.py @@ -1,13 +1,17 @@ import argparse from ray.rllib.examples.env.stateless_cartpole import StatelessCartPole +from ray.rllib.utils.test_utils import check_learning_achieved parser = argparse.ArgumentParser() -parser.add_argument("--stop", type=int, default=200) -parser.add_argument("--torch", action="store_true") -parser.add_argument("--use-prev-action-reward", action="store_true") parser.add_argument("--run", type=str, default="PPO") parser.add_argument("--num-cpus", type=int, default=0) +parser.add_argument("--torch", action="store_true") +parser.add_argument("--as-test", action="store_true") +parser.add_argument("--use-prev-action-reward", action="store_true") +parser.add_argument("--stop-iters", type=int, default=200) +parser.add_argument("--stop-timesteps", type=int, default=100000) +parser.add_argument("--stop-reward", type=float, default=150.0) if __name__ == "__main__": import ray @@ -30,16 +34,24 @@ if __name__ == "__main__": }, } - tune.run( - args.run, - stop={"episode_reward_mean": args.stop}, - config=dict( - configs[args.run], **{ - "env": StatelessCartPole, - "model": { - "use_lstm": True, - "lstm_use_prev_action_reward": args.use_prev_action_reward, - }, - "use_pytorch": args.torch, - }), - ) + config = dict( + configs[args.run], **{ + "env": StatelessCartPole, + "model": { + "use_lstm": True, + "lstm_use_prev_action_reward": args.use_prev_action_reward, + }, + "use_pytorch": args.torch, + }) + + stop = { + "training_iteration": args.stop_iters, + "timesteps_total": args.stop_timesteps, + "episode_reward_mean": args.stop_reward, + } + + results = tune.run(args.run, config=config, stop=stop) + + if args.as_test: + check_learning_achieved(results, args.stop_reward) + ray.shutdown() diff --git a/rllib/examples/centralized_critic.py b/rllib/examples/centralized_critic.py index 4446d1586..4ed877be8 100644 --- a/rllib/examples/centralized_critic.py +++ b/rllib/examples/centralized_critic.py @@ -16,77 +16,53 @@ import argparse import numpy as np from gym.spaces import Discrete +import ray from ray import tune from ray.rllib.agents.ppo.ppo import PPOTrainer from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy, KLCoeffMixin, \ - PPOLoss + PPOLoss as TFLoss +from ray.rllib.agents.ppo.ppo_torch_policy import PPOTorchPolicy, \ + KLCoeffMixin as TorchKLCoeffMixin, PPOLoss as TorchLoss from ray.rllib.evaluation.postprocessing import compute_advantages, \ Postprocessing -from ray.rllib.examples.twostep_game import TwoStepGame +from ray.rllib.examples.env.two_step_game import TwoStepGame +from ray.rllib.examples.models.centralized_critic_models import \ + CentralizedCriticModel, TorchCentralizedCriticModel from ray.rllib.models import ModelCatalog from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.policy.tf_policy import LearningRateSchedule, \ EntropyCoeffSchedule -from ray.rllib.models.tf.tf_modelv2 import TFModelV2 -from ray.rllib.models.tf.fcnet_v2 import FullyConnectedNetwork +from ray.rllib.policy.torch_policy import LearningRateSchedule as TorchLR, \ + EntropyCoeffSchedule as TorchEntropyCoeffSchedule from ray.rllib.utils.explained_variance import explained_variance +from ray.rllib.utils.framework import try_import_tf, try_import_torch +from ray.rllib.utils.test_utils import check_learning_achieved from ray.rllib.utils.tf_ops import make_tf_callable -from ray.rllib.utils import try_import_tf +from ray.rllib.utils.torch_ops import convert_to_torch_tensor tf = try_import_tf() +torch, nn = try_import_torch() OPPONENT_OBS = "opponent_obs" OPPONENT_ACTION = "opponent_action" parser = argparse.ArgumentParser() -parser.add_argument("--stop", type=int, default=100000) - - -class CentralizedCriticModel(TFModelV2): - """Multi-agent model that implements a centralized VF.""" - - def __init__(self, obs_space, action_space, num_outputs, model_config, - name): - super(CentralizedCriticModel, self).__init__( - obs_space, action_space, num_outputs, model_config, name) - # Base of the model - self.model = FullyConnectedNetwork(obs_space, action_space, - num_outputs, model_config, name) - self.register_variables(self.model.variables()) - - # Central VF maps (obs, opp_obs, opp_act) -> vf_pred - obs = tf.keras.layers.Input(shape=(6, ), name="obs") - opp_obs = tf.keras.layers.Input(shape=(6, ), name="opp_obs") - opp_act = tf.keras.layers.Input(shape=(2, ), name="opp_act") - concat_obs = tf.keras.layers.Concatenate(axis=1)( - [obs, opp_obs, opp_act]) - central_vf_dense = tf.keras.layers.Dense( - 16, activation=tf.nn.tanh, name="c_vf_dense")(concat_obs) - central_vf_out = tf.keras.layers.Dense( - 1, activation=None, name="c_vf_out")(central_vf_dense) - self.central_vf = tf.keras.Model( - inputs=[obs, opp_obs, opp_act], outputs=central_vf_out) - self.register_variables(self.central_vf.variables) - - def forward(self, input_dict, state, seq_lens): - return self.model.forward(input_dict, state, seq_lens) - - def central_value_function(self, obs, opponent_obs, opponent_actions): - return tf.reshape( - self.central_vf( - [obs, opponent_obs, - tf.one_hot(opponent_actions, 2)]), [-1]) - - def value_function(self): - return self.model.value_function() # not used +parser.add_argument("--torch", action="store_true") +parser.add_argument("--as-test", action="store_true") +parser.add_argument("--stop-iters", type=int, default=100) +parser.add_argument("--stop-timesteps", type=int, default=100000) +parser.add_argument("--stop-reward", type=float, default=7.99) class CentralizedValueMixin: """Add method to evaluate the central value function from the model.""" def __init__(self): - self.compute_central_vf = make_tf_callable(self.get_session())( - self.model.central_value_function) + if not self.config["use_pytorch"]: + self.compute_central_vf = make_tf_callable(self.get_session())( + self.model.central_value_function) + else: + self.compute_central_vf = self.model.central_value_function # Grabs the opponent obs/act and includes it in the experience train_batch, @@ -95,7 +71,9 @@ def centralized_critic_postprocessing(policy, sample_batch, other_agent_batches=None, episode=None): - if policy.loss_initialized(): + pytorch = policy.config["use_pytorch"] + if (pytorch and hasattr(policy, "compute_central_vf")) or \ + (not pytorch and policy.loss_initialized()): assert other_agent_batches is not None [(_, opponent_batch)] = list(other_agent_batches.values()) @@ -104,11 +82,18 @@ def centralized_critic_postprocessing(policy, sample_batch[OPPONENT_ACTION] = opponent_batch[SampleBatch.ACTIONS] # overwrite default VF prediction with the central VF - sample_batch[SampleBatch.VF_PREDS] = policy.compute_central_vf( - sample_batch[SampleBatch.CUR_OBS], sample_batch[OPPONENT_OBS], - sample_batch[OPPONENT_ACTION]) + if args.torch: + sample_batch[SampleBatch.VF_PREDS] = policy.compute_central_vf( + convert_to_torch_tensor(sample_batch[SampleBatch.CUR_OBS]), + convert_to_torch_tensor(sample_batch[OPPONENT_OBS]), + convert_to_torch_tensor(sample_batch[OPPONENT_ACTION])). \ + detach().numpy() + else: + sample_batch[SampleBatch.VF_PREDS] = policy.compute_central_vf( + sample_batch[SampleBatch.CUR_OBS], sample_batch[OPPONENT_OBS], + sample_batch[OPPONENT_ACTION]) else: - # policy hasn't initialized yet, use zeros + # Policy hasn't been initialized yet, use zeros. sample_batch[OPPONENT_OBS] = np.zeros_like( sample_batch[SampleBatch.CUR_OBS]) sample_batch[OPPONENT_ACTION] = np.zeros_like( @@ -141,7 +126,13 @@ def loss_with_central_critic(policy, model, dist_class, train_batch): train_batch[SampleBatch.CUR_OBS], train_batch[OPPONENT_OBS], train_batch[OPPONENT_ACTION]) - policy.loss_obj = PPOLoss( + func = TFLoss if not policy.config["use_pytorch"] else TorchLoss + adv = tf.ones_like(train_batch[Postprocessing.ADVANTAGES], dtype=tf.bool) \ + if not policy.config["use_pytorch"] else \ + torch.ones_like(train_batch[Postprocessing.ADVANTAGES], + dtype=torch.bool) + + policy.loss_obj = func( dist_class, model, train_batch[Postprocessing.VALUE_TARGETS], @@ -153,7 +144,7 @@ def loss_with_central_critic(policy, model, dist_class, train_batch): action_dist, policy.central_value_out, policy.kl_coeff, - tf.ones_like(train_batch[Postprocessing.ADVANTAGES], dtype=tf.bool), + adv, entropy_coeff=policy.entropy_coeff, clip_param=policy.config["clip_param"], vf_clip_param=policy.config["vf_clip_param"], @@ -180,8 +171,8 @@ def central_vf_stats(policy, train_batch, grads): } -CCPPO = PPOTFPolicy.with_updates( - name="CCPPO", +CCPPOTFPolicy = PPOTFPolicy.with_updates( + name="CCPPOTFPolicy", postprocess_fn=centralized_critic_postprocessing, loss_fn=loss_with_central_critic, before_loss_init=setup_mixins, @@ -191,31 +182,64 @@ CCPPO = PPOTFPolicy.with_updates( CentralizedValueMixin ]) +CCPPOTorchPolicy = PPOTorchPolicy.with_updates( + name="CCPPOTorchPolicy", + postprocess_fn=centralized_critic_postprocessing, + loss_fn=loss_with_central_critic, + before_init=setup_mixins, + mixins=[ + TorchLR, TorchEntropyCoeffSchedule, TorchKLCoeffMixin, + CentralizedValueMixin + ]) + + +def get_policy_class(config): + return CCPPOTorchPolicy if config["use_pytorch"] else CCPPOTFPolicy + + CCTrainer = PPOTrainer.with_updates( - name="CCPPOTrainer", default_policy=CCPPO, get_policy_class=None) + name="CCPPOTrainer", + default_policy=CCPPOTFPolicy, + get_policy_class=get_policy_class, +) if __name__ == "__main__": + ray.init(local_mode=True) args = parser.parse_args() - ModelCatalog.register_custom_model("cc_model", CentralizedCriticModel) - tune.run( - CCTrainer, - stop={ - "timesteps_total": args.stop, - "episode_reward_mean": 7.99, + + ModelCatalog.register_custom_model( + "cc_model", TorchCentralizedCriticModel + if args.torch else CentralizedCriticModel) + + config = { + "env": TwoStepGame, + "batch_mode": "complete_episodes", + "eager": False, + "num_workers": 0, + "multiagent": { + "policies": { + "pol1": (None, Discrete(6), TwoStepGame.action_space, { + "use_pytorch": args.torch + }), + "pol2": (None, Discrete(6), TwoStepGame.action_space, { + "use_pytorch": args.torch + }), + }, + "policy_mapping_fn": lambda x: "pol1" if x == 0 else "pol2", }, - config={ - "env": TwoStepGame, - "batch_mode": "complete_episodes", - "eager": False, - "num_workers": 0, - "multiagent": { - "policies": { - "pol1": (None, Discrete(6), TwoStepGame.action_space, {}), - "pol2": (None, Discrete(6), TwoStepGame.action_space, {}), - }, - "policy_mapping_fn": lambda x: "pol1" if x == 0 else "pol2", - }, - "model": { - "custom_model": "cc_model", - }, - }) + "model": { + "custom_model": "cc_model", + }, + "use_pytorch": args.torch, + } + + stop = { + "training_iteration": args.stop_iters, + "timesteps_total": args.stop_timesteps, + "episode_reward_mean": args.stop_reward, + } + + results = tune.run(CCTrainer, config=config, stop=stop) + + if args.as_test: + check_learning_achieved(results, args.stop_reward) diff --git a/rllib/examples/centralized_critic_2.py b/rllib/examples/centralized_critic_2.py index 89153d261..db4691eaf 100644 --- a/rllib/examples/centralized_critic_2.py +++ b/rllib/examples/centralized_critic_2.py @@ -10,64 +10,24 @@ modifies the policy to add a centralized value function. """ import numpy as np -from gym.spaces import Box, Dict, Discrete +from gym.spaces import Dict, Discrete import argparse from ray import tune from ray.rllib.agents.callbacks import DefaultCallbacks -from ray.rllib.examples.twostep_game import TwoStepGame +from ray.rllib.examples.models.centralized_critic_models import \ + YetAnotherCentralizedCriticModel, YetAnotherTorchCentralizedCriticModel +from ray.rllib.examples.env.two_step_game import TwoStepGame from ray.rllib.models import ModelCatalog -from ray.rllib.models.tf.tf_modelv2 import TFModelV2 -from ray.rllib.models.tf.fcnet_v2 import FullyConnectedNetwork from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.utils import try_import_tf - -tf = try_import_tf() +from ray.rllib.utils.test_utils import check_learning_achieved parser = argparse.ArgumentParser() -parser.add_argument("--stop", type=int, default=100000) - - -class CentralizedCriticModel(TFModelV2): - """Multi-agent model that implements a centralized VF. - - It assumes the observation is a dict with 'own_obs' and 'opponent_obs', the - former of which can be used for computing actions (i.e., decentralized - execution), and the latter for optimization (i.e., centralized learning). - - This model has two parts: - - An action model that looks at just 'own_obs' to compute actions - - A value model that also looks at the 'opponent_obs' / 'opponent_action' - to compute the value (it does this by using the 'obs_flat' tensor). - """ - - def __init__(self, obs_space, action_space, num_outputs, model_config, - name): - super(CentralizedCriticModel, self).__init__( - obs_space, action_space, num_outputs, model_config, name) - - self.action_model = FullyConnectedNetwork( - Box(low=0, high=1, shape=(6, )), # one-hot encoded Discrete(6) - action_space, - num_outputs, - model_config, - name + "_action") - self.register_variables(self.action_model.variables()) - - self.value_model = FullyConnectedNetwork(obs_space, action_space, 1, - model_config, name + "_vf") - self.register_variables(self.value_model.variables()) - - def forward(self, input_dict, state, seq_lens): - self._value_out, _ = self.value_model({ - "obs": input_dict["obs_flat"] - }, state, seq_lens) - return self.action_model({ - "obs": input_dict["obs"]["own_obs"] - }, state, seq_lens) - - def value_function(self): - return tf.reshape(self._value_out, [-1]) +parser.add_argument("--torch", action="store_true") +parser.add_argument("--as-test", action="store_true") +parser.add_argument("--stop-iters", type=int, default=100) +parser.add_argument("--stop-timesteps", type=int, default=100000) +parser.add_argument("--stop-reward", type=float, default=7.99) class FillInActions(DefaultCallbacks): @@ -109,7 +69,11 @@ def central_critic_observer(agent_obs, **kw): if __name__ == "__main__": args = parser.parse_args() - ModelCatalog.register_custom_model("cc_model", CentralizedCriticModel) + + ModelCatalog.register_custom_model( + "cc_model", YetAnotherTorchCentralizedCriticModel + if args.torch else YetAnotherCentralizedCriticModel) + action_space = Discrete(2) observer_space = Dict({ "own_obs": Discrete(6), @@ -118,26 +82,33 @@ if __name__ == "__main__": "opponent_obs": Discrete(6), "opponent_action": Discrete(2), }) - tune.run( - "PPO", - stop={ - "timesteps_total": args.stop, - "episode_reward_mean": 7.99, + + config = { + "env": TwoStepGame, + "batch_mode": "complete_episodes", + "callbacks": FillInActions, + "num_workers": 0, + "multiagent": { + "policies": { + "pol1": (None, observer_space, action_space, {}), + "pol2": (None, observer_space, action_space, {}), + }, + "policy_mapping_fn": lambda x: "pol1" if x == 0 else "pol2", + "observation_fn": central_critic_observer, }, - config={ - "env": TwoStepGame, - "batch_mode": "complete_episodes", - "callbacks": FillInActions, - "num_workers": 0, - "multiagent": { - "policies": { - "pol1": (None, observer_space, action_space, {}), - "pol2": (None, observer_space, action_space, {}), - }, - "policy_mapping_fn": lambda x: "pol1" if x == 0 else "pol2", - "observation_fn": central_critic_observer, - }, - "model": { - "custom_model": "cc_model", - }, - }) + "model": { + "custom_model": "cc_model", + }, + "use_pytorch": args.torch, + } + + stop = { + "training_iteration": args.stop_iters, + "timesteps_total": args.stop_timesteps, + "episode_reward_mean": args.stop_reward, + } + + results = tune.run("PPO", config=config, stop=stop) + + if args.as_test: + check_learning_achieved(results, args.stop_reward) diff --git a/rllib/examples/custom_env.py b/rllib/examples/custom_env.py index 51de41c53..148e174b5 100644 --- a/rllib/examples/custom_env.py +++ b/rllib/examples/custom_env.py @@ -7,20 +7,32 @@ This example shows: You can visualize experiment results in ~/ray_results using TensorBoard. """ - +import argparse import numpy as np import gym -from ray.rllib.models import ModelCatalog -from ray.rllib.models.tf.tf_modelv2 import TFModelV2 -from ray.rllib.models.tf.fcnet_v2 import FullyConnectedNetwork from gym.spaces import Discrete, Box import ray from ray import tune -from ray.rllib.utils import try_import_tf from ray.tune import grid_search +from ray.rllib.models import ModelCatalog +from ray.rllib.models.tf.tf_modelv2 import TFModelV2 +from ray.rllib.models.tf.fcnet_v2 import FullyConnectedNetwork +from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 +from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC +from ray.rllib.utils.framework import try_import_tf, try_import_torch +from ray.rllib.utils.test_utils import check_learning_achieved tf = try_import_tf() +torch, nn = try_import_torch() + +parser = argparse.ArgumentParser() +parser.add_argument("--run", type=str, default="PPO") +parser.add_argument("--torch", action="store_true") +parser.add_argument("--as-test", action="store_true") +parser.add_argument("--stop-iters", type=int, default=50) +parser.add_argument("--stop-timesteps", type=int, default=100000) +parser.add_argument("--stop-reward", type=float, default=0.1) class SimpleCorridor(gym.Env): @@ -46,11 +58,11 @@ class SimpleCorridor(gym.Env): elif action == 1: self.cur_pos += 1 done = self.cur_pos >= self.end_pos - return [self.cur_pos], 1 if done else 0, done, {} + return [self.cur_pos], 1.0 if done else -0.1, done, {} class CustomModel(TFModelV2): - """Example of a custom model that just delegates to a fc-net.""" + """Example of a keras custom model that just delegates to an fc-net.""" def __init__(self, obs_space, action_space, num_outputs, model_config, name): @@ -67,26 +79,58 @@ class CustomModel(TFModelV2): return self.model.value_function() +class TorchCustomModel(TorchModelV2, nn.Module): + """Example of a PyTorch custom model that just delegates to a fc-net.""" + + def __init__(self, obs_space, action_space, num_outputs, model_config, + name): + TorchModelV2.__init__(self, obs_space, action_space, num_outputs, + model_config, name) + nn.Module.__init__(self) + + self.torch_sub_model = TorchFC(obs_space, action_space, num_outputs, + model_config, name) + + def forward(self, input_dict, state, seq_lens): + input_dict["obs"] = input_dict["obs"].float() + fc_out, _ = self.torch_sub_model(input_dict, state, seq_lens) + return fc_out, [] + + def value_function(self): + return torch.reshape(self.torch_sub_model.value_function(), [-1]) + + if __name__ == "__main__": + args = parser.parse_args() + ray.init() + # Can also register the env creator function explicitly with: # register_env("corridor", lambda config: SimpleCorridor(config)) - ray.init() - ModelCatalog.register_custom_model("my_model", CustomModel) - tune.run( - "PPO", - stop={ - "timesteps_total": 10000, + ModelCatalog.register_custom_model( + "my_model", TorchCustomModel if args.torch else CustomModel) + + config = { + "env": SimpleCorridor, # or "corridor" if registered above + "env_config": { + "corridor_length": 5, }, - config={ - "env": SimpleCorridor, # or "corridor" if registered above - "model": { - "custom_model": "my_model", - }, - "vf_share_layers": True, - "lr": grid_search([1e-2, 1e-4, 1e-6]), # try different lrs - "num_workers": 1, # parallelism - "env_config": { - "corridor_length": 5, - }, + "model": { + "custom_model": "my_model", }, - ) + "vf_share_layers": True, + "lr": grid_search([1e-2, 1e-4, 1e-6]), # try different lrs + "num_workers": 1, # parallelism + "use_pytorch": args.torch + } + + stop = { + "training_iteration": args.stop_iters, + "timesteps_total": args.stop_timesteps, + "episode_reward_mean": args.stop_reward, + } + + results = tune.run(args.run, config=config, stop=stop) + + if args.as_test: + check_learning_achieved(results, args.stop_reward) + ray.shutdown() diff --git a/rllib/examples/custom_eval.py b/rllib/examples/custom_eval.py index d95bb09b9..2d794c489 100644 --- a/rllib/examples/custom_eval.py +++ b/rllib/examples/custom_eval.py @@ -74,9 +74,9 @@ from ray.rllib.evaluation.metrics import collect_episodes, summarize_episodes from ray.rllib.examples.env.simple_corridor import SimpleCorridor parser = argparse.ArgumentParser() -parser.add_argument("--custom-eval", action="store_true") parser.add_argument("--num-cpus", type=int, default=0) parser.add_argument("--torch", action="store_true") +parser.add_argument("--no-custom-eval", action="store_true") def custom_eval_function(trainer, eval_workers): @@ -124,48 +124,51 @@ def custom_eval_function(trainer, eval_workers): if __name__ == "__main__": args = parser.parse_args() - if args.custom_eval: - eval_fn = custom_eval_function - else: + if args.no_custom_eval: eval_fn = None + else: + eval_fn = custom_eval_function ray.init(num_cpus=args.num_cpus or None) - tune.run( - "PG", - stop={ - "training_iteration": 10, + config = { + "env": SimpleCorridor, + "env_config": { + "corridor_length": 10, }, - config={ - "env": SimpleCorridor, + "horizon": 20, + "log_level": "INFO", + + # Training rollouts will be collected using just the learner + # process, but evaluation will be done in parallel with two + # workers. Hence, this run will use 3 CPUs total (1 for the + # learner + 2 more for evaluation workers). + "num_workers": 0, + "evaluation_num_workers": 2, + + # Optional custom eval function. + "custom_eval_function": eval_fn, + + # Enable evaluation, once per training iteration. + "evaluation_interval": 1, + + # Run 10 episodes each time evaluation runs. + "evaluation_num_episodes": 10, + + # Override the env config for evaluation. + "evaluation_config": { "env_config": { - "corridor_length": 10, + # Evaluate using LONGER corridor than trained on. + "corridor_length": 5, }, - "horizon": 20, - "log_level": "INFO", + }, + "use_pytorch": args.torch, + } - # Training rollouts will be collected using just the learner - # process, but evaluation will be done in parallel with two - # workers. Hence, this run will use 3 CPUs total (1 for the - # learner + 2 more for evaluation workers). - "num_workers": 0, - "evaluation_num_workers": 2, + stop = { + "training_iteration": 10, + } - # Optional custom eval function. - "custom_eval_function": eval_fn, + tune.run("PG", config=config, stop=stop) - # Enable evaluation, once per training iteration. - "evaluation_interval": 1, - - # Run 10 episodes each time evaluation runs. - "evaluation_num_episodes": 10, - - # Override the env config for evaluation. - "evaluation_config": { - "env_config": { - # Evaluate using LONGER corridor than trained on. - "corridor_length": 5, - }, - }, - "use_pytorch": args.torch, - }) + ray.shutdown() diff --git a/rllib/examples/custom_fast_model.py b/rllib/examples/custom_fast_model.py index 6c628488c..04e4e2cb9 100644 --- a/rllib/examples/custom_fast_model.py +++ b/rllib/examples/custom_fast_model.py @@ -4,48 +4,52 @@ Both the model and env are trivial (and super-fast), so they are useful for running perf microbenchmarks. """ +import argparse + import ray +import ray.tune as tune +from ray.tune import sample_from from ray.rllib.examples.env.fast_image_env import FastImageEnv -from ray.rllib.models import Model, ModelCatalog -from ray.tune import run_experiments, sample_from -from ray.rllib.utils import try_import_tf - -tf = try_import_tf() - - -class FastModel(Model): - def _build_layers_v2(self, input_dict, num_outputs, options): - bias = tf.get_variable( - dtype=tf.float32, - name="bias", - initializer=tf.zeros_initializer, - shape=()) - output = bias + tf.zeros([tf.shape(input_dict["obs"])[0], num_outputs]) - return output, output +from ray.rllib.examples.models.fast_model import FastModel, TorchFastModel +from ray.rllib.models import ModelCatalog +parser = argparse.ArgumentParser() +parser.add_argument("--num-cpus", type=int, default=2) +parser.add_argument("--torch", action="store_true") +parser.add_argument("--stop-iters", type=int, default=200) +parser.add_argument("--stop-timesteps", type=int, default=100000) if __name__ == "__main__": - ray.init() - ModelCatalog.register_custom_model("fast_model", FastModel) - run_experiments({ - "demo": { - "run": "IMPALA", - "env": FastImageEnv, - "config": { - "compress_observations": True, - "model": { - "custom_model": "fast_model" - }, - "num_gpus": 0, - "num_workers": 2, - "num_envs_per_worker": 10, - "num_data_loader_buffers": 1, - "num_aggregation_workers": 1, - "broadcast_interval": 50, - "rollout_fragment_length": 100, - "train_batch_size": sample_from( - lambda spec: 1000 * max(1, spec.config.num_gpus)), - "fake_sampler": True, - }, + args = parser.parse_args() + ray.init(num_cpus=args.num_cpus or None) + + ModelCatalog.register_custom_model( + "fast_model", TorchFastModel if args.torch else FastModel) + + config = { + "env": FastImageEnv, + "compress_observations": True, + "model": { + "custom_model": "fast_model" }, - }) + "num_gpus": 0, + "num_workers": 2, + "num_envs_per_worker": 10, + "num_data_loader_buffers": 1, + "num_aggregation_workers": 1, + "broadcast_interval": 50, + "rollout_fragment_length": 100, + "train_batch_size": sample_from( + lambda spec: 1000 * max(1, spec.config.num_gpus)), + "fake_sampler": True, + "use_pytorch": args.torch, + } + + stop = { + "training_iteration": args.stop_iters, + "timesteps_total": args.stop_timesteps, + } + + tune.run("IMPALA", config=config, stop=stop) + + ray.shutdown() diff --git a/rllib/examples/custom_keras_cnn_plus_rnn_model.py b/rllib/examples/custom_keras_cnn_plus_rnn_model.py deleted file mode 100644 index 35538c2a3..000000000 --- a/rllib/examples/custom_keras_cnn_plus_rnn_model.py +++ /dev/null @@ -1,115 +0,0 @@ -# Explains/tests Issues: -# https://github.com/ray-project/ray/issues/6928 -# https://github.com/ray-project/ray/issues/6732 - -from gym.spaces import Discrete, Box -import numpy as np - -from ray.rllib.agents.ppo import PPOTrainer -from ray.rllib.examples.random_env import RandomEnv -from ray.rllib.models import ModelCatalog -from ray.rllib.models.modelv2 import ModelV2 -from ray.rllib.models.tf.recurrent_tf_modelv2 import RecurrentTFModelV2 -from ray.rllib.utils import try_import_tf -from ray.rllib.utils.annotations import override - -tf = try_import_tf() - -cnn_shape = (4, 4, 3) - - -class CustomModel(RecurrentTFModelV2): - def __init__(self, obs_space, action_space, num_outputs, model_config, - name): - super(CustomModel, self).__init__(obs_space, action_space, num_outputs, - model_config, name) - - self.cell_size = 16 - visual_size = cnn_shape[0] * cnn_shape[1] * cnn_shape[2] - - state_in_h = tf.keras.layers.Input(shape=(self.cell_size, ), name="h") - state_in_c = tf.keras.layers.Input(shape=(self.cell_size, ), name="c") - seq_in = tf.keras.layers.Input(shape=(), name="seq_in", dtype=tf.int32) - - inputs = tf.keras.layers.Input( - shape=(None, visual_size), name="visual_inputs") - - input_visual = inputs - input_visual = tf.reshape( - input_visual, [-1, cnn_shape[0], cnn_shape[1], cnn_shape[2]]) - cnn_input = tf.keras.layers.Input(shape=cnn_shape, name="cnn_input") - - cnn_model = tf.keras.applications.mobilenet_v2.MobileNetV2( - alpha=1.0, - include_top=True, - weights=None, - input_tensor=cnn_input, - pooling=None) - vision_out = cnn_model(input_visual) - vision_out = tf.reshape( - vision_out, - [-1, tf.shape(inputs)[1], - vision_out.shape.as_list()[-1]]) - - lstm_out, state_h, state_c = tf.keras.layers.LSTM( - self.cell_size, - return_sequences=True, - return_state=True, - name="lstm")( - inputs=vision_out, - mask=tf.sequence_mask(seq_in), - initial_state=[state_in_h, state_in_c]) - - # Postprocess LSTM output with another hidden layer and compute values. - logits = tf.keras.layers.Dense( - self.num_outputs, - activation=tf.keras.activations.linear, - name="logits")(lstm_out) - values = tf.keras.layers.Dense( - 1, activation=None, name="values")(lstm_out) - - # Create the RNN model - self.rnn_model = tf.keras.Model( - inputs=[inputs, seq_in, state_in_h, state_in_c], - outputs=[logits, values, state_h, state_c]) - self.register_variables(self.rnn_model.variables) - self.rnn_model.summary() - - @override(RecurrentTFModelV2) - def forward_rnn(self, inputs, state, seq_lens): - model_out, self._value_out, h, c = self.rnn_model([inputs, seq_lens] + - state) - return model_out, [h, c] - - @override(ModelV2) - def get_initial_state(self): - return [ - np.zeros(self.cell_size, np.float32), - np.zeros(self.cell_size, np.float32), - ] - - @override(ModelV2) - def value_function(self): - return tf.reshape(self._value_out, [-1]) - - -if __name__ == "__main__": - ModelCatalog.register_custom_model("my_model", CustomModel) - trainer = PPOTrainer( - env=RandomEnv, - config={ - # "eager": True, # <- should work for both eager or not - "model": { - "custom_model": "my_model", - "max_seq_len": 20, - }, - "vf_share_layers": True, - "num_workers": 0, # no parallelism - "env_config": { - "action_space": Discrete(2), - # Test a simple Tuple observation space. - "observation_space": Box( - 0.0, 1.0, shape=cnn_shape, dtype=np.float32) - } - }) - trainer.train() diff --git a/rllib/examples/custom_keras_model.py b/rllib/examples/custom_keras_model.py index f2f5dc614..57a30977d 100644 --- a/rllib/examples/custom_keras_model.py +++ b/rllib/examples/custom_keras_model.py @@ -17,7 +17,7 @@ tf = try_import_tf() parser = argparse.ArgumentParser() parser.add_argument("--run", type=str, default="DQN") # Try PG, PPO, DQN parser.add_argument("--stop", type=int, default=200) -parser.add_argument("--use_vision_network", action="store_true") +parser.add_argument("--use-vision-network", action="store_true") parser.add_argument("--num-cpus", type=int, default=0) diff --git a/rllib/examples/custom_keras_rnn_model.py b/rllib/examples/custom_keras_rnn_model.py deleted file mode 100644 index 311215ef3..000000000 --- a/rllib/examples/custom_keras_rnn_model.py +++ /dev/null @@ -1,117 +0,0 @@ -"""Example of using a custom RNN keras model.""" - -import argparse -import numpy as np - -import ray -from ray import tune -from ray.tune.registry import register_env -from ray.rllib.examples.env.repeat_after_me_env import RepeatAfterMeEnv -from ray.rllib.examples.env.repeat_initial_obs_env import RepeatInitialObsEnv -from ray.rllib.models import ModelCatalog -from ray.rllib.models.modelv2 import ModelV2 -from ray.rllib.models.tf.recurrent_tf_modelv2 import RecurrentTFModelV2 -from ray.rllib.utils.annotations import override -from ray.rllib.utils import try_import_tf - -tf = try_import_tf() - -parser = argparse.ArgumentParser() -parser.add_argument("--run", type=str, default="PPO") -parser.add_argument("--env", type=str, default="RepeatAfterMeEnv") -parser.add_argument("--stop", type=int, default=90) -parser.add_argument("--num-cpus", type=int, default=0) - - -class MyKerasRNN(RecurrentTFModelV2): - """Example of using the Keras functional API to define a RNN model.""" - - def __init__(self, - obs_space, - action_space, - num_outputs, - model_config, - name, - hiddens_size=256, - cell_size=64): - super(MyKerasRNN, self).__init__(obs_space, action_space, num_outputs, - model_config, name) - self.cell_size = cell_size - - # Define input layers - input_layer = tf.keras.layers.Input( - shape=(None, obs_space.shape[0]), name="inputs") - state_in_h = tf.keras.layers.Input(shape=(cell_size, ), name="h") - state_in_c = tf.keras.layers.Input(shape=(cell_size, ), name="c") - seq_in = tf.keras.layers.Input(shape=(), name="seq_in", dtype=tf.int32) - - # Preprocess observation with a hidden layer and send to LSTM cell - dense1 = tf.keras.layers.Dense( - hiddens_size, activation=tf.nn.relu, name="dense1")(input_layer) - lstm_out, state_h, state_c = tf.keras.layers.LSTM( - cell_size, return_sequences=True, return_state=True, name="lstm")( - inputs=dense1, - mask=tf.sequence_mask(seq_in), - initial_state=[state_in_h, state_in_c]) - - # Postprocess LSTM output with another hidden layer and compute values - logits = tf.keras.layers.Dense( - self.num_outputs, - activation=tf.keras.activations.linear, - name="logits")(lstm_out) - values = tf.keras.layers.Dense( - 1, activation=None, name="values")(lstm_out) - - # Create the RNN model - self.rnn_model = tf.keras.Model( - inputs=[input_layer, seq_in, state_in_h, state_in_c], - outputs=[logits, values, state_h, state_c]) - self.register_variables(self.rnn_model.variables) - self.rnn_model.summary() - - @override(RecurrentTFModelV2) - def forward_rnn(self, inputs, state, seq_lens): - model_out, self._value_out, h, c = self.rnn_model([inputs, seq_lens] + - state) - return model_out, [h, c] - - @override(ModelV2) - def get_initial_state(self): - return [ - np.zeros(self.cell_size, np.float32), - np.zeros(self.cell_size, np.float32), - ] - - @override(ModelV2) - def value_function(self): - return tf.reshape(self._value_out, [-1]) - - -if __name__ == "__main__": - args = parser.parse_args() - ray.init(num_cpus=args.num_cpus or None) - ModelCatalog.register_custom_model("rnn", MyKerasRNN) - register_env("RepeatAfterMeEnv", lambda c: RepeatAfterMeEnv(c)) - register_env("RepeatInitialObsEnv", lambda _: RepeatInitialObsEnv()) - - config = { - "env": args.env, - "env_config": { - "repeat_delay": 2, - }, - "gamma": 0.9, - "num_workers": 0, - "num_envs_per_worker": 20, - "entropy_coeff": 0.001, - "num_sgd_iter": 5, - "vf_loss_coeff": 1e-5, - "model": { - "custom_model": "rnn", - "max_seq_len": 20, - }, - } - tune.run( - args.run, - config=config, - stop={"episode_reward_mean": args.stop}, - ) diff --git a/rllib/examples/custom_loss.py b/rllib/examples/custom_loss.py index 030b2a97b..4283075b2 100644 --- a/rllib/examples/custom_loss.py +++ b/rllib/examples/custom_loss.py @@ -16,17 +16,16 @@ import os import ray from ray import tune -from ray.rllib.models import Model, ModelCatalog -from ray.rllib.models.tf.tf_action_dist import Categorical -from ray.rllib.models.tf.fcnet_v1 import FullyConnectedNetwork -from ray.rllib.models.model import restore_original_dimensions -from ray.rllib.offline import JsonReader +from ray.rllib.examples.models.custom_loss_model import CustomLossModel, \ + TorchCustomLossModel +from ray.rllib.models import ModelCatalog from ray.rllib.utils import try_import_tf tf = try_import_tf() parser = argparse.ArgumentParser() -parser.add_argument("--iters", type=int, default=200) +parser.add_argument("--torch", action="store_true") +parser.add_argument("--stop-iters", type=int, default=200) parser.add_argument( "--input-files", type=str, @@ -34,50 +33,6 @@ parser.add_argument( os.path.dirname(os.path.abspath(__file__)), "../tests/data/cartpole_small")) - -class CustomLossModel(Model): - """Custom model that adds an imitation loss on top of the policy loss.""" - - def _build_layers_v2(self, input_dict, num_outputs, options): - self.obs_in = input_dict["obs"] - with tf.variable_scope("shared", reuse=tf.AUTO_REUSE): - self.fcnet = FullyConnectedNetwork(input_dict, self.obs_space, - self.action_space, num_outputs, - options) - return self.fcnet.outputs, self.fcnet.last_layer - - def custom_loss(self, policy_loss, loss_inputs): - # create a new input reader per worker - reader = JsonReader(self.options["custom_options"]["input_files"]) - input_ops = reader.tf_input_ops() - - # define a secondary loss by building a graph copy with weight sharing - obs = tf.cast(input_ops["obs"], tf.float32) - logits, _ = self._build_layers_v2({ - "obs": restore_original_dimensions(obs, self.obs_space) - }, self.num_outputs, self.options) - - # You can also add self-supervised losses easily by referencing tensors - # created during _build_layers_v2(). For example, an autoencoder-style - # loss can be added as follows: - # ae_loss = squared_diff( - # loss_inputs["obs"], Decoder(self.fcnet.last_layer)) - print("FYI: You can also use these tensors: {}, ".format(loss_inputs)) - - # compute the IL loss - action_dist = Categorical(logits, self.options) - self.policy_loss = policy_loss - self.imitation_loss = tf.reduce_mean( - -action_dist.logp(input_ops["actions"])) - return policy_loss + 10 * self.imitation_loss - - def custom_stats(self): - return { - "policy_loss": self.policy_loss, - "imitation_loss": self.imitation_loss, - } - - if __name__ == "__main__": ray.init() args = parser.parse_args() @@ -90,20 +45,23 @@ if __name__ == "__main__": input_dir = rllib_dir.absolute().joinpath(args.input_files) args.input_files = str(input_dir) - ModelCatalog.register_custom_model("custom_loss", CustomLossModel) - tune.run( - "PG", - stop={ - "training_iteration": args.iters, - }, - config={ - "env": "CartPole-v0", - "num_workers": 0, - "model": { - "custom_model": "custom_loss", - "custom_options": { - "input_files": args.input_files, - }, + ModelCatalog.register_custom_model( + "custom_loss", TorchCustomLossModel if args.torch else CustomLossModel) + + config = { + "env": "CartPole-v0", + "num_workers": 0, + "model": { + "custom_model": "custom_loss", + "custom_options": { + "input_files": args.input_files, }, }, - ) + "use_pytorch": args.torch, + } + + stop = { + "training_iteration": args.stop_iters, + } + + tune.run("PG", config=config, stop=stop) diff --git a/rllib/examples/custom_metrics_and_callbacks.py b/rllib/examples/custom_metrics_and_callbacks.py index 6a26f1d50..16e5f1f49 100644 --- a/rllib/examples/custom_metrics_and_callbacks.py +++ b/rllib/examples/custom_metrics_and_callbacks.py @@ -64,14 +64,14 @@ class MyCallbacks(DefaultCallbacks): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--num-iters", type=int, default=2000) + parser.add_argument("--stop-iters", type=int, default=2000) args = parser.parse_args() ray.init() trials = tune.run( "PG", stop={ - "training_iteration": args.num_iters, + "training_iteration": args.stop_iters, }, config={ "env": "CartPole-v0", diff --git a/rllib/examples/custom_metrics_and_callbacks_legacy.py b/rllib/examples/custom_metrics_and_callbacks_legacy.py index 4356b98f1..1473ebfe6 100644 --- a/rllib/examples/custom_metrics_and_callbacks_legacy.py +++ b/rllib/examples/custom_metrics_and_callbacks_legacy.py @@ -53,14 +53,14 @@ def on_postprocess_traj(info): if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument("--num-iters", type=int, default=2000) + parser.add_argument("--stop-iters", type=int, default=2000) args = parser.parse_args() ray.init() trials = tune.run( "PG", stop={ - "training_iteration": args.num_iters, + "training_iteration": args.stop_iters, }, config={ "env": "CartPole-v0", diff --git a/rllib/examples/custom_rnn_model.py b/rllib/examples/custom_rnn_model.py new file mode 100644 index 000000000..432010dae --- /dev/null +++ b/rllib/examples/custom_rnn_model.py @@ -0,0 +1,62 @@ +"""Example of using a custom RNN keras model.""" + +import argparse + +import ray +from ray import tune +from ray.tune.registry import register_env +from ray.rllib.examples.env.repeat_after_me_env import RepeatAfterMeEnv +from ray.rllib.examples.env.repeat_initial_obs_env import RepeatInitialObsEnv +from ray.rllib.examples.models.rnn_model import RNNModel, TorchRNNModel +from ray.rllib.models import ModelCatalog +from ray.rllib.utils.test_utils import check_learning_achieved + +parser = argparse.ArgumentParser() +parser.add_argument("--run", type=str, default="PPO") +parser.add_argument("--env", type=str, default="RepeatAfterMeEnv") +parser.add_argument("--num-cpus", type=int, default=0) +parser.add_argument("--as-test", action="store_true") +parser.add_argument("--torch", action="store_true") +parser.add_argument("--stop-reward", type=float, default=90) +parser.add_argument("--stop-iters", type=int, default=100) +parser.add_argument("--stop-timesteps", type=int, default=100000) + +if __name__ == "__main__": + args = parser.parse_args() + + ray.init(num_cpus=args.num_cpus or None) + + ModelCatalog.register_custom_model( + "rnn", TorchRNNModel if args.torch else RNNModel) + register_env("RepeatAfterMeEnv", lambda c: RepeatAfterMeEnv(c)) + register_env("RepeatInitialObsEnv", lambda _: RepeatInitialObsEnv()) + + config = { + "env": args.env, + "env_config": { + "repeat_delay": 2, + }, + "gamma": 0.9, + "num_workers": 0, + "num_envs_per_worker": 20, + "entropy_coeff": 0.001, + "num_sgd_iter": 5, + "vf_loss_coeff": 1e-5, + "model": { + "custom_model": "rnn", + "max_seq_len": 20, + }, + "use_pytorch": args.torch, + } + + stop = { + "training_iteration": args.stop_iters, + "timesteps_total": args.stop_timesteps, + "episode_reward_mean": args.stop_reward, + } + + results = tune.run(args.run, config=config, stop=stop) + + if args.as_test: + check_learning_achieved(results, args.stop_reward) + ray.shutdown() diff --git a/rllib/examples/custom_tf_policy.py b/rllib/examples/custom_tf_policy.py index 639f5c61c..aaf62f3f2 100644 --- a/rllib/examples/custom_tf_policy.py +++ b/rllib/examples/custom_tf_policy.py @@ -10,7 +10,7 @@ from ray.rllib.utils import try_import_tf tf = try_import_tf() parser = argparse.ArgumentParser() -parser.add_argument("--iters", type=int, default=200) +parser.add_argument("--stop-iters", type=int, default=200) parser.add_argument("--num-cpus", type=int, default=0) @@ -47,7 +47,7 @@ if __name__ == "__main__": ray.init(num_cpus=args.num_cpus or None) tune.run( MyTrainer, - stop={"training_iteration": args.iters}, + stop={"training_iteration": args.stop_iters}, config={ "env": "CartPole-v0", "num_workers": 2, diff --git a/rllib/examples/custom_torch_policy.py b/rllib/examples/custom_torch_policy.py index ab85badd3..77d44da29 100644 --- a/rllib/examples/custom_torch_policy.py +++ b/rllib/examples/custom_torch_policy.py @@ -7,7 +7,7 @@ from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.policy.torch_policy_template import build_torch_policy parser = argparse.ArgumentParser() -parser.add_argument("--iters", type=int, default=200) +parser.add_argument("--stop-iters", type=int, default=200) parser.add_argument("--num-cpus", type=int, default=0) @@ -33,7 +33,7 @@ if __name__ == "__main__": ray.init(num_cpus=args.num_cpus or None) tune.run( MyTrainer, - stop={"training_iteration": args.iters}, + stop={"training_iteration": args.stop_iters}, config={ "env": "CartPole-v0", "num_workers": 2, diff --git a/rllib/examples/custom_torch_rnn_model.py b/rllib/examples/custom_torch_rnn_model.py deleted file mode 100644 index 5e676bd08..000000000 --- a/rllib/examples/custom_torch_rnn_model.py +++ /dev/null @@ -1,128 +0,0 @@ -import argparse - -import ray -from ray.rllib.examples.env.repeat_initial_obs_env import RepeatInitialObsEnv -from ray.rllib.examples.env.repeat_after_me_env import RepeatAfterMeEnv -from ray.rllib.examples.env.stateless_cartpole import StatelessCartPole -from ray.rllib.models.preprocessors import get_preprocessor -from ray.rllib.models.torch.recurrent_torch_model import RecurrentTorchModel -from ray.rllib.models.modelv2 import ModelV2 -from ray.rllib.utils.annotations import override -from ray.rllib.utils import try_import_torch -from ray.rllib.models import ModelCatalog -import ray.tune as tune - -torch, nn = try_import_torch() - -parser = argparse.ArgumentParser() -parser.add_argument("--run", type=str, default="PPO") -parser.add_argument("--env", type=str, default="repeat_initial") -parser.add_argument("--stop", type=int, default=90) -parser.add_argument("--num-cpus", type=int, default=0) -parser.add_argument("--fc-size", type=int, default=64) -parser.add_argument("--lstm-cell-size", type=int, default=256) - - -class RNNModel(RecurrentTorchModel): - def __init__(self, - obs_space, - action_space, - num_outputs, - model_config, - name, - fc_size=64, - lstm_state_size=256): - super().__init__(obs_space, action_space, num_outputs, model_config, - name) - - self.obs_size = get_preprocessor(obs_space)(obs_space).size - self.fc_size = fc_size - self.lstm_state_size = lstm_state_size - - # Build the Module from fc + LSTM + 2xfc (action + value outs). - self.fc1 = nn.Linear(self.obs_size, self.fc_size) - self.lstm = nn.LSTM( - self.fc_size, self.lstm_state_size, batch_first=True) - self.action_branch = nn.Linear(self.lstm_state_size, num_outputs) - self.value_branch = nn.Linear(self.lstm_state_size, 1) - # Store the value output to save an extra forward pass. - self._cur_value = None - - @override(ModelV2) - def get_initial_state(self): - # make hidden states on same device as model - h = [ - self.fc1.weight.new(1, self.lstm_state_size).zero_().squeeze(0), - self.fc1.weight.new(1, self.lstm_state_size).zero_().squeeze(0) - ] - return h - - @override(ModelV2) - def value_function(self): - assert self._cur_value is not None, "must call forward() first" - return self._cur_value - - @override(RecurrentTorchModel) - def forward_rnn(self, inputs, state, seq_lens): - """Feeds `inputs` (B x T x ..) through the Gru Unit. - - Returns the resulting outputs as a sequence (B x T x ...). - Values are stored in self._cur_value in simple (B) shape (where B - contains both the B and T dims!). - - Returns: - NN Outputs (B x T x ...) as sequence. - The state batches as a List of two items (c- and h-states). - """ - x = nn.functional.relu(self.fc1(inputs)) - lstm_out = self.lstm( - x, [torch.unsqueeze(state[0], 0), - torch.unsqueeze(state[1], 0)]) - action_out = self.action_branch(lstm_out[0]) - self._cur_value = torch.reshape(self.value_branch(lstm_out[0]), [-1]) - return action_out, [ - torch.squeeze(lstm_out[1][0], 0), - torch.squeeze(lstm_out[1][1], 0) - ] - - -if __name__ == "__main__": - args = parser.parse_args() - - ray.init(num_cpus=args.num_cpus or None) - ModelCatalog.register_custom_model("rnn", RNNModel) - tune.register_env( - "repeat_initial", lambda _: RepeatInitialObsEnv(episode_len=100)) - tune.register_env( - "repeat_after_me", lambda _: RepeatAfterMeEnv({"repeat_delay": 1})) - tune.register_env("stateless_cartpole", lambda _: StatelessCartPole()) - - config = { - "env": args.env, - "use_pytorch": True, - "num_workers": 0, - "num_envs_per_worker": 20, - "gamma": 0.9, - "entropy_coeff": 0.0001, - "model": { - "custom_model": "rnn", - "max_seq_len": 20, - "lstm_use_prev_action_reward": "store_true", - "custom_options": { - "fc_size": args.fc_size, - "lstm_state_size": args.lstm_cell_size, - } - }, - "lr": 3e-4, - "num_sgd_iter": 5, - "vf_loss_coeff": 0.0003, - } - - tune.run( - args.run, - stop={ - "episode_reward_mean": args.stop, - "timesteps_total": 100000 - }, - config=config, - ) diff --git a/rllib/examples/eager_execution.py b/rllib/examples/eager_execution.py index 772d1e5b0..6cd2fca88 100644 --- a/rllib/examples/eager_execution.py +++ b/rllib/examples/eager_execution.py @@ -4,70 +4,20 @@ import random import ray from ray import tune from ray.rllib.agents.trainer_template import build_trainer +from ray.rllib.examples.models.eager_model import EagerModel from ray.rllib.models import ModelCatalog -from ray.rllib.models.modelv2 import ModelV2 -from ray.rllib.models.tf.fcnet_v2 import FullyConnectedNetwork -from ray.rllib.models.tf.tf_modelv2 import TFModelV2 from ray.rllib.policy.sample_batch import SampleBatch from ray.rllib.policy.tf_policy_template import build_tf_policy -from ray.rllib.utils import try_import_tf -from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_tf +from ray.rllib.utils.test_utils import check_learning_achieved tf = try_import_tf() parser = argparse.ArgumentParser() -parser.add_argument("--iters", type=int, default=200) - - -class EagerModel(TFModelV2): - """Example of using embedded eager execution in a custom model. - - This shows how to use tf.py_function() to execute a snippet of TF code - in eager mode. Here the `self.forward_eager` method just prints out - the intermediate tensor for debug purposes, but you can in general - perform any TF eager operation in tf.py_function(). - """ - - def __init__(self, observation_space, action_space, num_outputs, - model_config, name): - super().__init__(observation_space, action_space, num_outputs, - model_config, name) - - inputs = tf.keras.layers.Input(shape=observation_space.shape) - self.fcnet = FullyConnectedNetwork( - obs_space=self.obs_space, - action_space=self.action_space, - num_outputs=self.num_outputs, - model_config=self.model_config, - name="fc1") - out, value_out = self.fcnet.base_model(inputs) - - def lambda_(x): - eager_out = tf.py_function(self.forward_eager, [x], tf.float32) - with tf.control_dependencies([eager_out]): - eager_out.set_shape(x.shape) - return eager_out - - out = tf.keras.layers.Lambda(lambda_)(out) - self.base_model = tf.keras.models.Model(inputs, [out, value_out]) - self.register_variables(self.base_model.variables) - - @override(ModelV2) - def forward(self, input_dict, state, seq_lens): - out, self._value_out = self.base_model(input_dict["obs"], state, - seq_lens) - return out, [] - - @override(ModelV2) - def value_function(self): - return tf.reshape(self._value_out, [-1]) - - def forward_eager(self, feature_layer): - assert tf.executing_eagerly() - if random.random() > 0.99: - print("Eagerly printing the feature layer mean value", - tf.reduce_mean(feature_layer)) - return feature_layer +parser.add_argument("--stop-iters", type=int, default=200) +parser.add_argument("--stop-timesteps", type=int, default=100000) +parser.add_argument("--stop-reward", type=float, default=150) +parser.add_argument("--as-test", action="store_true") def policy_gradient_loss(policy, model, dist_class, train_batch): @@ -119,5 +69,14 @@ if __name__ == "__main__": "custom_model": "eager_model" }, } + stop = { + "timesteps_total": args.stop_timesteps, + "training_iteration": args.stop_iters, + "episode_reward_mean": args.stop_reward, + } - tune.run(MyTrainer, stop={"training_iteration": args.iters}, config=config) + results = tune.run(MyTrainer, stop=stop, config=config) + + if args.as_test: + check_learning_achieved(results, args.stop_reward) + ray.shutdown() diff --git a/rllib/examples/env/rock_paper_scissors.py b/rllib/examples/env/rock_paper_scissors.py index bd27da7f0..0633d4308 100644 --- a/rllib/examples/env/rock_paper_scissors.py +++ b/rllib/examples/env/rock_paper_scissors.py @@ -15,9 +15,9 @@ class RockPaperScissors(MultiAgentEnv): SPOCK = 4 def __init__(self, config): - self.action_space = Discrete(3) - self.observation_space = Discrete(3) self.sheldon_cooper = config.get("sheldon_cooper", False) + self.action_space = Discrete(5 if self.sheldon_cooper else 3) + self.observation_space = Discrete(5 if self.sheldon_cooper else 3) self.player1 = "player1" self.player2 = "player2" self.last_move = None diff --git a/rllib/examples/hierarchical_training.py b/rllib/examples/hierarchical_training.py index 37f33883f..992a14fe2 100644 --- a/rllib/examples/hierarchical_training.py +++ b/rllib/examples/hierarchical_training.py @@ -28,13 +28,16 @@ import logging import ray from ray import tune +from ray.tune import function from ray.rllib.examples.env.windy_maze_env import WindyMazeEnv, \ HierarchicalWindyMazeEnv -from ray.tune import function +from ray.rllib.utils.test_utils import check_learning_achieved parser = argparse.ArgumentParser() parser.add_argument("--flat", action="store_true") +parser.add_argument("--as-test", action="store_true") parser.add_argument("--torch", action="store_true") +parser.add_argument("--stop-iters", type=int, default=200) parser.add_argument("--stop-reward", type=float, default=0.0) parser.add_argument("--stop-timesteps", type=int, default=100000) @@ -45,8 +48,9 @@ if __name__ == "__main__": ray.init() stop = { - "episode_reward_mean": args.stop_reward, + "training_iteration": args.stop_iters, "timesteps_total": args.stop_timesteps, + "episode_reward_mean": args.stop_reward, } if args.flat: @@ -92,15 +96,9 @@ if __name__ == "__main__": "use_pytorch": args.torch, } - results = tune.run( - "PPO", - stop=stop, - config=config, - ) + results = tune.run("PPO", stop=stop, config=config) - # Error if stop-reward not reached. - if results.trials[0].last_result["episode_reward_mean"] < \ - args.stop_reward: - raise ValueError("`stop-reward` of {} not reached!".format( - args.stop_reward)) - print("ok") + if args.as_test: + check_learning_achieved(results, args.stop_reward) + + ray.shutdown() diff --git a/rllib/examples/mobilenet_v2_with_lstm.py b/rllib/examples/mobilenet_v2_with_lstm.py new file mode 100644 index 000000000..36529069e --- /dev/null +++ b/rllib/examples/mobilenet_v2_with_lstm.py @@ -0,0 +1,58 @@ +# Explains/tests Issues: +# https://github.com/ray-project/ray/issues/6928 +# https://github.com/ray-project/ray/issues/6732 + +import argparse +from gym.spaces import Discrete, Box +import numpy as np + +from ray.rllib.agents.ppo import PPOTrainer +from ray.rllib.examples.env.random_env import RandomEnv +from ray.rllib.examples.models.mobilenet_v2_with_lstm_models import \ + MobileV2PlusRNNModel, TorchMobileV2PlusRNNModel +from ray.rllib.models import ModelCatalog +from ray.rllib.utils import try_import_tf + +tf = try_import_tf() + +cnn_shape = (4, 4, 3) +# The torch version of MobileNetV2 does channels first. +cnn_shape_torch = (3, 224, 224) + +parser = argparse.ArgumentParser() +parser.add_argument("--torch", action="store_true") + +if __name__ == "__main__": + args = parser.parse_args() + + # Register our custom model. + ModelCatalog.register_custom_model( + "my_model", TorchMobileV2PlusRNNModel + if args.torch else MobileV2PlusRNNModel) + + # Configure our Trainer. + config = { + "use_pytorch": args.torch, + "model": { + "custom_model": "my_model", + # Extra config passed to the custom model's c'tor as kwargs. + "custom_options": { + "cnn_shape": cnn_shape_torch if args.torch else cnn_shape, + }, + "max_seq_len": 20, + }, + "vf_share_layers": True, + "num_workers": 0, # no parallelism + "env_config": { + "action_space": Discrete(2), + # Test a simple Image observation space. + "observation_space": Box( + 0.0, + 1.0, + shape=cnn_shape_torch if args.torch else cnn_shape, + dtype=np.float32) + }, + } + + trainer = PPOTrainer(config=config, env=RandomEnv) + print(trainer.train()) diff --git a/rllib/examples/models/centralized_critic_models.py b/rllib/examples/models/centralized_critic_models.py index 0e8937510..77c40cd54 100644 --- a/rllib/examples/models/centralized_critic_models.py +++ b/rllib/examples/models/centralized_critic_models.py @@ -111,8 +111,10 @@ class TorchCentralizedCriticModel(TorchModelV2, nn.Module): # Central VF maps (obs, opp_obs, opp_act) -> vf_pred input_size = 6 + 6 + 2 # obs + opp_obs + opp_act - self.central_vf_dense = SlimFC(input_size, 16, activation_fn=nn.Tanh) - self.central_vf_out = SlimFC(16, 1) + self.central_vf = nn.Sequential( + SlimFC(input_size, 16, activation_fn=nn.Tanh), + SlimFC(16, 1), + ) @override(ModelV2) def forward(self, input_dict, state, seq_lens): @@ -122,10 +124,9 @@ class TorchCentralizedCriticModel(TorchModelV2, nn.Module): def central_value_function(self, obs, opponent_obs, opponent_actions): input_ = torch.cat([ obs, opponent_obs, - torch.nn.functional.one_hot(opponent_actions, 2) + torch.nn.functional.one_hot(opponent_actions, 2).float() ], 1) - return torch.reshape( - self.central_vf_out(self.central_vf_dense(input_)), [-1]) + return torch.reshape(self.central_vf(input_), [-1]) @override(ModelV2) def value_function(self): diff --git a/rllib/examples/models/custom_loss_model.py b/rllib/examples/models/custom_loss_model.py new file mode 100644 index 000000000..fd269acdb --- /dev/null +++ b/rllib/examples/models/custom_loss_model.py @@ -0,0 +1,164 @@ +from ray.rllib.models.model import Model, restore_original_dimensions +from ray.rllib.models.modelv2 import ModelV2 +from ray.rllib.models.tf.tf_action_dist import Categorical +from ray.rllib.models.tf.tf_modelv2 import TFModelV2 +from ray.rllib.models.tf.fcnet_v2 import FullyConnectedNetwork +from ray.rllib.models.torch.torch_action_dist import TorchCategorical +from ray.rllib.models.torch.torch_modelv2 import TorchModelV2 +from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC +from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_tf, try_import_torch +from ray.rllib.offline import JsonReader + +tf = try_import_tf() +torch, nn = try_import_torch() + + +class CustomLossModel(TFModelV2): + """Custom model that adds an imitation loss on top of the policy loss.""" + + def __init__(self, obs_space, action_space, num_outputs, model_config, + name): + super().__init__(obs_space, action_space, num_outputs, model_config, + name) + + self.fcnet = FullyConnectedNetwork( + self.obs_space, + self.action_space, + num_outputs, + model_config, + name="fcnet") + self.register_variables(self.fcnet.variables()) + + @override(ModelV2) + def forward(self, input_dict, state, seq_lens): + # Delegate to our FCNet. + return self.fcnet(input_dict, state, seq_lens) + + @override(ModelV2) + def custom_loss(self, policy_loss, loss_inputs): + # Create a new input reader per worker. + reader = JsonReader(self.model_config["custom_options"]["input_files"]) + input_ops = reader.tf_input_ops() + + # Define a secondary loss by building a graph copy with weight sharing. + obs = restore_original_dimensions( + tf.cast(input_ops["obs"], tf.float32), self.obs_space) + logits, _ = self.forward({"obs": obs}, [], None) + + # You can also add self-supervised losses easily by referencing tensors + # created during _build_layers_v2(). For example, an autoencoder-style + # loss can be added as follows: + # ae_loss = squared_diff( + # loss_inputs["obs"], Decoder(self.fcnet.last_layer)) + print("FYI: You can also use these tensors: {}, ".format(loss_inputs)) + + # Compute the IL loss. + action_dist = Categorical(logits, self.model_config) + self.policy_loss = policy_loss + self.imitation_loss = tf.reduce_mean( + -action_dist.logp(input_ops["actions"])) + return policy_loss + 10 * self.imitation_loss + + def custom_stats(self): + return { + "policy_loss": self.policy_loss, + "imitation_loss": self.imitation_loss, + } + + +class DeprecatedCustomLossModelV1(Model): + """Model(V1) version of above custom-loss model.""" + + def _build_layers_v2(self, input_dict, num_outputs, options): + self.obs_in = input_dict["obs"] + with tf.variable_scope("shared", reuse=tf.AUTO_REUSE): + self.fcnet = FullyConnectedNetwork(input_dict, self.obs_space, + self.action_space, num_outputs, + options) + return self.fcnet.outputs, self.fcnet.last_layer + + def custom_loss(self, policy_loss, loss_inputs): + # create a new input reader per worker + reader = JsonReader(self.options["custom_options"]["input_files"]) + input_ops = reader.tf_input_ops() + + # define a secondary loss by building a graph copy with weight sharing + obs = tf.cast(input_ops["obs"], tf.float32) + logits, _ = self._build_layers_v2({ + "obs": restore_original_dimensions(obs, self.obs_space) + }, self.num_outputs, self.options) + + # You can also add self-supervised losses easily by referencing tensors + # created during _build_layers_v2(). For example, an autoencoder-style + # loss can be added as follows: + # ae_loss = squared_diff( + # loss_inputs["obs"], Decoder(self.fcnet.last_layer)) + print("FYI: You can also use these tensors: {}, ".format(loss_inputs)) + + # compute the IL loss + action_dist = Categorical(logits, self.options) + self.policy_loss = policy_loss + self.imitation_loss = tf.reduce_mean( + -action_dist.logp(input_ops["actions"])) + return policy_loss + 10 * self.imitation_loss + + def custom_stats(self): + return { + "policy_loss": self.policy_loss, + "imitation_loss": self.imitation_loss, + } + + +class TorchCustomLossModel(TorchModelV2, nn.Module): + """PyTorch version of the CustomLossModel above.""" + + def __init__(self, obs_space, action_space, num_outputs, model_config, + name, input_files): + super().__init__(obs_space, action_space, num_outputs, model_config, + name) + nn.Module.__init__(self) + + self.input_files = input_files + self.fcnet = TorchFC( + self.obs_space, + self.action_space, + num_outputs, + model_config, + name="fcnet") + + @override(ModelV2) + def forward(self, input_dict, state, seq_lens): + # Delegate to our FCNet. + return self.fcnet(input_dict, state, seq_lens) + + @override(ModelV2) + def custom_loss(self, policy_loss, loss_inputs): + # Create a new input reader per worker. + reader = JsonReader(self.input_files) + input_ops = reader.tf_input_ops() + + # Define a secondary loss by building a graph copy with weight sharing. + obs = restore_original_dimensions( + tf.cast(input_ops["obs"], tf.float32), self.obs_space) + logits, _ = self.forward({"obs": obs}, [], None) + + # You can also add self-supervised losses easily by referencing tensors + # created during _build_layers_v2(). For example, an autoencoder-style + # loss can be added as follows: + # ae_loss = squared_diff( + # loss_inputs["obs"], Decoder(self.fcnet.last_layer)) + print("FYI: You can also use these tensors: {}, ".format(loss_inputs)) + + # Compute the IL loss. + action_dist = TorchCategorical(logits, self.model_config) + self.policy_loss = policy_loss + self.imitation_loss = torch.mean( + -action_dist.logp(input_ops["actions"])) + return policy_loss + 10 * self.imitation_loss + + def custom_stats(self): + return { + "policy_loss": self.policy_loss, + "imitation_loss": self.imitation_loss, + } diff --git a/rllib/examples/multi_agent_cartpole.py b/rllib/examples/multi_agent_cartpole.py index 68a931ea5..6f99d8791 100644 --- a/rllib/examples/multi_agent_cartpole.py +++ b/rllib/examples/multi_agent_cartpole.py @@ -16,11 +16,11 @@ import random import ray from ray import tune from ray.rllib.examples.env.multi_agent import MultiAgentCartPole +from ray.rllib.examples.models.shared_weights_model import \ + SharedWeightsModel1, SharedWeightsModel2, TorchSharedWeightsModel from ray.rllib.models import ModelCatalog -from ray.rllib.models.modelv2 import ModelV2 -from ray.rllib.models.tf.tf_modelv2 import TFModelV2 -from ray.rllib.utils import try_import_tf -from ray.rllib.utils.annotations import override +from ray.rllib.utils.framework import try_import_tf +from ray.rllib.utils.test_utils import check_learning_achieved tf = try_import_tf() @@ -28,89 +28,31 @@ parser = argparse.ArgumentParser() parser.add_argument("--num-agents", type=int, default=4) parser.add_argument("--num-policies", type=int, default=2) -parser.add_argument("--num-iters", type=int, default=20) +parser.add_argument("--stop-iters", type=int, default=20) +parser.add_argument("--stop-reward", type=float, default=150) +parser.add_argument("--stop-timesteps", type=int, default=100000) parser.add_argument("--simple", action="store_true") parser.add_argument("--num-cpus", type=int, default=0) - - -class CustomModel1(TFModelV2): - def __init__(self, observation_space, action_space, num_outputs, - model_config, name): - super().__init__(observation_space, action_space, num_outputs, - model_config, name) - - inputs = tf.keras.layers.Input(observation_space.shape) - # Example of (optional) weight sharing between two different policies. - # Here, we share the variables defined in the 'shared' variable scope - # by entering it explicitly with tf.AUTO_REUSE. This creates the - # variables for the 'fc1' layer in a global scope called 'shared' - # outside of the policy's normal variable scope. - with tf.variable_scope( - tf.VariableScope(tf.AUTO_REUSE, "shared"), - reuse=tf.AUTO_REUSE, - auxiliary_name_scope=False): - last_layer = tf.keras.layers.Dense( - units=64, activation=tf.nn.relu, name="fc1")(inputs) - output = tf.keras.layers.Dense( - units=num_outputs, activation=None, name="fc_out")(last_layer) - vf = tf.keras.layers.Dense( - units=1, activation=None, name="value_out")(last_layer) - self.base_model = tf.keras.models.Model(inputs, [output, vf]) - self.register_variables(self.base_model.variables) - - @override(ModelV2) - def forward(self, input_dict, state, seq_lens): - out, self._value_out = self.base_model(input_dict["obs"]) - return out, [] - - @override(ModelV2) - def value_function(self): - return tf.reshape(self._value_out, [-1]) - - -class CustomModel2(TFModelV2): - def __init__(self, observation_space, action_space, num_outputs, - model_config, name): - super().__init__(observation_space, action_space, num_outputs, - model_config, name) - - inputs = tf.keras.layers.Input(observation_space.shape) - - # Weights shared with CustomModel1. - with tf.variable_scope( - tf.VariableScope(tf.AUTO_REUSE, "shared"), - reuse=tf.AUTO_REUSE, - auxiliary_name_scope=False): - last_layer = tf.keras.layers.Dense( - units=64, activation=tf.nn.relu, name="fc1")(inputs) - output = tf.keras.layers.Dense( - units=num_outputs, activation=None, name="fc_out")(last_layer) - vf = tf.keras.layers.Dense( - units=1, activation=None, name="value_out")(last_layer) - self.base_model = tf.keras.models.Model(inputs, [output, vf]) - self.register_variables(self.base_model.variables) - - @override(ModelV2) - def forward(self, input_dict, state, seq_lens): - out, self._value_out = self.base_model(input_dict["obs"]) - return out, [] - - @override(ModelV2) - def value_function(self): - return tf.reshape(self._value_out, [-1]) - +parser.add_argument("--as-test", action="store_true") +parser.add_argument("--torch", action="store_true") if __name__ == "__main__": args = parser.parse_args() + ray.init(num_cpus=args.num_cpus or None) - ModelCatalog.register_custom_model("model1", CustomModel1) - ModelCatalog.register_custom_model("model2", CustomModel2) + # Register the models to use. + mod1 = TorchSharedWeightsModel if args.torch else SharedWeightsModel1 + mod2 = TorchSharedWeightsModel if args.torch else SharedWeightsModel2 + ModelCatalog.register_custom_model("model1", mod1) + ModelCatalog.register_custom_model("model2", mod2) + + # Get obs- and action Spaces. single_env = gym.make("CartPole-v0") obs_space = single_env.observation_space act_space = single_env.action_space - # Each policy can have a different configuration (including custom model) + # Each policy can have a different configuration (including custom model). def gen_policy(i): config = { "model": { @@ -120,28 +62,35 @@ if __name__ == "__main__": } return (None, obs_space, act_space, config) - # Setup PPO with an ensemble of `num_policies` different policies + # Setup PPO with an ensemble of `num_policies` different policies. policies = { "policy_{}".format(i): gen_policy(i) for i in range(args.num_policies) } policy_ids = list(policies.keys()) - tune.run( - "PPO", - stop={"training_iteration": args.num_iters}, - config={ - "env": MultiAgentCartPole, - "env_config": { - "num_agents": args.num_agents, - }, - "log_level": "DEBUG", - "simple_optimizer": args.simple, - "num_sgd_iter": 10, - "multiagent": { - "policies": policies, - "policy_mapping_fn": ( - lambda agent_id: random.choice(policy_ids)), - }, + config = { + "env": MultiAgentCartPole, + "env_config": { + "num_agents": args.num_agents, }, - ) + "log_level": "DEBUG", + "simple_optimizer": args.simple, + "num_sgd_iter": 10, + "multiagent": { + "policies": policies, + "policy_mapping_fn": (lambda agent_id: random.choice(policy_ids)), + }, + "use_pytorch": args.torch, + } + stop = { + "episode_reward_mean": args.stop_reward, + "timesteps_total": args.stop_timesteps, + "training_iteration": args.stop_iters, + } + + results = tune.run("PPO", stop=stop, config=config) + + if args.as_test: + check_learning_achieved(results, args.stop_reward) + ray.shutdown() diff --git a/rllib/examples/multi_agent_custom_policy.py b/rllib/examples/multi_agent_custom_policy.py index 96bb48fd7..a581c4638 100644 --- a/rllib/examples/multi_agent_custom_policy.py +++ b/rllib/examples/multi_agent_custom_policy.py @@ -18,32 +18,17 @@ import gym import ray from ray import tune -from ray.rllib.examples.env.multi_agent import MultiAgentCartPole -from ray.rllib.policy import Policy from ray.tune.registry import register_env +from ray.rllib.examples.env.multi_agent import MultiAgentCartPole +from ray.rllib.examples.policy.random_policy import RandomPolicy +from ray.rllib.utils.test_utils import check_learning_achieved parser = argparse.ArgumentParser() -parser.add_argument("--num-iters", type=int, default=20) - - -class RandomPolicy(Policy): - """Hand-coded policy that returns random actions.""" - - def compute_actions(self, - obs_batch, - state_batches=None, - prev_action_batch=None, - prev_reward_batch=None, - info_batch=None, - episodes=None, - **kwargs): - """Compute actions on a batch of observations.""" - return [self.action_space.sample() for _ in obs_batch], [], {} - - def learn_on_batch(self, samples): - """No learning.""" - return {} - +parser.add_argument("--torch", action="store_true") +parser.add_argument("--as-test", action="store_true") +parser.add_argument("--stop-iters", type=int, default=20) +parser.add_argument("--stop-reward", type=float, default=150) +parser.add_argument("--stop-timesteps", type=int, default=100000) if __name__ == "__main__": args = parser.parse_args() @@ -56,18 +41,32 @@ if __name__ == "__main__": obs_space = single_env.observation_space act_space = single_env.action_space - tune.run( + stop = { + "training_iteration": args.stop_iters, + "episode_reward_mean": args.stop_reward, + "timesteps_total": args.stop_timesteps, + } + + results = tune.run( "PG", - stop={"training_iteration": args.num_iters}, + stop=stop, config={ "env": "multi_agent_cartpole", "multiagent": { "policies": { - "pg_policy": (None, obs_space, act_space, {}), + "pg_policy": (None, obs_space, act_space, { + "use_pytorch": args.torch + }), "random": (RandomPolicy, obs_space, act_space, {}), }, "policy_mapping_fn": ( lambda agent_id: ["pg_policy", "random"][agent_id % 2]), }, + "use_pytorch": args.torch, }, ) + + if args.as_test: + check_learning_achieved(results, args.stop_reward) + + ray.shutdown() diff --git a/rllib/examples/multi_agent_two_trainers.py b/rllib/examples/multi_agent_two_trainers.py index 2956bc767..77ffd74d0 100644 --- a/rllib/examples/multi_agent_two_trainers.py +++ b/rllib/examples/multi_agent_two_trainers.py @@ -12,19 +12,27 @@ import argparse import gym import ray -from ray.rllib.agents.dqn.dqn import DQNTrainer -from ray.rllib.agents.dqn.dqn_tf_policy import DQNTFPolicy -from ray.rllib.agents.ppo.ppo import PPOTrainer -from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy +from ray.rllib.agents.dqn import DQNTrainer, DQNTFPolicy, DQNTorchPolicy +from ray.rllib.agents.ppo import PPOTrainer, PPOTFPolicy, PPOTorchPolicy from ray.rllib.examples.env.multi_agent import MultiAgentCartPole from ray.tune.logger import pretty_print from ray.tune.registry import register_env parser = argparse.ArgumentParser() -parser.add_argument("--num-iters", type=int, default=20) +# Use torch for both policies. +parser.add_argument("--torch", action="store_true") +# Mix PPO=tf and DQN=torch if set. +parser.add_argument("--mixed-torch-tf", action="store_true") +parser.add_argument("--as-test", action="store_true") +parser.add_argument("--stop-iters", type=int, default=20) +parser.add_argument("--stop-reward", type=float, default=50) +parser.add_argument("--stop-timesteps", type=int, default=100000) if __name__ == "__main__": args = parser.parse_args() + assert not (args.torch and args.mixed_torch_tf),\ + "Use either --torch or --mixed-torch-tf, not both!" + ray.init() # Simple environment with 4 independent cartpole entities @@ -37,8 +45,10 @@ if __name__ == "__main__": # You can also have multiple policies per trainer, but here we just # show one each for PPO and DQN. policies = { - "ppo_policy": (PPOTFPolicy, obs_space, act_space, {}), - "dqn_policy": (DQNTFPolicy, obs_space, act_space, {}), + "ppo_policy": (PPOTorchPolicy if args.torch else PPOTFPolicy, + obs_space, act_space, {}), + "dqn_policy": (DQNTorchPolicy if args.torch or args.mixed_torch_tf else + DQNTFPolicy, obs_space, act_space, {}), } def policy_mapping_fn(agent_id): @@ -59,6 +69,7 @@ if __name__ == "__main__": # disable filters, otherwise we would need to synchronize those # as well to the DQN agent "observation_filter": "NoFilter", + "use_pytorch": args.torch, }) dqn_trainer = DQNTrainer( @@ -71,6 +82,7 @@ if __name__ == "__main__": }, "gamma": 0.95, "n_step": 3, + "use_pytorch": args.torch or args.mixed_torch_tf, }) # You should see both the printed X and Y approach 200 as this trains: @@ -78,17 +90,31 @@ if __name__ == "__main__": # policy_reward_mean: # dqn_policy: X # ppo_policy: Y - for i in range(args.num_iters): + for i in range(args.stop_iters): print("== Iteration", i, "==") # improve the DQN policy print("-- DQN --") - print(pretty_print(dqn_trainer.train())) + result_dqn = dqn_trainer.train() + print(pretty_print(result_dqn)) # improve the PPO policy print("-- PPO --") - print(pretty_print(ppo_trainer.train())) + result_ppo = ppo_trainer.train() + print(pretty_print(result_ppo)) + + # Test passed gracefully. + if args.as_test and \ + result_dqn["episode_reward_mean"] > args.stop_reward and \ + result_ppo["episode_reward_mean"] > args.stop_reward: + print("test passed (both agents above requested reward)") + quit(0) # swap weights to synchronize dqn_trainer.set_weights(ppo_trainer.get_weights(["ppo_policy"])) ppo_trainer.set_weights(dqn_trainer.get_weights(["dqn_policy"])) + + # Desired reward not reached. + if args.as_test: + raise ValueError("Desired reward ({}) not reached!".format( + args.stop_reward)) diff --git a/rllib/examples/nested_action_spaces.py b/rllib/examples/nested_action_spaces.py index a3d8ad0ba..96fb0a9ae 100644 --- a/rllib/examples/nested_action_spaces.py +++ b/rllib/examples/nested_action_spaces.py @@ -1,22 +1,20 @@ import argparse from gym.spaces import Dict, Tuple, Box, Discrete -import sys import ray +import ray.tune as tune from ray.tune.registry import register_env from ray.rllib.examples.env.nested_space_repeat_after_me_env import \ NestedSpaceRepeatAfterMeEnv -from ray.rllib.utils import try_import_tree -from ray.rllib.utils.framework import try_import_tf - -tf = try_import_tf() -tree = try_import_tree() +from ray.rllib.utils.test_utils import check_learning_achieved parser = argparse.ArgumentParser() parser.add_argument("--run", type=str, default="PPO") parser.add_argument("--torch", action="store_true") -parser.add_argument("--stop", type=int, default=90) -parser.add_argument("--max-trainstop", type=int, default=90) +parser.add_argument("--as-test", action="store_true") +parser.add_argument("--stop-reward", type=float, default=0.0) +parser.add_argument("--stop-iters", type=int, default=100) +parser.add_argument("--stop-timesteps", type=int, default=100000) parser.add_argument("--num-cpus", type=int, default=0) if __name__ == "__main__": @@ -40,19 +38,23 @@ if __name__ == "__main__": }, "entropy_coeff": 0.00005, # We don't want high entropy in this Env. "gamma": 0.0, # No history in Env (bandit problem). - "lr": 0.0003, + "lr": 0.0005, "num_envs_per_worker": 20, - "num_sgd_iter": 20, + "num_sgd_iter": 4, "num_workers": 0, - "use_pytorch": args.torch, "vf_loss_coeff": 0.01, + "use_pytorch": args.torch, } - import ray.rllib.agents.ppo as ppo - trainer = ppo.PPOTrainer(config=config) - for _ in range(100): - results = trainer.train() - print(results) - if results["episode_reward_mean"] > args.stop: - sys.exit(0) # Learnt, exit gracefully. - sys.exit(1) # Done, but did not learn, exit with error. + stop = { + "training_iteration": args.stop_iters, + "episode_reward_mean": args.stop_reward, + "timesteps_total": args.stop_timesteps, + } + + results = tune.run(args.run, config=config, stop=stop) + + if args.as_test: + check_learning_achieved(results, args.stop_reward) + + ray.shutdown() diff --git a/rllib/examples/parametric_actions_cartpole.py b/rllib/examples/parametric_actions_cartpole.py index 62e7dae54..c83c588a3 100644 --- a/rllib/examples/parametric_actions_cartpole.py +++ b/rllib/examples/parametric_actions_cartpole.py @@ -15,83 +15,34 @@ Working configurations are given below. """ import argparse -from gym.spaces import Box import ray from ray import tune -from ray.rllib.agents.dqn.distributional_q_tf_model import \ - DistributionalQTFModel +from ray.tune.registry import register_env from ray.rllib.examples.env.parametric_actions_cartpole import \ ParametricActionsCartPole +from ray.rllib.examples.models.parametric_actions_model import \ + ParametricActionsModel, TorchParametricActionsModel from ray.rllib.models import ModelCatalog -from ray.rllib.models.tf.fcnet_v2 import FullyConnectedNetwork -from ray.rllib.models.tf.tf_modelv2 import TFModelV2 -from ray.tune.registry import register_env -from ray.rllib.utils import try_import_tf - -tf = try_import_tf() +from ray.rllib.utils.test_utils import check_learning_achieved parser = argparse.ArgumentParser() -parser.add_argument("--stop", type=int, default=200) parser.add_argument("--run", type=str, default="PPO") - - -class ParametricActionsModel(DistributionalQTFModel, TFModelV2): - """Parametric action model that handles the dot product and masking. - - This assumes the outputs are logits for a single Categorical action dist. - Getting this to work with a more complex output (e.g., if the action space - is a tuple of several distributions) is also possible but left as an - exercise to the reader. - """ - - def __init__(self, - obs_space, - action_space, - num_outputs, - model_config, - name, - true_obs_shape=(4, ), - action_embed_size=2, - **kw): - super(ParametricActionsModel, self).__init__( - obs_space, action_space, num_outputs, model_config, name, **kw) - self.action_embed_model = FullyConnectedNetwork( - Box(-1, 1, shape=true_obs_shape), action_space, action_embed_size, - model_config, name + "_action_embed") - self.register_variables(self.action_embed_model.variables()) - - def forward(self, input_dict, state, seq_lens): - # Extract the available actions tensor from the observation. - avail_actions = input_dict["obs"]["avail_actions"] - action_mask = input_dict["obs"]["action_mask"] - - # Compute the predicted action embedding - action_embed, _ = self.action_embed_model({ - "obs": input_dict["obs"]["cart"] - }) - - # Expand the model output to [BATCH, 1, EMBED_SIZE]. Note that the - # avail actions tensor is of shape [BATCH, MAX_ACTIONS, EMBED_SIZE]. - intent_vector = tf.expand_dims(action_embed, 1) - - # Batch dot product => shape of logits is [BATCH, MAX_ACTIONS]. - action_logits = tf.reduce_sum(avail_actions * intent_vector, axis=2) - - # Mask out invalid actions (use tf.float32.min for stability) - inf_mask = tf.maximum(tf.log(action_mask), tf.float32.min) - return action_logits + inf_mask, state - - def value_function(self): - return self.action_embed_model.value_function() - +parser.add_argument("--torch", action="store_true") +parser.add_argument("--as-test", action="store_true") +parser.add_argument("--stop-iters", type=int, default=200) +parser.add_argument("--stop-reward", type=float, default=150.0) +parser.add_argument("--stop-timesteps", type=int, default=100000) if __name__ == "__main__": args = parser.parse_args() ray.init() - ModelCatalog.register_custom_model("pa_model", ParametricActionsModel) register_env("pa_cartpole", lambda _: ParametricActionsCartPole(10)) + ModelCatalog.register_custom_model( + "pa_model", TorchParametricActionsModel + if args.torch else ParametricActionsModel) + if args.run == "DQN": cfg = { # TODO(ekl) we need to set these to prevent the masked values @@ -103,16 +54,25 @@ if __name__ == "__main__": } else: cfg = {} - tune.run( - args.run, - stop={ - "episode_reward_mean": args.stop, + + config = dict({ + "env": "pa_cartpole", + "model": { + "custom_model": "pa_model", }, - config=dict({ - "env": "pa_cartpole", - "model": { - "custom_model": "pa_model", - }, - "num_workers": 0, - }, **cfg), - ) + "num_workers": 0, + "use_pytorch": args.torch, + }, **cfg) + + stop = { + "training_iteration": args.stop_iters, + "timesteps_total": args.stop_timesteps, + "episode_reward_mean": args.stop_reward, + } + + results = tune.run(args.run, stop=stop, config=config) + + if args.as_test: + check_learning_achieved(results, args.stop_reward) + + ray.shutdown() diff --git a/rllib/examples/rock_paper_scissors_multiagent.py b/rllib/examples/rock_paper_scissors_multiagent.py index 1095b3272..ab9d712cc 100644 --- a/rllib/examples/rock_paper_scissors_multiagent.py +++ b/rllib/examples/rock_paper_scissors_multiagent.py @@ -8,99 +8,41 @@ This demonstrates running the following policies in competition: """ import argparse -import random from gym.spaces import Discrete +import random from ray import tune -from ray.rllib.agents.pg.pg import PGTrainer -from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy +from ray.rllib.agents.pg import PGTrainer, PGTFPolicy, PGTorchPolicy +from ray.rllib.agents.registry import get_agent_class from ray.rllib.examples.env.rock_paper_scissors import RockPaperScissors -from ray.rllib.policy.policy import Policy -from ray.rllib.utils import try_import_tf - -parser = argparse.ArgumentParser() -parser.add_argument("--stop", type=int, default=1000) +from ray.rllib.examples.policy.rock_paper_scissors_dummies import \ + BeatLastHeuristic, AlwaysSameHeuristic +from ray.rllib.utils.framework import try_import_tf, try_import_torch +from ray.rllib.utils.test_utils import check_learning_achieved tf = try_import_tf() +torch, _ = try_import_torch() + +parser = argparse.ArgumentParser() +parser.add_argument("--torch", action="store_true") +parser.add_argument("--as-test", action="store_true") +parser.add_argument("--stop-iters", type=int, default=150) +parser.add_argument("--stop-reward", type=float, default=1000.0) +parser.add_argument("--stop-timesteps", type=int, default=100000) -class AlwaysSameHeuristic(Policy): - """Pick a random move and stick with it for the entire episode.""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.exploration = self._create_exploration() - - def get_initial_state(self): - return [ - random.choice([ - RockPaperScissors.ROCK, RockPaperScissors.PAPER, - RockPaperScissors.SCISSORS - ]) - ] - - def compute_actions(self, - obs_batch, - state_batches=None, - prev_action_batch=None, - prev_reward_batch=None, - info_batch=None, - episodes=None, - **kwargs): - return state_batches[0], state_batches, {} - - def learn_on_batch(self, samples): - pass - - def get_weights(self): - pass - - def set_weights(self, weights): - pass - - -class BeatLastHeuristic(Policy): - """Play the move that would beat the last move of the opponent.""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.exploration = self._create_exploration() - - def compute_actions(self, - obs_batch, - state_batches=None, - prev_action_batch=None, - prev_reward_batch=None, - info_batch=None, - episodes=None, - **kwargs): - def successor(x): - if x[RockPaperScissors.ROCK] == 1: - return RockPaperScissors.PAPER - elif x[RockPaperScissors.PAPER] == 1: - return RockPaperScissors.SCISSORS - elif x[RockPaperScissors.SCISSORS] == 1: - return RockPaperScissors.ROCK - - return [successor(x) for x in obs_batch], [], {} - - def learn_on_batch(self, samples): - pass - - def get_weights(self): - pass - - def set_weights(self, weights): - pass - - -def run_same_policy(args): +def run_same_policy(args, stop): """Use the same policy for both agents (trivial case).""" + config = { + "env": RockPaperScissors, + "use_pytorch": args.torch, + } - tune.run( - "PG", - stop={"timesteps_total": args.stop}, - config={"env": RockPaperScissors}) + results = tune.run("PG", config=config, stop=stop) + + if args.as_test: + # Check vs 0.0 as we are playing a zero-sum game. + check_learning_achieved(results, 0.0) def run_heuristic_vs_learned(args, use_lstm=False, trainer="PG"): @@ -134,16 +76,35 @@ def run_heuristic_vs_learned(args, use_lstm=False, trainer="PG"): "learned": (None, Discrete(3), Discrete(3), { "model": { "use_lstm": use_lstm - } + }, + "use_pytorch": args.torch, }), }, "policy_mapping_fn": select_policy, }, + "use_pytorch": args.torch, } - tune.run(trainer, stop={"timesteps_total": args.stop}, config=config) + cls = get_agent_class(trainer) if isinstance(trainer, str) else trainer + trainer_obj = cls(config=config) + env = trainer_obj.workers.local_worker().env + for _ in range(args.stop_iters): + results = trainer_obj.train() + print(results) + # Timesteps reached. + if results["timesteps_total"] > args.stop_timesteps: + break + # Reward (difference) reached -> all good, return. + elif env.player1_score - env.player2_score > args.stop_reward: + return + + # Reward (difference) not reached: Error if `as_test`. + if args.as_test: + raise ValueError( + "Desired reward difference ({}) not reached! Only got to {}.". + format(args.stop_reward, env.player1_score - env.player2_score)) -def run_with_custom_entropy_loss(args): +def run_with_custom_entropy_loss(args, stop): """Example of customizing the loss function of an existing policy. This performs about the same as the default loss does.""" @@ -151,24 +112,44 @@ def run_with_custom_entropy_loss(args): def entropy_policy_gradient_loss(policy, model, dist_class, train_batch): logits, _ = model.from_batch(train_batch) action_dist = dist_class(logits, model) - return (-0.1 * action_dist.entropy() - tf.reduce_mean( - action_dist.logp(train_batch["actions"]) * - train_batch["advantages"])) + if args.torch: + # required by PGTorchPolicy's stats fn. + policy.pi_err = torch.tensor([0.0]) + return torch.mean(-0.1 * action_dist.entropy() - + (action_dist.logp(train_batch["actions"]) * + train_batch["advantages"])) + else: + return (-0.1 * action_dist.entropy() - tf.reduce_mean( + action_dist.logp(train_batch["actions"]) * + train_batch["advantages"])) - EntropyPolicy = PGTFPolicy.with_updates( + policy_cls = PGTorchPolicy if args.torch else PGTFPolicy + EntropyPolicy = policy_cls.with_updates( loss_fn=entropy_policy_gradient_loss) + EntropyLossPG = PGTrainer.with_updates( name="EntropyPG", get_policy_class=lambda _: EntropyPolicy) + run_heuristic_vs_learned(args, use_lstm=True, trainer=EntropyLossPG) if __name__ == "__main__": args = parser.parse_args() + + stop = { + "training_iteration": args.stop_iters, + "timesteps_total": args.stop_timesteps, + "episode_reward_mean": args.stop_reward, + } + + run_same_policy(args, stop=stop) + print("run_same_policy: ok.") + run_heuristic_vs_learned(args, use_lstm=False) print("run_heuristic_vs_learned(w/o lstm): ok.") - run_same_policy(args) - print("run_same_policy: ok.") + run_heuristic_vs_learned(args, use_lstm=True) print("run_heuristic_vs_learned (w/ lstm): ok.") - run_with_custom_entropy_loss(args) + + run_with_custom_entropy_loss(args, stop=stop) print("run_with_custom_entropy_loss: ok.") diff --git a/rllib/examples/rollout_worker_custom_workflow.py b/rllib/examples/rollout_worker_custom_workflow.py index f6a323317..830efba71 100644 --- a/rllib/examples/rollout_worker_custom_workflow.py +++ b/rllib/examples/rollout_worker_custom_workflow.py @@ -13,8 +13,8 @@ import ray from ray import tune from ray.rllib.evaluation import RolloutWorker from ray.rllib.evaluation.metrics import collect_metrics +from ray.rllib.policy.policy import Policy from ray.rllib.policy.sample_batch import SampleBatch -from ray.rllib.policy.tests.test_policy import TestPolicy parser = argparse.ArgumentParser() parser.add_argument("--gpu", action="store_true") @@ -23,7 +23,7 @@ parser.add_argument("--num-workers", type=int, default=2) parser.add_argument("--num-cpus", type=int, default=0) -class CustomPolicy(TestPolicy): +class CustomPolicy(Policy): """Example of a custom policy written from scratch. You might find it more convenient to extend TF/TorchPolicy instead diff --git a/rllib/examples/two_trainer_workflow.py b/rllib/examples/two_trainer_workflow.py index c0ade66f6..91f93cd61 100644 --- a/rllib/examples/two_trainer_workflow.py +++ b/rllib/examples/two_trainer_workflow.py @@ -13,8 +13,10 @@ from ray import tune from ray.rllib.agents.trainer_template import build_trainer from ray.rllib.agents.dqn.dqn import DEFAULT_CONFIG as DQN_CONFIG from ray.rllib.agents.dqn.dqn_tf_policy import DQNTFPolicy +from ray.rllib.agents.dqn.dqn_torch_policy import DQNTorchPolicy from ray.rllib.agents.ppo.ppo import DEFAULT_CONFIG as PPO_CONFIG from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy +from ray.rllib.agents.ppo.ppo_torch_policy import PPOTorchPolicy from ray.rllib.evaluation.worker_set import WorkerSet from ray.rllib.execution.common import _get_shared_metrics from ray.rllib.execution.concurrency_ops import Concurrently @@ -25,10 +27,16 @@ from ray.rllib.execution.replay_ops import StoreToReplayBuffer, Replay from ray.rllib.execution.train_ops import TrainOneStep, UpdateTargetNetwork from ray.rllib.examples.env.multi_agent import MultiAgentCartPole from ray.rllib.optimizers.async_replay_optimizer import LocalReplayBuffer +from ray.rllib.utils.test_utils import check_learning_achieved from ray.tune.registry import register_env parser = argparse.ArgumentParser() -parser.add_argument("--num-iters", type=int, default=20) +parser.add_argument("--as-test", action="store_true") +parser.add_argument("--torch", action="store_true") +parser.add_argument("--mixed-torch-tf", action="store_true") +parser.add_argument("--stop-iters", type=int, default=20) +parser.add_argument("--stop-reward", type=float, default=150.0) +parser.add_argument("--stop-timesteps", type=int, default=100000) def custom_training_workflow(workers: WorkerSet, config: dict): @@ -90,6 +98,9 @@ def custom_training_workflow(workers: WorkerSet, config: dict): if __name__ == "__main__": args = parser.parse_args() + assert not (args.torch and args.mixed_torch_tf),\ + "Use either --torch or --mixed-torch-tf, not both!" + ray.init() # Simple environment with 4 independent cartpole entities @@ -102,8 +113,10 @@ if __name__ == "__main__": # Note that since the trainer below does not include a default policy or # policy configs, we have to explicitly set it in the multiagent config: policies = { - "ppo_policy": (PPOTFPolicy, obs_space, act_space, PPO_CONFIG), - "dqn_policy": (DQNTFPolicy, obs_space, act_space, DQN_CONFIG), + "ppo_policy": (PPOTorchPolicy if args.torch or args.mixed_torch_tf else + PPOTFPolicy, obs_space, act_space, PPO_CONFIG), + "dqn_policy": (DQNTorchPolicy if args.torch else DQNTFPolicy, + obs_space, act_space, DQN_CONFIG), } def policy_mapping_fn(agent_id): @@ -117,16 +130,27 @@ if __name__ == "__main__": default_policy=None, execution_plan=custom_training_workflow) - tune.run( - MyTrainer, - stop={"training_iteration": args.num_iters}, - config={ - "rollout_fragment_length": 50, - "num_workers": 0, - "env": "multi_agent_cartpole", - "multiagent": { - "policies": policies, - "policy_mapping_fn": policy_mapping_fn, - "policies_to_train": ["dqn_policy", "ppo_policy"], - }, - }) + config = { + "rollout_fragment_length": 50, + "num_workers": 0, + "env": "multi_agent_cartpole", + "multiagent": { + "policies": policies, + "policy_mapping_fn": policy_mapping_fn, + "policies_to_train": ["dqn_policy", "ppo_policy"], + }, + "use_pytorch": args.torch, + } + + stop = { + "training_iteration": args.stop_iters, + "timesteps_total": args.stop_timesteps, + "episode_reward_mean": args.stop_reward, + } + + results = tune.run(MyTrainer, config=config, stop=stop) + + if args.as_test: + check_learning_achieved(results, args.stop_reward) + + ray.shutdown() diff --git a/rllib/examples/twostep_game.py b/rllib/examples/twostep_game.py index 4252312ca..70926eb43 100644 --- a/rllib/examples/twostep_game.py +++ b/rllib/examples/twostep_game.py @@ -17,11 +17,15 @@ from ray import tune from ray.tune import register_env, grid_search from ray.rllib.env.multi_agent_env import ENV_STATE from ray.rllib.examples.env.two_step_game import TwoStepGame +from ray.rllib.utils.test_utils import check_learning_achieved parser = argparse.ArgumentParser() -parser.add_argument("--stop", type=int, default=50000) parser.add_argument("--run", type=str, default="PG") parser.add_argument("--num-cpus", type=int, default=0) +parser.add_argument("--as-test", action="store_true") +parser.add_argument("--torch", action="store_true") +parser.add_argument("--stop-reward", type=float, default=7.0) +parser.add_argument("--stop-timesteps", type=int, default=50000) if __name__ == "__main__": args = parser.parse_args() @@ -73,6 +77,7 @@ if __name__ == "__main__": }, "policy_mapping_fn": lambda x: "pol1" if x == 0 else "pol2", }, + "use_pytorch": args.torch, } group = False elif args.run == "QMIX": @@ -87,6 +92,7 @@ if __name__ == "__main__": "separate_state_space": True, "one_hot_state_encoding": True }, + "use_pytorch": args.torch, } group = True elif args.run == "APEX_QMIX": @@ -107,6 +113,7 @@ if __name__ == "__main__": "separate_state_space": True, "one_hot_state_encoding": True }, + "use_pytorch": args.torch, } group = True else: @@ -114,12 +121,19 @@ if __name__ == "__main__": group = False ray.init(num_cpus=args.num_cpus or None) - tune.run( - args.run, - stop={ - "timesteps_total": args.stop, - }, - config=dict(config, **{ - "env": "grouped_twostep" if group else TwoStepGame, - }), - ) + + stop = { + "episode_reward_mean": args.stop_reward, + "timesteps_total": args.stop_timesteps, + } + + config = dict(config, **{ + "env": "grouped_twostep" if group else TwoStepGame, + }) + + results = tune.run(args.run, stop=stop, config=config) + + if args.as_test: + check_learning_achieved(results, args.stop_reward) + + ray.shutdown() diff --git a/rllib/policy/tests/test_policy.py b/rllib/policy/tests/OBSOLETE_test_policy.py similarity index 100% rename from rllib/policy/tests/test_policy.py rename to rllib/policy/tests/OBSOLETE_test_policy.py diff --git a/rllib/tests/test_multi_agent_env.py b/rllib/tests/test_multi_agent_env.py index a94ae3207..373344acd 100644 --- a/rllib/tests/test_multi_agent_env.py +++ b/rllib/tests/test_multi_agent_env.py @@ -7,16 +7,16 @@ from ray.tune.registry import register_env from ray.rllib.agents.pg import PGTrainer from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy from ray.rllib.agents.dqn.dqn_tf_policy import DQNTFPolicy +from ray.rllib.env.base_env import _MultiAgentEnvToBaseEnv +from ray.rllib.evaluation.rollout_worker import RolloutWorker +from ray.rllib.evaluation.metrics import collect_metrics +from ray.rllib.evaluation.worker_set import WorkerSet from ray.rllib.examples.env.multi_agent import MultiAgentCartPole, \ BasicMultiAgent, EarlyDoneMultiAgent, RoundRobinMultiAgent +from ray.rllib.examples.policy.random_policy import RandomPolicy from ray.rllib.optimizers import (SyncSamplesOptimizer, SyncReplayOptimizer, AsyncGradientsOptimizer) from ray.rllib.tests.test_rollout_worker import MockPolicy -from ray.rllib.evaluation.rollout_worker import RolloutWorker -from ray.rllib.policy.tests.test_policy import TestPolicy -from ray.rllib.evaluation.metrics import collect_metrics -from ray.rllib.evaluation.worker_set import WorkerSet -from ray.rllib.env.base_env import _MultiAgentEnvToBaseEnv def one_hot(i, n): @@ -297,7 +297,7 @@ class TestMultiAgentEnv(unittest.TestCase): def test_custom_rnn_state_values(self): h = {"some": {"arbitrary": "structure", "here": [1, 2, 3]}} - class StatefulPolicy(TestPolicy): + class StatefulPolicy(RandomPolicy): def compute_actions(self, obs_batch, state_batches=None, diff --git a/rllib/tests/test_rollout_worker.py b/rllib/tests/test_rollout_worker.py index 78d38da82..798eb0925 100644 --- a/rllib/tests/test_rollout_worker.py +++ b/rllib/tests/test_rollout_worker.py @@ -13,13 +13,13 @@ from ray.rllib.env.vector_env import VectorEnv from ray.rllib.evaluation.rollout_worker import RolloutWorker from ray.rllib.evaluation.metrics import collect_metrics from ray.rllib.evaluation.postprocessing import compute_advantages -from ray.rllib.policy.tests.test_policy import TestPolicy +from ray.rllib.examples.policy.random_policy import RandomPolicy from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, SampleBatch from ray.rllib.utils.test_utils import check from ray.tune.registry import register_env -class MockPolicy(TestPolicy): +class MockPolicy(RandomPolicy): def compute_actions(self, obs_batch, state_batches=None, @@ -40,7 +40,7 @@ class MockPolicy(TestPolicy): batch, 100.0, 0.9, use_gae=False, use_critic=False) -class BadPolicy(MockPolicy): +class BadPolicy(RandomPolicy): def compute_actions(self, obs_batch, state_batches=None,