[RLlib] Examples folder restructuring (Model examples; final part). (#8278)

- This PR adds the previously missing PyTorch Model counterparts to the TFModels in examples/models. - It also makes sure that all example scripts in the rllib/examples folder are tested for both frameworks and actually learn the given task (this is currently often not checked), using an --as-test flag in combination with a --stop-reward threshold.
2026-06-27 20:53:14 +08:00 · 2020-05-12 08:23:10 +02:00
parent 9d012626e5
commit 57544b1ff9
41 changed files with 1466 additions and 1584 deletions
@@ -219,14 +219,14 @@ matrix:
        - . ./ci/travis/ci.sh build
      script:
        - ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=examples_A,examples_B rllib/...
-        - ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=examples_C rllib/...
-        - ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=examples_E,examples_L,examples_M,examples_N,examples_P rllib/...
-        - ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=examples_U,examples_R,examples_S,examples_T rllib/...
+        - ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=examples_C,examples_D rllib/...
+        - ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=examples_E,examples_F,examples_G,examples_H,examples_I,examples_J,examples_K,examples_L,examples_M,examples_N,examples_O,examples_P rllib/...
+        - ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=examples_Q,examples_R,examples_S,examples_T,examples_U,examples_V,examples_W,examples_X,examples_Y,examples_Z rllib/...

    # RLlib: tests_dir: Everything in rllib/tests/ directory (A-L).
    - os: linux
      env:
-        - RLLIB_TESTING=1 RLLIB_TESTS_DIR_TESTS_A_TO_I=1
+        - RLLIB_TESTING=1 RLLIB_TESTS_DIR_TESTS_A_TO_L=1
        - PYTHON=3.6
        - TF_VERSION=2.0.0b1
        - TFP_VERSION=0.8
@@ -237,12 +237,12 @@ matrix:
      before_script:
        - . ./ci/travis/ci.sh build
      script:
-        - ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=tests_dir_A,tests_dir_B,tests_dir_C,tests_dir_D,tests_dir_E,tests_dir_F,tests_dir_G,tests_dir_H,tests_dir_I rllib/...
+        - ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=tests_dir_A,tests_dir_B,tests_dir_C,tests_dir_D,tests_dir_E,tests_dir_F,tests_dir_G,tests_dir_H,tests_dir_I,tests_dir_J,tests_dir_K,tests_dir_L rllib/...

    # RLlib: tests_dir: Everything in rllib/tests/ directory (M-Z).
    - os: linux
      env:
-        - RLLIB_TESTING=1 RLLIB_TESTS_DIR_TESTS_J_TO_Z=1
+        - RLLIB_TESTING=1 RLLIB_TESTS_DIR_TESTS_M_TO_Z=1
        - PYTHON=3.6
        - TF_VERSION=2.0.0b1
        - TFP_VERSION=0.8
@@ -253,7 +253,7 @@ matrix:
      before_script:
        - . ./ci/travis/ci.sh build
      script:
-        - ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=tests_dir_J,tests_dir_K,tests_dir_L,tests_dir_M,tests_dir_N,tests_dir_O,tests_dir_P,tests_dir_Q,tests_dir_R,tests_dir_S,tests_dir_T,tests_dir_U,tests_dir_V,tests_dir_W,tests_dir_X,tests_dir_Y,tests_dir_Z rllib/...
+        - ./ci/keep_alive bazel test --config=ci --build_tests_only --test_tag_filters=tests_dir_M,tests_dir_N,tests_dir_O,tests_dir_P,tests_dir_Q,tests_dir_R,tests_dir_S,tests_dir_T,tests_dir_U,tests_dir_V,tests_dir_W,tests_dir_X,tests_dir_Y,tests_dir_Z rllib/...

    # Cpp worker test
    - os: linux
@@ -27,8 +27,8 @@
 # 2) everything in a) using tf1.x
 # 3) everything in b) c) d) and e)
 # 4) everything in g)
-# 5) f), BUT only those tagged `tests_dir_A` to `tests_dir_I`
-# 6) f), BUT only those tagged `tests_dir_J` to `tests_dir_Z`
+# 5) f), BUT only those tagged `tests_dir_A` to `tests_dir_L`
+# 6) f), BUT only those tagged `tests_dir_M` to `tests_dir_Z`


 # --------------------------------------------------------------------
@@ -1453,95 +1453,215 @@ py_test(


 py_test(
-    name = "examples/autoregressive_action_dist", main = "examples/autoregressive_action_dist.py",
+    name = "examples/autoregressive_action_dist_tf",
+    main = "examples/autoregressive_action_dist.py",
    tags = ["examples", "examples_A"],
-    size = "large",
+    size = "medium",
    srcs = ["examples/autoregressive_action_dist.py"],
-    args = ["--stop=150", "--num-cpus=4"]
+    args = ["--as-test", "--stop-reward=150", "--num-cpus=4"]
 )

 py_test(
-    name = "examples/batch_norm_model_ppo", main="examples/batch_norm_model.py",
+    name = "examples/autoregressive_action_dist_torch",
+    main = "examples/autoregressive_action_dist.py",
+    tags = ["examples", "examples_A"],
+    size = "medium",
+    srcs = ["examples/autoregressive_action_dist.py"],
+    args = ["--as-test", "--torch", "--stop-reward=150", "--num-cpus=4"]
+)
+
+py_test(
+    name = "examples/batch_norm_model_ppo_tf",
+    main = "examples/batch_norm_model.py",
    tags = ["examples", "examples_B"],
    size = "medium",
    srcs = ["examples/batch_norm_model.py"],
-    args = ["--run=PPO", "--num-iters=1"]
+    args = ["--as-test", "--run=PPO", "--stop-reward=80"]
 )

 py_test(
-    name = "examples/batch_norm_model_pg", main="examples/batch_norm_model.py",
+    name = "examples/batch_norm_model_ppo_torch",
+    main = "examples/batch_norm_model.py",
    tags = ["examples", "examples_B"],
    size = "medium",
    srcs = ["examples/batch_norm_model.py"],
-    args = ["--run=PG", "--num-iters=1"]
+    args = ["--as-test", "--torch", "--run=PPO", "--stop-reward=80"]
 )

 py_test(
-    name = "examples/batch_norm_model_dqn", main="examples/batch_norm_model.py",
+    name = "examples/batch_norm_model_dqn_tf",
+    main = "examples/batch_norm_model.py",
    tags = ["examples", "examples_B"],
-    size = "medium",
+    size = "medium",  # DQN learns much slower with BatchNorm.
    srcs = ["examples/batch_norm_model.py"],
-    args = ["--run=DQN", "--num-iters=1"]
+    args = ["--as-test", "--run=DQN", "--stop-reward=70"]
 )

 py_test(
-    name = "examples/batch_norm_model_ddpg", main="examples/batch_norm_model.py",
+    name = "examples/batch_norm_model_dqn_torch",
+    main = "examples/batch_norm_model.py",
    tags = ["examples", "examples_B"],
-    size = "medium",
+    size = "medium",  # DQN learns much slower with BatchNorm.
    srcs = ["examples/batch_norm_model.py"],
-    args = ["--run=DDPG", "--num-iters=1"]
+    args = ["--as-test", "--torch", "--run=DQN", "--stop-reward=70"]
 )

 py_test(
-    name = "examples/cartpole_lstm_impala", main="examples/cartpole_lstm.py",
+    name = "examples/batch_norm_model_ddpg_tf",
+    main = "examples/batch_norm_model.py",
+    tags = ["examples", "examples_B"],
+    size = "small",
+    srcs = ["examples/batch_norm_model.py"],
+    args = ["--run=DDPG", "--stop-iters=1"]
+)
+
+py_test(
+    name = "examples/batch_norm_model_ddpg_torch",
+    main = "examples/batch_norm_model.py",
+    tags = ["examples", "examples_B"],
+    size = "small",
+    srcs = ["examples/batch_norm_model.py"],
+    args = ["--torch", "--run=DDPG", "--stop-iters=1"]
+)
+
+py_test(
+    name = "examples/cartpole_lstm_impala_tf",
+    main = "examples/cartpole_lstm.py",
+    tags = ["examples", "examples_C"],
+    size = "small",
+    srcs = ["examples/cartpole_lstm.py"],
+    args = ["--as-test", "--run=IMPALA", "--stop-reward=40", "--num-cpus=4"]
+)
+
+py_test(
+    name = "examples/cartpole_lstm_impala_torch",
+    main = "examples/cartpole_lstm.py",
    tags = ["examples", "examples_C"],
    size = "medium",
    srcs = ["examples/cartpole_lstm.py"],
-    args = ["--run=IMPALA", "--stop=40", "--num-cpus=4"]
+    args = ["--as-test", "--torch", "--run=IMPALA", "--stop-reward=40", "--num-cpus=4"]
 )

 py_test(
-    name = "examples/cartpole_lstm_ppo", main="examples/cartpole_lstm.py",
+    name = "examples/cartpole_lstm_ppo_tf",
+    main = "examples/cartpole_lstm.py",
    tags = ["examples", "examples_C"],
    size = "medium",
    srcs = ["examples/cartpole_lstm.py"],
-    args = ["--run=PPO", "--stop=40", "--num-cpus=4"]
+    args = ["--as-test", "--run=PPO", "--stop-reward=40", "--num-cpus=4"]
 )

 py_test(
-    name = "examples/cartpole_lstm_ppo_with_prev_a_and_r", main="examples/cartpole_lstm.py",
+    name = "examples/cartpole_lstm_ppo_torch",
+    main = "examples/cartpole_lstm.py",
    tags = ["examples", "examples_C"],
-    size = "large",
+    size = "small",
    srcs = ["examples/cartpole_lstm.py"],
-    args = ["--run=PPO", "--stop=40", "--use-prev-action-reward", "--num-cpus=4"]
+    args = ["--as-test", "--torch", "--run=PPO", "--stop-reward=40", "--num-cpus=4"]
 )

 py_test(
-    name = "examples/centralized_critic",
+    name = "examples/cartpole_lstm_ppo_tf_with_prev_a_and_r",
+    main = "examples/cartpole_lstm.py",
+    tags = ["examples", "examples_C"],
+    size = "medium",
+    srcs = ["examples/cartpole_lstm.py"],
+    args = ["--as-test", "--run=PPO", "--stop-reward=40", "--use-prev-action-reward", "--num-cpus=4"]
+)
+
+py_test(
+    name = "examples/centralized_critic_tf",
+    main = "examples/centralized_critic.py",
    tags = ["examples", "examples_C"],
    size = "medium",
    srcs = ["examples/centralized_critic.py"],
-    args = ["--stop=2000"]
+    args = ["--as-test", "--stop-reward=7.2"]
 )

 py_test(
-    name = "examples/centralized_critic_2",
+    name = "examples/centralized_critic_torch",
+    main = "examples/centralized_critic.py",
+    tags = ["examples", "examples_C"],
+    size = "medium",
+    srcs = ["examples/centralized_critic.py"],
+    args = ["--as-test", "--torch", "--stop-reward=7.2"]
+)
+
+py_test(
+    name = "examples/centralized_critic_2_tf",
+    main = "examples/centralized_critic_2.py",
    tags = ["examples", "examples_C"],
    size = "medium",
    srcs = ["examples/centralized_critic_2.py"],
-    args = ["--stop=2000"]
+    args = ["--as-test", "--stop-reward=6.0"]
 )

 py_test(
-    name = "examples/custom_eval", main = "examples/custom_eval.py",
+    name = "examples/centralized_critic_2_torch",
+    main = "examples/centralized_critic_2.py",
    tags = ["examples", "examples_C"],
    size = "medium",
-    srcs = ["examples/custom_eval.py"],
-    args = ["--custom-eval", "--num-cpus=4"]
+    srcs = ["examples/centralized_critic_2.py"],
+    args = ["--as-test", "--torch", "--stop-reward=6.0"]
 )

 py_test(
-    name = "examples/custom_keras_model_a2c", main="examples/custom_keras_model.py",
+    name = "examples/custom_env_tf",
+    main = "examples/custom_env.py",
+    tags = ["examples", "examples_C"],
+    size = "medium",
+    srcs = ["examples/custom_env.py"],
+    args = ["--as-test"]
+)
+
+py_test(
+    name = "examples/custom_env_torch",
+    main = "examples/custom_env.py",
+    tags = ["examples", "examples_C"],
+    size = "medium",
+    srcs = ["examples/custom_env.py"],
+    args = ["--as-test", "--torch"]
+)
+
+py_test(
+    name = "examples/custom_eval_tf",
+    main = "examples/custom_eval.py",
+    tags = ["examples", "examples_C"],
+    size = "small",
+    srcs = ["examples/custom_eval.py"],
+    args = ["--num-cpus=4"]
+)
+
+py_test(
+    name = "examples/custom_eval_torch",
+    main = "examples/custom_eval.py",
+    tags = ["examples", "examples_C"],
+    size = "small",
+    srcs = ["examples/custom_eval.py"],
+    args = ["--torch", "--num-cpus=4"]
+)
+
+py_test(
+    name = "examples/custom_fast_model_tf",
+    main = "examples/custom_fast_model.py",
+    tags = ["examples", "examples_C"],
+    size = "small",
+    srcs = ["examples/custom_fast_model.py"],
+    args = ["--stop-iters=1", "--num-cpus=4"]
+)
+
+py_test(
+    name = "examples/custom_fast_model_torch",
+    main = "examples/custom_fast_model.py",
+    tags = ["examples", "examples_C"],
+    size = "small",
+    srcs = ["examples/custom_fast_model.py"],
+    args = ["--torch", "--stop-iters=1", "--num-cpus=4"]
+)
+
+py_test(
+    name = "examples/custom_keras_model_a2c",
+    main = "examples/custom_keras_model.py",
    tags = ["examples", "examples_C"],
    size = "large",
    srcs = ["examples/custom_keras_model.py"],
@@ -1549,7 +1669,8 @@ py_test(
 )

 py_test(
-    name = "examples/custom_keras_model_dqn", main="examples/custom_keras_model.py",
+    name = "examples/custom_keras_model_dqn",
+    main = "examples/custom_keras_model.py",
    tags = ["examples", "examples_C"],
    size = "medium",
    srcs = ["examples/custom_keras_model.py"],
@@ -1557,7 +1678,8 @@ py_test(
 )

 py_test(
-    name = "examples/custom_keras_model_ppo", main="examples/custom_keras_model.py",
+    name = "examples/custom_keras_model_ppo",
+    main = "examples/custom_keras_model.py",
    tags = ["examples", "examples_C"],
    size = "medium",
    srcs = ["examples/custom_keras_model.py"],
@@ -1565,46 +1687,79 @@ py_test(
 )

 py_test(
-    name = "examples/custom_keras_rnn_model_repeat_after_me", main = "examples/custom_keras_rnn_model.py",
-    tags = ["examples", "examples_C"],
-    size = "large",
-    srcs = ["examples/custom_keras_rnn_model.py"],
-    args = ["--run=PPO", "--stop=50", "--env=RepeatAfterMeEnv", "--num-cpus=4"]
-)
-
-py_test(
-    name = "examples/custom_keras_rnn_model_repeat_initial",
-    main = "examples/custom_keras_rnn_model.py",
-    tags = ["examples", "examples_C"],
-    size = "large",
-    srcs = ["examples/custom_keras_rnn_model.py"],
-    args = ["--run=PPO", "--stop=50", "--env=RepeatInitialObsEnv", "--num-cpus=4"]
-)
-
-py_test(
-    name = "examples/custom_loss",
+    name = "examples/custom_loss_tf",
+    main = "examples/custom_loss.py",
    tags = ["examples", "examples_C"],
    size = "small",
    # Include the json data file.
    data = glob(["tests/data/cartpole_small/**"]),
    srcs = ["examples/custom_loss.py"],
-    args = ["--iters=2", "--input-files=tests/data/cartpole_small"]
+    args = ["--stop-iters=2", "--input-files=tests/data/cartpole_small"]
+)
+
+py_test(
+    name = "examples/custom_loss_torch",
+    main = "examples/custom_loss.py",
+    tags = ["examples", "examples_C"],
+    size = "small",
+    # Include the json data file.
+    data = glob(["tests/data/cartpole_small/**"]),
+    srcs = ["examples/custom_loss.py"],
+    args = ["--torch", "--stop-iters=2", "--input-files=tests/data/cartpole_small"]
 )

 py_test(
    name = "examples/custom_metrics_and_callbacks",
+    main = "examples/custom_metrics_and_callbacks.py",
    tags = ["examples", "examples_C"],
    size = "small",
    srcs = ["examples/custom_metrics_and_callbacks.py"],
-    args = ["--num-iters=2"]
+    args = ["--stop-iters=2"]
 )

 py_test(
    name = "examples/custom_metrics_and_callbacks_legacy",
+    main = "examples/custom_metrics_and_callbacks_legacy.py",
    tags = ["examples", "examples_C"],
    size = "small",
    srcs = ["examples/custom_metrics_and_callbacks_legacy.py"],
-    args = ["--num-iters=2"]
+    args = ["--stop-iters=2"]
+)
+
+py_test(
+    name = "examples/custom_rnn_model_repeat_after_me_tf",
+    main = "examples/custom_rnn_model.py",
+    tags = ["examples", "examples_C"],
+    size = "medium",
+    srcs = ["examples/custom_rnn_model.py"],
+    args = ["--as-test", "--run=PPO", "--stop-reward=40", "--env=RepeatAfterMeEnv", "--num-cpus=4"]
+)
+
+py_test(
+    name = "examples/custom_rnn_model_repeat_initial_obs_tf",
+    main = "examples/custom_rnn_model.py",
+    tags = ["examples", "examples_C"],
+    size = "medium",
+    srcs = ["examples/custom_rnn_model.py"],
+    args = ["--as-test", "--run=PPO", "--stop-reward=10", "--stop-timesteps=300000", "--env=RepeatInitialObsEnv", "--num-cpus=4"]
+)
+
+py_test(
+    name = "examples/custom_rnn_model_repeat_after_me_torch",
+    main = "examples/custom_rnn_model.py",
+    tags = ["examples", "examples_C"],
+    size = "medium",
+    srcs = ["examples/custom_rnn_model.py"],
+    args = ["--as-test", "--torch", "--run=PPO", "--stop-reward=40", "--env=RepeatAfterMeEnv", "--num-cpus=4"]
+)
+
+py_test(
+    name = "examples/custom_rnn_model_repeat_initial_obs_torch",
+    main = "examples/custom_rnn_model.py",
+    tags = ["examples", "examples_C"],
+    size = "medium",
+    srcs = ["examples/custom_rnn_model.py"],
+    args = ["--as-test", "--torch", "--run=PPO", "--stop-reward=10", "--stop-timesteps=300000", "--env=RepeatInitialObsEnv", "--num-cpus=4"]
 )

 py_test(
@@ -1612,16 +1767,7 @@ py_test(
    tags = ["examples", "examples_C"],
    size = "medium",
    srcs = ["examples/custom_tf_policy.py"],
-    args = ["--iters=2", "--num-cpus=4"]
-)
-
-py_test(
-    name = "examples/custom_torch_rnn_model",
-    main = "examples/custom_torch_rnn_model.py",
-    tags = ["examples", "examples_C"],
-    size = "medium",
-    srcs = ["examples/custom_torch_rnn_model.py"],
-    args = ["--run=PPO", "--stop=90", "--num-cpus=4"]
+    args = ["--stop-iters=2", "--num-cpus=4"]
 )

 py_test(
@@ -1629,7 +1775,7 @@ py_test(
    tags = ["examples", "examples_C"],
    size = "small",
    srcs = ["examples/custom_torch_policy.py"],
-    args = ["--iters=2", "--num-cpus=4"]
+    args = ["--stop-iters=2", "--num-cpus=4"]
 )

 py_test(
@@ -1637,90 +1783,151 @@ py_test(
    tags = ["examples", "examples_E"],
    size = "small",
    srcs = ["examples/eager_execution.py"],
-    args = ["--iters=2"]
+    args = ["--stop-iters=2"]
 )

 py_test(
    name = "examples/hierarchical_training_tf",
+    main = "examples/hierarchical_training.py",
    tags = ["examples", "examples_H"],
-    size = "small",
+    size = "medium",
    srcs = ["examples/hierarchical_training.py"],
    args = ["--stop-reward=0.0"]
 )

 py_test(
    name = "examples/hierarchical_training_torch",
+    main = "examples/hierarchical_training.py",
    tags = ["examples", "examples_H"],
-    size = "small",
+    size = "medium",
    srcs = ["examples/hierarchical_training.py"],
    args = ["--torch", "--stop-reward=0.0"]
 )

+# Do not run this test (MobileNetV2 is gigantic and takes forever for 1 iter).
+# py_test(
+#     name = "examples/mobilenet_v2_with_lstm_tf",
+#     main = "examples/mobilenet_v2_with_lstm.py",
+#     tags = ["examples", "examples_M"],
+#     size = "small",
+#     srcs = ["examples/mobilenet_v2_with_lstm.py"]
+# )
+
 py_test(
-    name = "examples/multi_agent_cartpole",
+    name = "examples/multi_agent_cartpole_tf",
+    main = "examples/multi_agent_cartpole.py",
    tags = ["examples", "examples_M"],
    size = "medium",
    srcs = ["examples/multi_agent_cartpole.py"],
-    args = ["--num-iters=2", "--num-cpus=4"]
+    args = ["--as-test", "--stop-reward=70.0", "--num-cpus=4"]
 )

 py_test(
-    name = "examples/multi_agent_custom_policy",
+    name = "examples/multi_agent_cartpole_torch",
+    main = "examples/multi_agent_cartpole.py",
    tags = ["examples", "examples_M"],
-    size = "medium",
-    srcs = ["examples/multi_agent_custom_policy.py"],
+    size = "small",
+    srcs = ["examples/multi_agent_cartpole.py"],
+    args = ["--as-test", "--torch", "--stop-reward=70.0", "--num-cpus=4"]
 )

 py_test(
-    name = "examples/multi_agent_two_trainers",
+    name = "examples/multi_agent_custom_policy_tf",
+    main = "examples/multi_agent_custom_policy.py",
+    tags = ["examples", "examples_M"],
+    size = "small",
+    srcs = ["examples/multi_agent_custom_policy.py"],
+    args = ["--as-test", "--stop-reward=80"]
+)
+
+py_test(
+    name = "examples/multi_agent_custom_policy_torch",
+    main = "examples/multi_agent_custom_policy.py",
+    tags = ["examples", "examples_M"],
+    size = "small",
+    srcs = ["examples/multi_agent_custom_policy.py"],
+    args = ["--as-test", "--torch", "--stop-reward=80"]
+)
+
+py_test(
+    name = "examples/multi_agent_two_trainers_tf",
+    main = "examples/multi_agent_two_trainers.py",
    tags = ["examples", "examples_M"],
    size = "medium",
    srcs = ["examples/multi_agent_two_trainers.py"],
-    args = ["--num-iters=2"]
+    args = ["--as-test", "--stop-reward=70"]
 )

 py_test(
-    name = "examples/two_trainer_workflow",
-    tags = ["examples", "examples_T"],
+    name = "examples/multi_agent_two_trainers_torch",
+    main = "examples/multi_agent_two_trainers.py",
+    tags = ["examples", "examples_M"],
    size = "medium",
-    srcs = ["examples/two_trainer_workflow.py"],
-    args = ["--num-iters=2"]
+    srcs = ["examples/multi_agent_two_trainers.py"],
+    args = ["--as-test", "--torch", "--stop-reward=70"]
 )

 py_test(
-    name = "examples/nested_action_spaces_ppo",
+    name = "examples/multi_agent_two_trainers_mixed_torch_tf",
+    main = "examples/multi_agent_two_trainers.py",
+    tags = ["examples", "examples_M"],
+    size = "small",
+    srcs = ["examples/multi_agent_two_trainers.py"],
+    args = ["--as-test", "--mixed-torch-tf", "--stop-reward=70"]
+)
+
+py_test(
+    name = "examples/nested_action_spaces_ppo_tf",
    main = "examples/nested_action_spaces.py",
    tags = ["examples", "examples_N"],
    size = "medium",
    srcs = ["examples/nested_action_spaces.py"],
-    args = ["--stop=-500", "--run=PPO"]
+    args = ["--as-test", "--stop-reward=-600", "--run=PPO"]
 )

 py_test(
-    name = "examples/parametric_actions_cartpole_pg",
+    name = "examples/nested_action_spaces_ppo_torch",
+    main = "examples/nested_action_spaces.py",
+    tags = ["examples", "examples_N"],
+    size = "medium",
+    srcs = ["examples/nested_action_spaces.py"],
+    args = ["--as-test", "--torch", "--stop-reward=-600", "--run=PPO"]
+)
+
+py_test(
+    name = "examples/parametric_actions_cartpole_pg_tf",
    main = "examples/parametric_actions_cartpole.py",
    tags = ["examples", "examples_P"],
    size = "medium",
    srcs = ["examples/parametric_actions_cartpole.py"],
-    args = ["--run=PG", "--stop=50"]
+    args = ["--as-test", "--stop-reward=60.0", "--run=PG"]
 )

 py_test(
-    name = "examples/parametric_actions_cartpole_ppo",
+    name = "examples/parametric_actions_cartpole_dqn_tf",
    main = "examples/parametric_actions_cartpole.py",
    tags = ["examples", "examples_P"],
    size = "medium",
    srcs = ["examples/parametric_actions_cartpole.py"],
-    args = ["--run=PPO", "--stop=50"]
+    args = ["--as-test", "--stop-reward=60.0", "--run=DQN"]
 )

 py_test(
-    name = "examples/parametric_actions_cartpole_dqn",
+    name = "examples/parametric_actions_cartpole_pg_torch",
+    main = "examples/parametric_actions_cartpole.py",
+    tags = ["examples", "examples_P"],
+    size = "small",
+    srcs = ["examples/parametric_actions_cartpole.py"],
+    args = ["--as-test", "--torch", "--stop-reward=60.0", "--run=PG"]
+)
+
+py_test(
+    name = "examples/parametric_actions_cartpole_dqn_torch",
    main = "examples/parametric_actions_cartpole.py",
    tags = ["examples", "examples_P"],
    size = "medium",
    srcs = ["examples/parametric_actions_cartpole.py"],
-    args = ["--run=DQN", "--stop=50"]
+    args = ["--as-test", "--torch", "--stop-reward=60.0", "--run=DQN"]
 )

 py_test(
@@ -1731,9 +1938,27 @@ py_test(
    args = ["--num-cpus=4"]
 )

+py_test(
+    name = "examples/rock_paper_scissors_multiagent_tf",
+    main = "examples/rock_paper_scissors_multiagent.py",
+    tags = ["examples", "examples_R"],
+    size = "medium",
+    srcs = ["examples/rock_paper_scissors_multiagent.py"],
+    args = ["--as-test"],
+)
+
+py_test(
+    name = "examples/rock_paper_scissors_multiagent_torch",
+    main = "examples/rock_paper_scissors_multiagent.py",
+    tags = ["examples", "examples_R"],
+    size = "medium",
+    srcs = ["examples/rock_paper_scissors_multiagent.py"],
+    args = ["--as-test", "--torch"],
+)
+
 sh_test(
    name = "examples/serving/test_local_inference",
-    tags = ["examples", "examples_L", "exclusive"],
+    tags = ["examples", "examples_S", "exclusive"],
    size = "medium",
    srcs = ["examples/serving/test_local_inference.sh"],
    data = glob(["examples/serving/*.py"]),
@@ -1741,27 +1966,82 @@ sh_test(

 sh_test(
    name = "examples/serving/test_remote_inference",
-    tags = ["examples", "examples_R", "exclusive"],
+    tags = ["examples", "examples_S", "exclusive"],
    size = "medium",
    srcs = ["examples/serving/test_remote_inference.sh"],
    data = glob(["examples/serving/*.py"]),
 )

 py_test(
-    name = "examples/rock_paper_scissors_multiagent",
-    main = "examples/rock_paper_scissors_multiagent.py",
-    tags = ["examples", "examples_R"],
-    size = "large",
-    srcs = ["examples/rock_paper_scissors_multiagent.py"],
-    args = ["--stop=200"],
+    name = "examples/two_trainer_workflow_tf",
+    main = "examples/two_trainer_workflow.py",
+    tags = ["examples", "examples_T"],
+    size = "small",
+    srcs = ["examples/two_trainer_workflow.py"],
+    args = ["--as-test", "--stop-reward=100.0"]
 )

 py_test(
-    name = "examples/twostep_game_maddpg", main = "examples/twostep_game.py",
+    name = "examples/two_trainer_workflow_torch",
+    main = "examples/two_trainer_workflow.py",
+    tags = ["examples", "examples_T"],
+    size = "small",
+    srcs = ["examples/two_trainer_workflow.py"],
+    args = ["--as-test", "--torch", "--stop-reward=100.0"]
+)
+
+py_test(
+    name = "examples/two_trainer_workflow_mixed_torch_tf",
+    main = "examples/two_trainer_workflow.py",
+    tags = ["examples", "examples_T"],
+    size = "small",
+    srcs = ["examples/two_trainer_workflow.py"],
+    args = ["--as-test", "--mixed-torch-tf", "--stop-reward=100.0"]
+)
+
+py_test(
+    name = "examples/twostep_game_maddpg",
+    main = "examples/twostep_game.py",
    tags = ["examples", "examples_T"],
    size = "large",
    srcs = ["examples/twostep_game.py"],
-    args = ["--stop=2000", "--run=contrib/MADDPG"]
+    args = ["--stop-timesteps=2000", "--run=contrib/MADDPG"]
+)
+
+py_test(
+    name = "examples/twostep_game_pg_tf",
+    main = "examples/twostep_game.py",
+    tags = ["examples", "examples_T"],
+    size = "medium",
+    srcs = ["examples/twostep_game.py"],
+    args = ["--as-test", "--stop-reward=7", "--run=PG"]
+)
+
+py_test(
+    name = "examples/twostep_game_pg_torch",
+    main = "examples/twostep_game.py",
+    tags = ["examples", "examples_T"],
+    size = "medium",
+    srcs = ["examples/twostep_game.py"],
+    args = ["--as-test", "--torch", "--stop-reward=7", "--run=PG"]
+)
+
+py_test(
+    name = "examples/twostep_game_qmix",
+    main = "examples/twostep_game.py",
+    tags = ["examples", "examples_T"],
+    size = "medium",
+    srcs = ["examples/twostep_game.py"],
+    args = ["--stop-timesteps=2000", "--run=QMIX"]
+)
+
+py_test(
+    name = "examples/twostep_game_apex_qmix",
+    main = "examples/twostep_game.py",
+    tags = ["examples", "examples_T"],
+    size = "medium",
+    srcs = ["examples/twostep_game.py"],
+    args = ["--stop-timesteps=2000", "--run=APEX_QMIX", "--num-cpus=4"]
 )

 py_test(
@@ -1770,7 +2050,7 @@ py_test(
    tags = ["examples", "examples_T"],
    size = "small",
    srcs = ["contrib/bandits/examples/simple_context_bandit.py"],
-    args = ["--stop-at-reward=10", "--run=contrib/LinTS"],
+    args = ["--as-test", "--stop-reward=10", "--run=contrib/LinTS"],
 )

 py_test(
@@ -1779,29 +2059,5 @@ py_test(
    tags = ["examples", "examples_U"],
    size = "small",
    srcs = ["contrib/bandits/examples/simple_context_bandit.py"],
-    args = ["--stop-at-reward=10", "--run=contrib/LinUCB"],
-)
-
-py_test(
-    name = "examples/twostep_game_pg", main = "examples/twostep_game.py",
-    tags = ["examples", "examples_T"],
-    size = "medium",
-    srcs = ["examples/twostep_game.py"],
-    args = ["--stop=2000", "--run=PG"]
-)
-
-py_test(
-    name = "examples/twostep_game_qmix", main = "examples/twostep_game.py",
-    tags = ["examples", "examples_T"],
-    size = "medium",
-    srcs = ["examples/twostep_game.py"],
-    args = ["--stop=2000", "--run=QMIX"]
-)
-
-py_test(
-    name = "examples/twostep_game_apex_qmix", main = "examples/twostep_game.py",
-    tags = ["examples", "examples_T"],
-    size = "medium",
-    srcs = ["examples/twostep_game.py"],
-    args = ["--stop=2000", "--run=APEX_QMIX", "--num-cpus=4"]
+    args = ["--as-test", "--stop-reward=10", "--run=contrib/LinUCB"],
 )
@@ -59,8 +59,13 @@ def apply_grad_clipping(policy, optimizer, loss):
    info = {}
    if policy.config["grad_clip"]:
        for param_group in optimizer.param_groups:
-            info["grad_gnorm"] = nn.utils.clip_grad_norm_(
-                param_group["params"], policy.config["grad_clip"])
+            # Make sure we only pass params with grad != None into torch
+            # clip_grad_norm_. Would fail otherwise.
+            params = list(
+                filter(lambda p: p.grad is not None, param_group["params"]))
+            if params:
+                info["grad_gnorm"] = nn.utils.clip_grad_norm_(
+                    params, policy.config["grad_clip"])
    return info


@@ -45,9 +45,9 @@ class DDPGTorchModel(TorchModelV2, nn.Module):
        only defines the layers for the output heads. Those layers for
        forward() should be defined in subclasses of DDPGTorchModel.
        """
-        TorchModelV2.__init__(self, obs_space, action_space, num_outputs,
-                              model_config, name)
        nn.Module.__init__(self)
+        super(DDPGTorchModel, self).__init__(obs_space, action_space,
+                                             num_outputs, model_config, name)

        self.bounded = np.logical_and(action_space.bounded_above,
                                      action_space.bounded_below).any()
@@ -58,7 +58,7 @@ class DDPGTorchModel(TorchModelV2, nn.Module):

        # Build the policy network.
        self.policy_model = nn.Sequential()
-        ins = obs_space.shape[-1]
+        ins = num_outputs
        self.obs_ins = ins
        activation = get_activation_fn(
            actor_hidden_activation, framework="torch")
@@ -1,16 +1,20 @@
 """A very simple contextual bandit example with 3 arms."""

 import argparse
-import random
-import numpy as np
 import gym
 from gym.spaces import Discrete, Box
+import numpy as np
+import random

 from ray import tune
+from ray.rllib.utils.test_utils import check_learning_achieved

 parser = argparse.ArgumentParser()
-parser.add_argument("--stop-at-reward", type=float, default=10)
 parser.add_argument("--run", type=str, default="contrib/LinUCB")
+parser.add_argument("--as-test", action="store_true")
+parser.add_argument("--stop-iters", type=int, default=200)
+parser.add_argument("--stop-timesteps", type=int, default=100000)
+parser.add_argument("--stop-reward", type=float, default=10.0)


 class SimpleContextualBandit(gym.Env):
@@ -37,11 +41,18 @@ class SimpleContextualBandit(gym.Env):

 if __name__ == "__main__":
    args = parser.parse_args()
-    tune.run(
-        args.run,
-        stop={
-            "episode_reward_mean": args.stop_at_reward,
-        },
-        config={
-            "env": SimpleContextualBandit,
-        })
+
+    stop = {
+        "training_iteration": args.stop_iters,
+        "timesteps_total": args.stop_timesteps,
+        "episode_reward_mean": args.stop_reward,
+    }
+
+    config = {
+        "env": SimpleContextualBandit,
+    }
+
+    results = tune.run(args.run, config=config, stop=stop)
+
+    if args.as_test:
+        check_learning_achieved(results, args.stop_reward)
@@ -10,187 +10,56 @@ pattern, and a custom action distribution class that leverages that model.
 This examples shows both.
 """

-from gym.spaces import Discrete, Tuple
 import argparse

 import ray
 from ray import tune
 from ray.rllib.examples.env.correlated_actions_env import CorrelatedActionsEnv
+from ray.rllib.examples.models.autoregressive_action_model import \
+    AutoregressiveActionModel, TorchAutoregressiveActionModel
+from ray.rllib.examples.models.autoregressive_action_dist import \
+    BinaryAutoregressiveDistribution, TorchBinaryAutoregressiveDistribution
 from ray.rllib.models import ModelCatalog
-from ray.rllib.models.tf.tf_action_dist import Categorical, ActionDistribution
-from ray.rllib.models.tf.misc import normc_initializer
-from ray.rllib.models.tf.tf_modelv2 import TFModelV2
-from ray.rllib.utils.framework import try_import_tf
-
-tf = try_import_tf()
+from ray.rllib.utils.test_utils import check_learning_achieved

 parser = argparse.ArgumentParser()
 parser.add_argument("--run", type=str, default="PPO")  # try PG, PPO, IMPALA
-parser.add_argument("--stop", type=int, default=200)
+parser.add_argument("--torch", action="store_true")
 parser.add_argument("--num-cpus", type=int, default=0)
-
-
-class BinaryAutoregressiveOutput(ActionDistribution):
-    """Action distribution P(a1, a2) = P(a1) * P(a2 | a1)"""
-
-    @staticmethod
-    def required_model_output_shape(self, model_config):
-        return 16  # controls model output feature vector size
-
-    def deterministic_sample(self):
-        # first, sample a1
-        a1_dist = self._a1_distribution()
-        a1 = a1_dist.deterministic_sample()
-
-        # sample a2 conditioned on a1
-        a2_dist = self._a2_distribution(a1)
-        a2 = a2_dist.deterministic_sample()
-        self._action_logp = a1_dist.logp(a1) + a2_dist.logp(a2)
-
-        # return the action tuple
-        return (a1, a2)
-
-    def sample(self):
-        # first, sample a1
-        a1_dist = self._a1_distribution()
-        a1 = a1_dist.sample()
-
-        # sample a2 conditioned on a1
-        a2_dist = self._a2_distribution(a1)
-        a2 = a2_dist.sample()
-        self._action_logp = a1_dist.logp(a1) + a2_dist.logp(a2)
-
-        # return the action tuple
-        return (a1, a2)
-
-    def logp(self, actions):
-        a1, a2 = actions[:, 0], actions[:, 1]
-        a1_vec = tf.expand_dims(tf.cast(a1, tf.float32), 1)
-        a1_logits, a2_logits = self.model.action_model([self.inputs, a1_vec])
-        return (
-            Categorical(a1_logits).logp(a1) + Categorical(a2_logits).logp(a2))
-
-    def sampled_action_logp(self):
-        return tf.exp(self._action_logp)
-
-    def entropy(self):
-        a1_dist = self._a1_distribution()
-        a2_dist = self._a2_distribution(a1_dist.sample())
-        return a1_dist.entropy() + a2_dist.entropy()
-
-    def kl(self, other):
-        a1_dist = self._a1_distribution()
-        a1_terms = a1_dist.kl(other._a1_distribution())
-
-        a1 = a1_dist.sample()
-        a2_terms = self._a2_distribution(a1).kl(other._a2_distribution(a1))
-        return a1_terms + a2_terms
-
-    def _a1_distribution(self):
-        BATCH = tf.shape(self.inputs)[0]
-        a1_logits, _ = self.model.action_model(
-            [self.inputs, tf.zeros((BATCH, 1))])
-        a1_dist = Categorical(a1_logits)
-        return a1_dist
-
-    def _a2_distribution(self, a1):
-        a1_vec = tf.expand_dims(tf.cast(a1, tf.float32), 1)
-        _, a2_logits = self.model.action_model([self.inputs, a1_vec])
-        a2_dist = Categorical(a2_logits)
-        return a2_dist
-
-
-class AutoregressiveActionsModel(TFModelV2):
-    """Implements the `.action_model` branch required above."""
-
-    def __init__(self, obs_space, action_space, num_outputs, model_config,
-                 name):
-        super(AutoregressiveActionsModel, self).__init__(
-            obs_space, action_space, num_outputs, model_config, name)
-        if action_space != Tuple([Discrete(2), Discrete(2)]):
-            raise ValueError(
-                "This model only supports the [2, 2] action space")
-
-        # Inputs
-        obs_input = tf.keras.layers.Input(
-            shape=obs_space.shape, name="obs_input")
-        a1_input = tf.keras.layers.Input(shape=(1, ), name="a1_input")
-        ctx_input = tf.keras.layers.Input(
-            shape=(num_outputs, ), name="ctx_input")
-
-        # Output of the model (normally 'logits', but for an autoregressive
-        # dist this is more like a context/feature layer encoding the obs)
-        context = tf.keras.layers.Dense(
-            num_outputs,
-            name="hidden",
-            activation=tf.nn.tanh,
-            kernel_initializer=normc_initializer(1.0))(obs_input)
-
-        # V(s)
-        value_out = tf.keras.layers.Dense(
-            1,
-            name="value_out",
-            activation=None,
-            kernel_initializer=normc_initializer(0.01))(context)
-
-        # P(a1 | obs)
-        a1_logits = tf.keras.layers.Dense(
-            2,
-            name="a1_logits",
-            activation=None,
-            kernel_initializer=normc_initializer(0.01))(ctx_input)
-
-        # P(a2 | a1)
-        # --note: typically you'd want to implement P(a2 | a1, obs) as follows:
-        # a2_context = tf.keras.layers.Concatenate(axis=1)(
-        #     [ctx_input, a1_input])
-        a2_context = a1_input
-        a2_hidden = tf.keras.layers.Dense(
-            16,
-            name="a2_hidden",
-            activation=tf.nn.tanh,
-            kernel_initializer=normc_initializer(1.0))(a2_context)
-        a2_logits = tf.keras.layers.Dense(
-            2,
-            name="a2_logits",
-            activation=None,
-            kernel_initializer=normc_initializer(0.01))(a2_hidden)
-
-        # Base layers
-        self.base_model = tf.keras.Model(obs_input, [context, value_out])
-        self.register_variables(self.base_model.variables)
-        self.base_model.summary()
-
-        # Autoregressive action sampler
-        self.action_model = tf.keras.Model([ctx_input, a1_input],
-                                           [a1_logits, a2_logits])
-        self.action_model.summary()
-        self.register_variables(self.action_model.variables)
-
-    def forward(self, input_dict, state, seq_lens):
-        context, self._value_out = self.base_model(input_dict["obs"])
-        return context, state
-
-    def value_function(self):
-        return tf.reshape(self._value_out, [-1])
-
+parser.add_argument("--as-test", action="store_true")
+parser.add_argument("--stop-iters", type=int, default=200)
+parser.add_argument("--stop-timesteps", type=int, default=100000)
+parser.add_argument("--stop-reward", type=float, default=200)

 if __name__ == "__main__":
    args = parser.parse_args()
    ray.init(num_cpus=args.num_cpus or None)
-    ModelCatalog.register_custom_model("autoregressive_model",
-                                       AutoregressiveActionsModel)
-    ModelCatalog.register_custom_action_dist("binary_autoreg_output",
-                                             BinaryAutoregressiveOutput)
-    tune.run(
-        args.run,
-        stop={"episode_reward_mean": args.stop},
-        config={
-            "env": CorrelatedActionsEnv,
-            "gamma": 0.5,
-            "num_gpus": 0,
-            "model": {
-                "custom_model": "autoregressive_model",
-                "custom_action_dist": "binary_autoreg_output",
-            },
-        })
+    ModelCatalog.register_custom_model(
+        "autoregressive_model", TorchAutoregressiveActionModel
+        if args.torch else AutoregressiveActionModel)
+    ModelCatalog.register_custom_action_dist(
+        "binary_autoreg_dist", TorchBinaryAutoregressiveDistribution
+        if args.torch else BinaryAutoregressiveDistribution)
+
+    config = {
+        "env": CorrelatedActionsEnv,
+        "gamma": 0.5,
+        "num_gpus": 0,
+        "model": {
+            "custom_model": "autoregressive_model",
+            "custom_action_dist": "binary_autoreg_dist",
+        },
+        "use_pytorch": args.torch,
+    }
+
+    stop = {
+        "training_iteration": args.stop_iters,
+        "timesteps_total": args.stop_timesteps,
+        "episode_reward_mean": args.stop_reward,
+    }
+
+    results = tune.run(args.run, stop=stop, config=config)
+
+    if args.as_test:
+        check_learning_achieved(results, args.stop_reward)
+    ray.shutdown()
@@ -4,148 +4,28 @@ import argparse

 import ray
 from ray import tune
+from ray.rllib.examples.models.batch_norm_model import BatchNormModel, \
+    TorchBatchNormModel
 from ray.rllib.models import ModelCatalog
-from ray.rllib.models.modelv2 import ModelV2
-from ray.rllib.models.tf.misc import normc_initializer
-from ray.rllib.models.tf.tf_modelv2 import TFModelV2
 from ray.rllib.utils import try_import_tf
-from ray.rllib.utils.annotations import override
+from ray.rllib.utils.test_utils import check_learning_achieved

 tf = try_import_tf()

 parser = argparse.ArgumentParser()
-parser.add_argument("--num-iters", type=int, default=200)
 parser.add_argument("--run", type=str, default="PPO")
-
-
-class BatchNormModel(TFModelV2):
-    """Example of a TFModelV2 that is built w/o using tf.keras.
-
-    NOTE: This example does not work when using a keras-based TFModelV2 due
-    to a bug in keras related to missing values for input placeholders, even
-    though these input values have been provided in a forward pass through the
-    actual keras Model.
-
-    All Model logic (layers) is defined in the `forward` method (incl.
-    the batch_normalization layers). Also, all variables are registered
-    (only once) at the end of `forward`, so an optimizer knows which tensors
-    to train on. A standard `value_function` override is used.
-    """
-    capture_index = 0
-
-    def __init__(self, obs_space, action_space, num_outputs, model_config,
-                 name):
-        super().__init__(obs_space, action_space, num_outputs, model_config,
-                         name)
-        # Have we registered our vars yet (see `forward`)?
-        self._registered = False
-
-    @override(ModelV2)
-    def forward(self, input_dict, state, seq_lens):
-        last_layer = input_dict["obs"]
-        hiddens = [256, 256]
-        with tf.variable_scope("model", reuse=tf.AUTO_REUSE):
-            for i, size in enumerate(hiddens):
-                last_layer = tf.layers.dense(
-                    last_layer,
-                    size,
-                    kernel_initializer=normc_initializer(1.0),
-                    activation=tf.nn.tanh,
-                    name="fc{}".format(i))
-                # Add a batch norm layer
-                last_layer = tf.layers.batch_normalization(
-                    last_layer,
-                    training=input_dict["is_training"],
-                    name="bn_{}".format(i))
-
-            output = tf.layers.dense(
-                last_layer,
-                self.num_outputs,
-                kernel_initializer=normc_initializer(0.01),
-                activation=None,
-                name="out")
-            self._value_out = tf.layers.dense(
-                last_layer,
-                1,
-                kernel_initializer=normc_initializer(1.0),
-                activation=None,
-                name="vf")
-        if not self._registered:
-            self.register_variables(
-                tf.get_collection(
-                    tf.GraphKeys.TRAINABLE_VARIABLES, scope=".+/model/.+"))
-            self._registered = True
-
-        return output, []
-
-    @override(ModelV2)
-    def value_function(self):
-        return tf.reshape(self._value_out, [-1])
-
-
-class KerasBatchNormModel(TFModelV2):
-    """Keras version of above BatchNormModel with exactly the same structure.
-
-    IMORTANT NOTE: This model will not work with PPO due to a bug in keras
-    that surfaces when having more than one input placeholder (here: `inputs`
-    and `is_training`) AND using the `make_tf_callable` helper (e.g. used by
-    PPO), in which auto-placeholders are generated, then passed through the
-    tf.keras. models.Model. In this last step, the connection between 1) the
-    provided value in the auto-placeholder and 2) the keras `is_training`
-    Input is broken and keras complains.
-    Use the above `BatchNormModel` (a non-keras based TFModelV2), instead.
-    """
-
-    def __init__(self, obs_space, action_space, num_outputs, model_config,
-                 name):
-        super().__init__(obs_space, action_space, num_outputs, model_config,
-                         name)
-        inputs = tf.keras.layers.Input(shape=obs_space.shape, name="inputs")
-        is_training = tf.keras.layers.Input(
-            shape=(), dtype=tf.bool, batch_size=1, name="is_training")
-        last_layer = inputs
-        hiddens = [256, 256]
-        for i, size in enumerate(hiddens):
-            label = "fc{}".format(i)
-            last_layer = tf.keras.layers.Dense(
-                units=size,
-                kernel_initializer=normc_initializer(1.0),
-                activation=tf.nn.tanh,
-                name=label)(last_layer)
-            # Add a batch norm layer
-            last_layer = tf.keras.layers.BatchNormalization()(
-                last_layer, training=is_training[0])
-        output = tf.keras.layers.Dense(
-            units=self.num_outputs,
-            kernel_initializer=normc_initializer(0.01),
-            activation=None,
-            name="fc_out")(last_layer)
-        value_out = tf.keras.layers.Dense(
-            units=1,
-            kernel_initializer=normc_initializer(0.01),
-            activation=None,
-            name="value_out")(last_layer)
-
-        self.base_model = tf.keras.models.Model(
-            inputs=[inputs, is_training], outputs=[output, value_out])
-        self.register_variables(self.base_model.variables)
-
-    @override(ModelV2)
-    def forward(self, input_dict, state, seq_lens):
-        out, self._value_out = self.base_model(
-            [input_dict["obs"], input_dict["is_training"]])
-        return out, []
-
-    @override(ModelV2)
-    def value_function(self):
-        return tf.reshape(self._value_out, [-1])
-
+parser.add_argument("--as-test", action="store_true")
+parser.add_argument("--torch", action="store_true")
+parser.add_argument("--stop-iters", type=int, default=200)
+parser.add_argument("--stop-timesteps", type=int, default=100000)
+parser.add_argument("--stop-reward", type=float, default=150)

 if __name__ == "__main__":
    args = parser.parse_args()
-    ray.init()
+    ray.init(local_mode=True)

-    ModelCatalog.register_custom_model("bn_model", BatchNormModel)
+    ModelCatalog.register_custom_model(
+        "bn_model", TorchBatchNormModel if args.torch else BatchNormModel)

    config = {
        "env": "Pendulum-v0" if args.run == "DDPG" else "CartPole-v0",
@@ -153,10 +33,18 @@ if __name__ == "__main__":
            "custom_model": "bn_model",
        },
        "num_workers": 0,
+        "use_pytorch": args.torch,
    }

-    tune.run(
-        args.run,
-        stop={"training_iteration": args.num_iters},
-        config=config,
-    )
+    stop = {
+        "training_iteration": args.stop_iters,
+        "timesteps_total": args.stop_timesteps,
+        "episode_reward_mean": args.stop_reward,
+    }
+
+    results = tune.run(args.run, stop=stop, config=config)
+
+    if args.as_test:
+        check_learning_achieved(results, args.stop_reward)
+
+    ray.shutdown()
@@ -1,13 +1,17 @@
 import argparse

 from ray.rllib.examples.env.stateless_cartpole import StatelessCartPole
+from ray.rllib.utils.test_utils import check_learning_achieved

 parser = argparse.ArgumentParser()
-parser.add_argument("--stop", type=int, default=200)
-parser.add_argument("--torch", action="store_true")
-parser.add_argument("--use-prev-action-reward", action="store_true")
 parser.add_argument("--run", type=str, default="PPO")
 parser.add_argument("--num-cpus", type=int, default=0)
+parser.add_argument("--torch", action="store_true")
+parser.add_argument("--as-test", action="store_true")
+parser.add_argument("--use-prev-action-reward", action="store_true")
+parser.add_argument("--stop-iters", type=int, default=200)
+parser.add_argument("--stop-timesteps", type=int, default=100000)
+parser.add_argument("--stop-reward", type=float, default=150.0)

 if __name__ == "__main__":
    import ray
@@ -30,16 +34,24 @@ if __name__ == "__main__":
        },
    }

-    tune.run(
-        args.run,
-        stop={"episode_reward_mean": args.stop},
-        config=dict(
-            configs[args.run], **{
-                "env": StatelessCartPole,
-                "model": {
-                    "use_lstm": True,
-                    "lstm_use_prev_action_reward": args.use_prev_action_reward,
-                },
-                "use_pytorch": args.torch,
-            }),
-    )
+    config = dict(
+        configs[args.run], **{
+            "env": StatelessCartPole,
+            "model": {
+                "use_lstm": True,
+                "lstm_use_prev_action_reward": args.use_prev_action_reward,
+            },
+            "use_pytorch": args.torch,
+        })
+
+    stop = {
+        "training_iteration": args.stop_iters,
+        "timesteps_total": args.stop_timesteps,
+        "episode_reward_mean": args.stop_reward,
+    }
+
+    results = tune.run(args.run, config=config, stop=stop)
+
+    if args.as_test:
+        check_learning_achieved(results, args.stop_reward)
+    ray.shutdown()
@@ -16,77 +16,53 @@ import argparse
 import numpy as np
 from gym.spaces import Discrete

+import ray
 from ray import tune
 from ray.rllib.agents.ppo.ppo import PPOTrainer
 from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy, KLCoeffMixin, \
-    PPOLoss
+    PPOLoss as TFLoss
+from ray.rllib.agents.ppo.ppo_torch_policy import PPOTorchPolicy, \
+    KLCoeffMixin as TorchKLCoeffMixin, PPOLoss as TorchLoss
 from ray.rllib.evaluation.postprocessing import compute_advantages, \
    Postprocessing
-from ray.rllib.examples.twostep_game import TwoStepGame
+from ray.rllib.examples.env.two_step_game import TwoStepGame
+from ray.rllib.examples.models.centralized_critic_models import \
+    CentralizedCriticModel, TorchCentralizedCriticModel
 from ray.rllib.models import ModelCatalog
 from ray.rllib.policy.sample_batch import SampleBatch
 from ray.rllib.policy.tf_policy import LearningRateSchedule, \
    EntropyCoeffSchedule
-from ray.rllib.models.tf.tf_modelv2 import TFModelV2
-from ray.rllib.models.tf.fcnet_v2 import FullyConnectedNetwork
+from ray.rllib.policy.torch_policy import LearningRateSchedule as TorchLR, \
+    EntropyCoeffSchedule as TorchEntropyCoeffSchedule
 from ray.rllib.utils.explained_variance import explained_variance
+from ray.rllib.utils.framework import try_import_tf, try_import_torch
+from ray.rllib.utils.test_utils import check_learning_achieved
 from ray.rllib.utils.tf_ops import make_tf_callable
-from ray.rllib.utils import try_import_tf
+from ray.rllib.utils.torch_ops import convert_to_torch_tensor

 tf = try_import_tf()
+torch, nn = try_import_torch()

 OPPONENT_OBS = "opponent_obs"
 OPPONENT_ACTION = "opponent_action"

 parser = argparse.ArgumentParser()
-parser.add_argument("--stop", type=int, default=100000)
-
-
-class CentralizedCriticModel(TFModelV2):
-    """Multi-agent model that implements a centralized VF."""
-
-    def __init__(self, obs_space, action_space, num_outputs, model_config,
-                 name):
-        super(CentralizedCriticModel, self).__init__(
-            obs_space, action_space, num_outputs, model_config, name)
-        # Base of the model
-        self.model = FullyConnectedNetwork(obs_space, action_space,
-                                           num_outputs, model_config, name)
-        self.register_variables(self.model.variables())
-
-        # Central VF maps (obs, opp_obs, opp_act) -> vf_pred
-        obs = tf.keras.layers.Input(shape=(6, ), name="obs")
-        opp_obs = tf.keras.layers.Input(shape=(6, ), name="opp_obs")
-        opp_act = tf.keras.layers.Input(shape=(2, ), name="opp_act")
-        concat_obs = tf.keras.layers.Concatenate(axis=1)(
-            [obs, opp_obs, opp_act])
-        central_vf_dense = tf.keras.layers.Dense(
-            16, activation=tf.nn.tanh, name="c_vf_dense")(concat_obs)
-        central_vf_out = tf.keras.layers.Dense(
-            1, activation=None, name="c_vf_out")(central_vf_dense)
-        self.central_vf = tf.keras.Model(
-            inputs=[obs, opp_obs, opp_act], outputs=central_vf_out)
-        self.register_variables(self.central_vf.variables)
-
-    def forward(self, input_dict, state, seq_lens):
-        return self.model.forward(input_dict, state, seq_lens)
-
-    def central_value_function(self, obs, opponent_obs, opponent_actions):
-        return tf.reshape(
-            self.central_vf(
-                [obs, opponent_obs,
-                 tf.one_hot(opponent_actions, 2)]), [-1])
-
-    def value_function(self):
-        return self.model.value_function()  # not used
+parser.add_argument("--torch", action="store_true")
+parser.add_argument("--as-test", action="store_true")
+parser.add_argument("--stop-iters", type=int, default=100)
+parser.add_argument("--stop-timesteps", type=int, default=100000)
+parser.add_argument("--stop-reward", type=float, default=7.99)


 class CentralizedValueMixin:
    """Add method to evaluate the central value function from the model."""

    def __init__(self):
-        self.compute_central_vf = make_tf_callable(self.get_session())(
-            self.model.central_value_function)
+        if not self.config["use_pytorch"]:
+            self.compute_central_vf = make_tf_callable(self.get_session())(
+                self.model.central_value_function)
+        else:
+            self.compute_central_vf = self.model.central_value_function


 # Grabs the opponent obs/act and includes it in the experience train_batch,
@@ -95,7 +71,9 @@ def centralized_critic_postprocessing(policy,
                                      sample_batch,
                                      other_agent_batches=None,
                                      episode=None):
-    if policy.loss_initialized():
+    pytorch = policy.config["use_pytorch"]
+    if (pytorch and hasattr(policy, "compute_central_vf")) or \
+            (not pytorch and policy.loss_initialized()):
        assert other_agent_batches is not None
        [(_, opponent_batch)] = list(other_agent_batches.values())

@@ -104,11 +82,18 @@ def centralized_critic_postprocessing(policy,
        sample_batch[OPPONENT_ACTION] = opponent_batch[SampleBatch.ACTIONS]

        # overwrite default VF prediction with the central VF
-        sample_batch[SampleBatch.VF_PREDS] = policy.compute_central_vf(
-            sample_batch[SampleBatch.CUR_OBS], sample_batch[OPPONENT_OBS],
-            sample_batch[OPPONENT_ACTION])
+        if pytorch:  # From policy.config; `args` exists only under __main__.
+            sample_batch[SampleBatch.VF_PREDS] = policy.compute_central_vf(
+                convert_to_torch_tensor(sample_batch[SampleBatch.CUR_OBS]),
+                convert_to_torch_tensor(sample_batch[OPPONENT_OBS]),
+                convert_to_torch_tensor(sample_batch[OPPONENT_ACTION])). \
+                detach().numpy()
+        else:
+            sample_batch[SampleBatch.VF_PREDS] = policy.compute_central_vf(
+                sample_batch[SampleBatch.CUR_OBS], sample_batch[OPPONENT_OBS],
+                sample_batch[OPPONENT_ACTION])
    else:
-        # policy hasn't initialized yet, use zeros
+        # Policy hasn't been initialized yet, use zeros.
        sample_batch[OPPONENT_OBS] = np.zeros_like(
            sample_batch[SampleBatch.CUR_OBS])
        sample_batch[OPPONENT_ACTION] = np.zeros_like(
@@ -141,7 +126,13 @@ def loss_with_central_critic(policy, model, dist_class, train_batch):
        train_batch[SampleBatch.CUR_OBS], train_batch[OPPONENT_OBS],
        train_batch[OPPONENT_ACTION])

-    policy.loss_obj = PPOLoss(
+    func = TFLoss if not policy.config["use_pytorch"] else TorchLoss
+    adv = tf.ones_like(train_batch[Postprocessing.ADVANTAGES], dtype=tf.bool) \
+        if not policy.config["use_pytorch"] else \
+        torch.ones_like(train_batch[Postprocessing.ADVANTAGES],
+                        dtype=torch.bool)
+
+    policy.loss_obj = func(
        dist_class,
        model,
        train_batch[Postprocessing.VALUE_TARGETS],
@@ -153,7 +144,7 @@ def loss_with_central_critic(policy, model, dist_class, train_batch):
        action_dist,
        policy.central_value_out,
        policy.kl_coeff,
-        tf.ones_like(train_batch[Postprocessing.ADVANTAGES], dtype=tf.bool),
+        adv,
        entropy_coeff=policy.entropy_coeff,
        clip_param=policy.config["clip_param"],
        vf_clip_param=policy.config["vf_clip_param"],
@@ -180,8 +171,8 @@ def central_vf_stats(policy, train_batch, grads):
    }


-CCPPO = PPOTFPolicy.with_updates(
-    name="CCPPO",
+CCPPOTFPolicy = PPOTFPolicy.with_updates(
+    name="CCPPOTFPolicy",
    postprocess_fn=centralized_critic_postprocessing,
    loss_fn=loss_with_central_critic,
    before_loss_init=setup_mixins,
@@ -191,31 +182,64 @@ CCPPO = PPOTFPolicy.with_updates(
        CentralizedValueMixin
    ])

+CCPPOTorchPolicy = PPOTorchPolicy.with_updates(
+    name="CCPPOTorchPolicy",
+    postprocess_fn=centralized_critic_postprocessing,
+    loss_fn=loss_with_central_critic,
+    before_init=setup_mixins,
+    mixins=[
+        TorchLR, TorchEntropyCoeffSchedule, TorchKLCoeffMixin,
+        CentralizedValueMixin
+    ])
+
+
+def get_policy_class(config):
+    return CCPPOTorchPolicy if config["use_pytorch"] else CCPPOTFPolicy
+
+
 CCTrainer = PPOTrainer.with_updates(
-    name="CCPPOTrainer", default_policy=CCPPO, get_policy_class=None)
+    name="CCPPOTrainer",
+    default_policy=CCPPOTFPolicy,
+    get_policy_class=get_policy_class,
+)

 if __name__ == "__main__":
+    ray.init(local_mode=True)
    args = parser.parse_args()
-    ModelCatalog.register_custom_model("cc_model", CentralizedCriticModel)
-    tune.run(
-        CCTrainer,
-        stop={
-            "timesteps_total": args.stop,
-            "episode_reward_mean": 7.99,
+
+    ModelCatalog.register_custom_model(
+        "cc_model", TorchCentralizedCriticModel
+        if args.torch else CentralizedCriticModel)
+
+    config = {
+        "env": TwoStepGame,
+        "batch_mode": "complete_episodes",
+        "eager": False,
+        "num_workers": 0,
+        "multiagent": {
+            "policies": {
+                "pol1": (None, Discrete(6), TwoStepGame.action_space, {
+                    "use_pytorch": args.torch
+                }),
+                "pol2": (None, Discrete(6), TwoStepGame.action_space, {
+                    "use_pytorch": args.torch
+                }),
+            },
+            "policy_mapping_fn": lambda x: "pol1" if x == 0 else "pol2",
        },
-        config={
-            "env": TwoStepGame,
-            "batch_mode": "complete_episodes",
-            "eager": False,
-            "num_workers": 0,
-            "multiagent": {
-                "policies": {
-                    "pol1": (None, Discrete(6), TwoStepGame.action_space, {}),
-                    "pol2": (None, Discrete(6), TwoStepGame.action_space, {}),
-                },
-                "policy_mapping_fn": lambda x: "pol1" if x == 0 else "pol2",
-            },
-            "model": {
-                "custom_model": "cc_model",
-            },
-        })
+        "model": {
+            "custom_model": "cc_model",
+        },
+        "use_pytorch": args.torch,
+    }
+
+    stop = {
+        "training_iteration": args.stop_iters,
+        "timesteps_total": args.stop_timesteps,
+        "episode_reward_mean": args.stop_reward,
+    }
+
+    results = tune.run(CCTrainer, config=config, stop=stop)
+
+    if args.as_test:
+        check_learning_achieved(results, args.stop_reward)
@@ -10,64 +10,24 @@ modifies the policy to add a centralized value function.
 """

 import numpy as np
-from gym.spaces import Box, Dict, Discrete
+from gym.spaces import Dict, Discrete
 import argparse

 from ray import tune
 from ray.rllib.agents.callbacks import DefaultCallbacks
-from ray.rllib.examples.twostep_game import TwoStepGame
+from ray.rllib.examples.models.centralized_critic_models import \
+    YetAnotherCentralizedCriticModel, YetAnotherTorchCentralizedCriticModel
+from ray.rllib.examples.env.two_step_game import TwoStepGame
 from ray.rllib.models import ModelCatalog
-from ray.rllib.models.tf.tf_modelv2 import TFModelV2
-from ray.rllib.models.tf.fcnet_v2 import FullyConnectedNetwork
 from ray.rllib.policy.sample_batch import SampleBatch
-from ray.rllib.utils import try_import_tf
-
-tf = try_import_tf()
+from ray.rllib.utils.test_utils import check_learning_achieved

 parser = argparse.ArgumentParser()
-parser.add_argument("--stop", type=int, default=100000)
-
-
-class CentralizedCriticModel(TFModelV2):
-    """Multi-agent model that implements a centralized VF.
-
-    It assumes the observation is a dict with 'own_obs' and 'opponent_obs', the
-    former of which can be used for computing actions (i.e., decentralized
-    execution), and the latter for optimization (i.e., centralized learning).
-
-    This model has two parts:
-    - An action model that looks at just 'own_obs' to compute actions
-    - A value model that also looks at the 'opponent_obs' / 'opponent_action'
-      to compute the value (it does this by using the 'obs_flat' tensor).
-    """
-
-    def __init__(self, obs_space, action_space, num_outputs, model_config,
-                 name):
-        super(CentralizedCriticModel, self).__init__(
-            obs_space, action_space, num_outputs, model_config, name)
-
-        self.action_model = FullyConnectedNetwork(
-            Box(low=0, high=1, shape=(6, )),  # one-hot encoded Discrete(6)
-            action_space,
-            num_outputs,
-            model_config,
-            name + "_action")
-        self.register_variables(self.action_model.variables())
-
-        self.value_model = FullyConnectedNetwork(obs_space, action_space, 1,
-                                                 model_config, name + "_vf")
-        self.register_variables(self.value_model.variables())
-
-    def forward(self, input_dict, state, seq_lens):
-        self._value_out, _ = self.value_model({
-            "obs": input_dict["obs_flat"]
-        }, state, seq_lens)
-        return self.action_model({
-            "obs": input_dict["obs"]["own_obs"]
-        }, state, seq_lens)
-
-    def value_function(self):
-        return tf.reshape(self._value_out, [-1])
+parser.add_argument("--torch", action="store_true")
+parser.add_argument("--as-test", action="store_true")
+parser.add_argument("--stop-iters", type=int, default=100)
+parser.add_argument("--stop-timesteps", type=int, default=100000)
+parser.add_argument("--stop-reward", type=float, default=7.99)


 class FillInActions(DefaultCallbacks):
@@ -109,7 +69,11 @@ def central_critic_observer(agent_obs, **kw):

 if __name__ == "__main__":
    args = parser.parse_args()
-    ModelCatalog.register_custom_model("cc_model", CentralizedCriticModel)
+
+    ModelCatalog.register_custom_model(
+        "cc_model", YetAnotherTorchCentralizedCriticModel
+        if args.torch else YetAnotherCentralizedCriticModel)
+
    action_space = Discrete(2)
    observer_space = Dict({
        "own_obs": Discrete(6),
@@ -118,26 +82,33 @@ if __name__ == "__main__":
        "opponent_obs": Discrete(6),
        "opponent_action": Discrete(2),
    })
-    tune.run(
-        "PPO",
-        stop={
-            "timesteps_total": args.stop,
-            "episode_reward_mean": 7.99,
+
+    config = {
+        "env": TwoStepGame,
+        "batch_mode": "complete_episodes",
+        "callbacks": FillInActions,
+        "num_workers": 0,
+        "multiagent": {
+            "policies": {
+                "pol1": (None, observer_space, action_space, {}),
+                "pol2": (None, observer_space, action_space, {}),
+            },
+            "policy_mapping_fn": lambda x: "pol1" if x == 0 else "pol2",
+            "observation_fn": central_critic_observer,
        },
-        config={
-            "env": TwoStepGame,
-            "batch_mode": "complete_episodes",
-            "callbacks": FillInActions,
-            "num_workers": 0,
-            "multiagent": {
-                "policies": {
-                    "pol1": (None, observer_space, action_space, {}),
-                    "pol2": (None, observer_space, action_space, {}),
-                },
-                "policy_mapping_fn": lambda x: "pol1" if x == 0 else "pol2",
-                "observation_fn": central_critic_observer,
-            },
-            "model": {
-                "custom_model": "cc_model",
-            },
-        })
+        "model": {
+            "custom_model": "cc_model",
+        },
+        "use_pytorch": args.torch,
+    }
+
+    stop = {
+        "training_iteration": args.stop_iters,
+        "timesteps_total": args.stop_timesteps,
+        "episode_reward_mean": args.stop_reward,
+    }
+
+    results = tune.run("PPO", config=config, stop=stop)
+
+    if args.as_test:
+        check_learning_achieved(results, args.stop_reward)
@@ -7,20 +7,32 @@ This example shows:

 You can visualize experiment results in ~/ray_results using TensorBoard.
 """
-
+import argparse
 import numpy as np
 import gym
-from ray.rllib.models import ModelCatalog
-from ray.rllib.models.tf.tf_modelv2 import TFModelV2
-from ray.rllib.models.tf.fcnet_v2 import FullyConnectedNetwork
 from gym.spaces import Discrete, Box

 import ray
 from ray import tune
-from ray.rllib.utils import try_import_tf
 from ray.tune import grid_search
+from ray.rllib.models import ModelCatalog
+from ray.rllib.models.tf.tf_modelv2 import TFModelV2
+from ray.rllib.models.tf.fcnet_v2 import FullyConnectedNetwork
+from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
+from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC
+from ray.rllib.utils.framework import try_import_tf, try_import_torch
+from ray.rllib.utils.test_utils import check_learning_achieved

 tf = try_import_tf()
+torch, nn = try_import_torch()
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--run", type=str, default="PPO")
+parser.add_argument("--torch", action="store_true")
+parser.add_argument("--as-test", action="store_true")
+parser.add_argument("--stop-iters", type=int, default=50)
+parser.add_argument("--stop-timesteps", type=int, default=100000)
+parser.add_argument("--stop-reward", type=float, default=0.1)


 class SimpleCorridor(gym.Env):
@@ -46,11 +58,11 @@ class SimpleCorridor(gym.Env):
        elif action == 1:
            self.cur_pos += 1
        done = self.cur_pos >= self.end_pos
-        return [self.cur_pos], 1 if done else 0, done, {}
+        return [self.cur_pos], 1.0 if done else -0.1, done, {}


 class CustomModel(TFModelV2):
-    """Example of a custom model that just delegates to a fc-net."""
+    """Example of a keras custom model that just delegates to an fc-net."""

    def __init__(self, obs_space, action_space, num_outputs, model_config,
                 name):
@@ -67,26 +79,58 @@ class CustomModel(TFModelV2):
        return self.model.value_function()


+class TorchCustomModel(TorchModelV2, nn.Module):
+    """Example of a PyTorch custom model that just delegates to an fc-net."""
+
+    def __init__(self, obs_space, action_space, num_outputs, model_config,
+                 name):
+        TorchModelV2.__init__(self, obs_space, action_space, num_outputs,
+                              model_config, name)
+        nn.Module.__init__(self)
+
+        self.torch_sub_model = TorchFC(obs_space, action_space, num_outputs,
+                                       model_config, name)
+
+    def forward(self, input_dict, state, seq_lens):
+        input_dict["obs"] = input_dict["obs"].float()
+        fc_out, _ = self.torch_sub_model(input_dict, state, seq_lens)
+        return fc_out, []
+
+    def value_function(self):
+        return torch.reshape(self.torch_sub_model.value_function(), [-1])
+
+
 if __name__ == "__main__":
+    args = parser.parse_args()
+    ray.init()
+
    # Can also register the env creator function explicitly with:
    # register_env("corridor", lambda config: SimpleCorridor(config))
-    ray.init()
-    ModelCatalog.register_custom_model("my_model", CustomModel)
-    tune.run(
-        "PPO",
-        stop={
-            "timesteps_total": 10000,
+    ModelCatalog.register_custom_model(
+        "my_model", TorchCustomModel if args.torch else CustomModel)
+
+    config = {
+        "env": SimpleCorridor,  # or "corridor" if registered above
+        "env_config": {
+            "corridor_length": 5,
        },
-        config={
-            "env": SimpleCorridor,  # or "corridor" if registered above
-            "model": {
-                "custom_model": "my_model",
-            },
-            "vf_share_layers": True,
-            "lr": grid_search([1e-2, 1e-4, 1e-6]),  # try different lrs
-            "num_workers": 1,  # parallelism
-            "env_config": {
-                "corridor_length": 5,
-            },
+        "model": {
+            "custom_model": "my_model",
        },
-    )
+        "vf_share_layers": True,
+        "lr": grid_search([1e-2, 1e-4, 1e-6]),  # try different lrs
+        "num_workers": 1,  # parallelism
+        "use_pytorch": args.torch
+    }
+
+    stop = {
+        "training_iteration": args.stop_iters,
+        "timesteps_total": args.stop_timesteps,
+        "episode_reward_mean": args.stop_reward,
+    }
+
+    results = tune.run(args.run, config=config, stop=stop)
+
+    if args.as_test:
+        check_learning_achieved(results, args.stop_reward)
+    ray.shutdown()
@@ -74,9 +74,9 @@ from ray.rllib.evaluation.metrics import collect_episodes, summarize_episodes
 from ray.rllib.examples.env.simple_corridor import SimpleCorridor

 parser = argparse.ArgumentParser()
-parser.add_argument("--custom-eval", action="store_true")
 parser.add_argument("--num-cpus", type=int, default=0)
 parser.add_argument("--torch", action="store_true")
+parser.add_argument("--no-custom-eval", action="store_true")


 def custom_eval_function(trainer, eval_workers):
@@ -124,48 +124,51 @@ def custom_eval_function(trainer, eval_workers):
 if __name__ == "__main__":
    args = parser.parse_args()

-    if args.custom_eval:
-        eval_fn = custom_eval_function
-    else:
+    if args.no_custom_eval:
        eval_fn = None
+    else:
+        eval_fn = custom_eval_function

    ray.init(num_cpus=args.num_cpus or None)

-    tune.run(
-        "PG",
-        stop={
-            "training_iteration": 10,
+    config = {
+        "env": SimpleCorridor,
+        "env_config": {
+            "corridor_length": 10,
        },
-        config={
-            "env": SimpleCorridor,
+        "horizon": 20,
+        "log_level": "INFO",
+
+        # Training rollouts will be collected using just the learner
+        # process, but evaluation will be done in parallel with two
+        # workers. Hence, this run will use 3 CPUs total (1 for the
+        # learner + 2 more for evaluation workers).
+        "num_workers": 0,
+        "evaluation_num_workers": 2,
+
+        # Optional custom eval function.
+        "custom_eval_function": eval_fn,
+
+        # Enable evaluation, once per training iteration.
+        "evaluation_interval": 1,
+
+        # Run 10 episodes each time evaluation runs.
+        "evaluation_num_episodes": 10,
+
+        # Override the env config for evaluation.
+        "evaluation_config": {
            "env_config": {
-                "corridor_length": 10,
+                # Evaluate using a SHORTER corridor (5) than trained on (10).
+                "corridor_length": 5,
            },
-            "horizon": 20,
-            "log_level": "INFO",
+        },
+        "use_pytorch": args.torch,
+    }

-            # Training rollouts will be collected using just the learner
-            # process, but evaluation will be done in parallel with two
-            # workers. Hence, this run will use 3 CPUs total (1 for the
-            # learner + 2 more for evaluation workers).
-            "num_workers": 0,
-            "evaluation_num_workers": 2,
+    stop = {
+        "training_iteration": 10,
+    }

-            # Optional custom eval function.
-            "custom_eval_function": eval_fn,
+    tune.run("PG", config=config, stop=stop)

-            # Enable evaluation, once per training iteration.
-            "evaluation_interval": 1,
-
-            # Run 10 episodes each time evaluation runs.
-            "evaluation_num_episodes": 10,
-
-            # Override the env config for evaluation.
-            "evaluation_config": {
-                "env_config": {
-                    # Evaluate using LONGER corridor than trained on.
-                    "corridor_length": 5,
-                },
-            },
-            "use_pytorch": args.torch,
-        })
+    ray.shutdown()
@@ -4,48 +4,52 @@ Both the model and env are trivial (and super-fast), so they are useful
 for running perf microbenchmarks.
 """

+import argparse
+
 import ray
+import ray.tune as tune
+from ray.tune import sample_from
 from ray.rllib.examples.env.fast_image_env import FastImageEnv
-from ray.rllib.models import Model, ModelCatalog
-from ray.tune import run_experiments, sample_from
-from ray.rllib.utils import try_import_tf
-
-tf = try_import_tf()
-
-
-class FastModel(Model):
-    def _build_layers_v2(self, input_dict, num_outputs, options):
-        bias = tf.get_variable(
-            dtype=tf.float32,
-            name="bias",
-            initializer=tf.zeros_initializer,
-            shape=())
-        output = bias + tf.zeros([tf.shape(input_dict["obs"])[0], num_outputs])
-        return output, output
+from ray.rllib.examples.models.fast_model import FastModel, TorchFastModel
+from ray.rllib.models import ModelCatalog

+parser = argparse.ArgumentParser()
+parser.add_argument("--num-cpus", type=int, default=2)
+parser.add_argument("--torch", action="store_true")
+parser.add_argument("--stop-iters", type=int, default=200)
+parser.add_argument("--stop-timesteps", type=int, default=100000)

 if __name__ == "__main__":
-    ray.init()
-    ModelCatalog.register_custom_model("fast_model", FastModel)
-    run_experiments({
-        "demo": {
-            "run": "IMPALA",
-            "env": FastImageEnv,
-            "config": {
-                "compress_observations": True,
-                "model": {
-                    "custom_model": "fast_model"
-                },
-                "num_gpus": 0,
-                "num_workers": 2,
-                "num_envs_per_worker": 10,
-                "num_data_loader_buffers": 1,
-                "num_aggregation_workers": 1,
-                "broadcast_interval": 50,
-                "rollout_fragment_length": 100,
-                "train_batch_size": sample_from(
-                    lambda spec: 1000 * max(1, spec.config.num_gpus)),
-                "fake_sampler": True,
-            },
+    args = parser.parse_args()
+    ray.init(num_cpus=args.num_cpus or None)
+
+    ModelCatalog.register_custom_model(
+        "fast_model", TorchFastModel if args.torch else FastModel)
+
+    config = {
+        "env": FastImageEnv,
+        "compress_observations": True,
+        "model": {
+            "custom_model": "fast_model"
        },
-    })
+        "num_gpus": 0,
+        "num_workers": 2,
+        "num_envs_per_worker": 10,
+        "num_data_loader_buffers": 1,
+        "num_aggregation_workers": 1,
+        "broadcast_interval": 50,
+        "rollout_fragment_length": 100,
+        "train_batch_size": sample_from(
+            lambda spec: 1000 * max(1, spec.config.num_gpus)),
+        "fake_sampler": True,
+        "use_pytorch": args.torch,
+    }
+
+    stop = {
+        "training_iteration": args.stop_iters,
+        "timesteps_total": args.stop_timesteps,
+    }
+
+    tune.run("IMPALA", config=config, stop=stop)
+
+    ray.shutdown()
@@ -1,115 +0,0 @@
-# Explains/tests Issues:
-# https://github.com/ray-project/ray/issues/6928
-# https://github.com/ray-project/ray/issues/6732
-
-from gym.spaces import Discrete, Box
-import numpy as np
-
-from ray.rllib.agents.ppo import PPOTrainer
-from ray.rllib.examples.random_env import RandomEnv
-from ray.rllib.models import ModelCatalog
-from ray.rllib.models.modelv2 import ModelV2
-from ray.rllib.models.tf.recurrent_tf_modelv2 import RecurrentTFModelV2
-from ray.rllib.utils import try_import_tf
-from ray.rllib.utils.annotations import override
-
-tf = try_import_tf()
-
-cnn_shape = (4, 4, 3)
-
-
-class CustomModel(RecurrentTFModelV2):
-    def __init__(self, obs_space, action_space, num_outputs, model_config,
-                 name):
-        super(CustomModel, self).__init__(obs_space, action_space, num_outputs,
-                                          model_config, name)
-
-        self.cell_size = 16
-        visual_size = cnn_shape[0] * cnn_shape[1] * cnn_shape[2]
-
-        state_in_h = tf.keras.layers.Input(shape=(self.cell_size, ), name="h")
-        state_in_c = tf.keras.layers.Input(shape=(self.cell_size, ), name="c")
-        seq_in = tf.keras.layers.Input(shape=(), name="seq_in", dtype=tf.int32)
-
-        inputs = tf.keras.layers.Input(
-            shape=(None, visual_size), name="visual_inputs")
-
-        input_visual = inputs
-        input_visual = tf.reshape(
-            input_visual, [-1, cnn_shape[0], cnn_shape[1], cnn_shape[2]])
-        cnn_input = tf.keras.layers.Input(shape=cnn_shape, name="cnn_input")
-
-        cnn_model = tf.keras.applications.mobilenet_v2.MobileNetV2(
-            alpha=1.0,
-            include_top=True,
-            weights=None,
-            input_tensor=cnn_input,
-            pooling=None)
-        vision_out = cnn_model(input_visual)
-        vision_out = tf.reshape(
-            vision_out,
-            [-1, tf.shape(inputs)[1],
-             vision_out.shape.as_list()[-1]])
-
-        lstm_out, state_h, state_c = tf.keras.layers.LSTM(
-            self.cell_size,
-            return_sequences=True,
-            return_state=True,
-            name="lstm")(
-                inputs=vision_out,
-                mask=tf.sequence_mask(seq_in),
-                initial_state=[state_in_h, state_in_c])
-
-        # Postprocess LSTM output with another hidden layer and compute values.
-        logits = tf.keras.layers.Dense(
-            self.num_outputs,
-            activation=tf.keras.activations.linear,
-            name="logits")(lstm_out)
-        values = tf.keras.layers.Dense(
-            1, activation=None, name="values")(lstm_out)
-
-        # Create the RNN model
-        self.rnn_model = tf.keras.Model(
-            inputs=[inputs, seq_in, state_in_h, state_in_c],
-            outputs=[logits, values, state_h, state_c])
-        self.register_variables(self.rnn_model.variables)
-        self.rnn_model.summary()
-
-    @override(RecurrentTFModelV2)
-    def forward_rnn(self, inputs, state, seq_lens):
-        model_out, self._value_out, h, c = self.rnn_model([inputs, seq_lens] +
-                                                          state)
-        return model_out, [h, c]
-
-    @override(ModelV2)
-    def get_initial_state(self):
-        return [
-            np.zeros(self.cell_size, np.float32),
-            np.zeros(self.cell_size, np.float32),
-        ]
-
-    @override(ModelV2)
-    def value_function(self):
-        return tf.reshape(self._value_out, [-1])
-
-
-if __name__ == "__main__":
-    ModelCatalog.register_custom_model("my_model", CustomModel)
-    trainer = PPOTrainer(
-        env=RandomEnv,
-        config={
-            # "eager": True,  # <- should work for both eager or not
-            "model": {
-                "custom_model": "my_model",
-                "max_seq_len": 20,
-            },
-            "vf_share_layers": True,
-            "num_workers": 0,  # no parallelism
-            "env_config": {
-                "action_space": Discrete(2),
-                # Test a simple Tuple observation space.
-                "observation_space": Box(
-                    0.0, 1.0, shape=cnn_shape, dtype=np.float32)
-            }
-        })
-    trainer.train()
@@ -17,7 +17,7 @@ tf = try_import_tf()
 parser = argparse.ArgumentParser()
 parser.add_argument("--run", type=str, default="DQN")  # Try PG, PPO, DQN
 parser.add_argument("--stop", type=int, default=200)
-parser.add_argument("--use_vision_network", action="store_true")
+parser.add_argument("--use-vision-network", action="store_true")
 parser.add_argument("--num-cpus", type=int, default=0)


@@ -1,117 +0,0 @@
-"""Example of using a custom RNN keras model."""
-
-import argparse
-import numpy as np
-
-import ray
-from ray import tune
-from ray.tune.registry import register_env
-from ray.rllib.examples.env.repeat_after_me_env import RepeatAfterMeEnv
-from ray.rllib.examples.env.repeat_initial_obs_env import RepeatInitialObsEnv
-from ray.rllib.models import ModelCatalog
-from ray.rllib.models.modelv2 import ModelV2
-from ray.rllib.models.tf.recurrent_tf_modelv2 import RecurrentTFModelV2
-from ray.rllib.utils.annotations import override
-from ray.rllib.utils import try_import_tf
-
-tf = try_import_tf()
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--run", type=str, default="PPO")
-parser.add_argument("--env", type=str, default="RepeatAfterMeEnv")
-parser.add_argument("--stop", type=int, default=90)
-parser.add_argument("--num-cpus", type=int, default=0)
-
-
-class MyKerasRNN(RecurrentTFModelV2):
-    """Example of using the Keras functional API to define a RNN model."""
-
-    def __init__(self,
-                 obs_space,
-                 action_space,
-                 num_outputs,
-                 model_config,
-                 name,
-                 hiddens_size=256,
-                 cell_size=64):
-        super(MyKerasRNN, self).__init__(obs_space, action_space, num_outputs,
-                                         model_config, name)
-        self.cell_size = cell_size
-
-        # Define input layers
-        input_layer = tf.keras.layers.Input(
-            shape=(None, obs_space.shape[0]), name="inputs")
-        state_in_h = tf.keras.layers.Input(shape=(cell_size, ), name="h")
-        state_in_c = tf.keras.layers.Input(shape=(cell_size, ), name="c")
-        seq_in = tf.keras.layers.Input(shape=(), name="seq_in", dtype=tf.int32)
-
-        # Preprocess observation with a hidden layer and send to LSTM cell
-        dense1 = tf.keras.layers.Dense(
-            hiddens_size, activation=tf.nn.relu, name="dense1")(input_layer)
-        lstm_out, state_h, state_c = tf.keras.layers.LSTM(
-            cell_size, return_sequences=True, return_state=True, name="lstm")(
-                inputs=dense1,
-                mask=tf.sequence_mask(seq_in),
-                initial_state=[state_in_h, state_in_c])
-
-        # Postprocess LSTM output with another hidden layer and compute values
-        logits = tf.keras.layers.Dense(
-            self.num_outputs,
-            activation=tf.keras.activations.linear,
-            name="logits")(lstm_out)
-        values = tf.keras.layers.Dense(
-            1, activation=None, name="values")(lstm_out)
-
-        # Create the RNN model
-        self.rnn_model = tf.keras.Model(
-            inputs=[input_layer, seq_in, state_in_h, state_in_c],
-            outputs=[logits, values, state_h, state_c])
-        self.register_variables(self.rnn_model.variables)
-        self.rnn_model.summary()
-
-    @override(RecurrentTFModelV2)
-    def forward_rnn(self, inputs, state, seq_lens):
-        model_out, self._value_out, h, c = self.rnn_model([inputs, seq_lens] +
-                                                          state)
-        return model_out, [h, c]
-
-    @override(ModelV2)
-    def get_initial_state(self):
-        return [
-            np.zeros(self.cell_size, np.float32),
-            np.zeros(self.cell_size, np.float32),
-        ]
-
-    @override(ModelV2)
-    def value_function(self):
-        return tf.reshape(self._value_out, [-1])
-
-
-if __name__ == "__main__":
-    args = parser.parse_args()
-    ray.init(num_cpus=args.num_cpus or None)
-    ModelCatalog.register_custom_model("rnn", MyKerasRNN)
-    register_env("RepeatAfterMeEnv", lambda c: RepeatAfterMeEnv(c))
-    register_env("RepeatInitialObsEnv", lambda _: RepeatInitialObsEnv())
-
-    config = {
-        "env": args.env,
-        "env_config": {
-            "repeat_delay": 2,
-        },
-        "gamma": 0.9,
-        "num_workers": 0,
-        "num_envs_per_worker": 20,
-        "entropy_coeff": 0.001,
-        "num_sgd_iter": 5,
-        "vf_loss_coeff": 1e-5,
-        "model": {
-            "custom_model": "rnn",
-            "max_seq_len": 20,
-        },
-    }
-    tune.run(
-        args.run,
-        config=config,
-        stop={"episode_reward_mean": args.stop},
-    )
@@ -16,17 +16,16 @@ import os

 import ray
 from ray import tune
-from ray.rllib.models import Model, ModelCatalog
-from ray.rllib.models.tf.tf_action_dist import Categorical
-from ray.rllib.models.tf.fcnet_v1 import FullyConnectedNetwork
-from ray.rllib.models.model import restore_original_dimensions
-from ray.rllib.offline import JsonReader
+from ray.rllib.examples.models.custom_loss_model import CustomLossModel, \
+    TorchCustomLossModel
+from ray.rllib.models import ModelCatalog
 from ray.rllib.utils import try_import_tf

 tf = try_import_tf()

 parser = argparse.ArgumentParser()
-parser.add_argument("--iters", type=int, default=200)
+parser.add_argument("--torch", action="store_true")
+parser.add_argument("--stop-iters", type=int, default=200)
 parser.add_argument(
    "--input-files",
    type=str,
@@ -34,50 +33,6 @@ parser.add_argument(
        os.path.dirname(os.path.abspath(__file__)),
        "../tests/data/cartpole_small"))

-
-class CustomLossModel(Model):
-    """Custom model that adds an imitation loss on top of the policy loss."""
-
-    def _build_layers_v2(self, input_dict, num_outputs, options):
-        self.obs_in = input_dict["obs"]
-        with tf.variable_scope("shared", reuse=tf.AUTO_REUSE):
-            self.fcnet = FullyConnectedNetwork(input_dict, self.obs_space,
-                                               self.action_space, num_outputs,
-                                               options)
-        return self.fcnet.outputs, self.fcnet.last_layer
-
-    def custom_loss(self, policy_loss, loss_inputs):
-        # create a new input reader per worker
-        reader = JsonReader(self.options["custom_options"]["input_files"])
-        input_ops = reader.tf_input_ops()
-
-        # define a secondary loss by building a graph copy with weight sharing
-        obs = tf.cast(input_ops["obs"], tf.float32)
-        logits, _ = self._build_layers_v2({
-            "obs": restore_original_dimensions(obs, self.obs_space)
-        }, self.num_outputs, self.options)
-
-        # You can also add self-supervised losses easily by referencing tensors
-        # created during _build_layers_v2(). For example, an autoencoder-style
-        # loss can be added as follows:
-        # ae_loss = squared_diff(
-        #     loss_inputs["obs"], Decoder(self.fcnet.last_layer))
-        print("FYI: You can also use these tensors: {}, ".format(loss_inputs))
-
-        # compute the IL loss
-        action_dist = Categorical(logits, self.options)
-        self.policy_loss = policy_loss
-        self.imitation_loss = tf.reduce_mean(
-            -action_dist.logp(input_ops["actions"]))
-        return policy_loss + 10 * self.imitation_loss
-
-    def custom_stats(self):
-        return {
-            "policy_loss": self.policy_loss,
-            "imitation_loss": self.imitation_loss,
-        }
-
-
 if __name__ == "__main__":
    ray.init()
    args = parser.parse_args()
@@ -90,20 +45,23 @@ if __name__ == "__main__":
        input_dir = rllib_dir.absolute().joinpath(args.input_files)
        args.input_files = str(input_dir)

-    ModelCatalog.register_custom_model("custom_loss", CustomLossModel)
-    tune.run(
-        "PG",
-        stop={
-            "training_iteration": args.iters,
-        },
-        config={
-            "env": "CartPole-v0",
-            "num_workers": 0,
-            "model": {
-                "custom_model": "custom_loss",
-                "custom_options": {
-                    "input_files": args.input_files,
-                },
+    ModelCatalog.register_custom_model(
+        "custom_loss", TorchCustomLossModel if args.torch else CustomLossModel)
+
+    config = {
+        "env": "CartPole-v0",
+        "num_workers": 0,
+        "model": {
+            "custom_model": "custom_loss",
+            "custom_options": {
+                "input_files": args.input_files,
            },
        },
-    )
+        "use_pytorch": args.torch,
+    }
+
+    stop = {
+        "training_iteration": args.stop_iters,
+    }
+
+    tune.run("PG", config=config, stop=stop)
@@ -64,14 +64,14 @@ class MyCallbacks(DefaultCallbacks):

 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
-    parser.add_argument("--num-iters", type=int, default=2000)
+    parser.add_argument("--stop-iters", type=int, default=2000)
    args = parser.parse_args()

    ray.init()
    trials = tune.run(
        "PG",
        stop={
-            "training_iteration": args.num_iters,
+            "training_iteration": args.stop_iters,
        },
        config={
            "env": "CartPole-v0",
@@ -53,14 +53,14 @@ def on_postprocess_traj(info):

 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
-    parser.add_argument("--num-iters", type=int, default=2000)
+    parser.add_argument("--stop-iters", type=int, default=2000)
    args = parser.parse_args()

    ray.init()
    trials = tune.run(
        "PG",
        stop={
-            "training_iteration": args.num_iters,
+            "training_iteration": args.stop_iters,
        },
        config={
            "env": "CartPole-v0",
@@ -0,0 +1,62 @@
+"""Example of using a custom RNN model (Keras or PyTorch, via --torch)."""
+
+import argparse
+
+import ray
+from ray import tune
+from ray.tune.registry import register_env
+from ray.rllib.examples.env.repeat_after_me_env import RepeatAfterMeEnv
+from ray.rllib.examples.env.repeat_initial_obs_env import RepeatInitialObsEnv
+from ray.rllib.examples.models.rnn_model import RNNModel, TorchRNNModel
+from ray.rllib.models import ModelCatalog
+from ray.rllib.utils.test_utils import check_learning_achieved
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--run", type=str, default="PPO")
+parser.add_argument("--env", type=str, default="RepeatAfterMeEnv")
+parser.add_argument("--num-cpus", type=int, default=0)
+parser.add_argument("--as-test", action="store_true")
+parser.add_argument("--torch", action="store_true")
+parser.add_argument("--stop-reward", type=float, default=90)
+parser.add_argument("--stop-iters", type=int, default=100)
+parser.add_argument("--stop-timesteps", type=int, default=100000)
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+
+    ray.init(num_cpus=args.num_cpus or None)
+
+    ModelCatalog.register_custom_model(
+        "rnn", TorchRNNModel if args.torch else RNNModel)
+    register_env("RepeatAfterMeEnv", lambda c: RepeatAfterMeEnv(c))
+    register_env("RepeatInitialObsEnv", lambda _: RepeatInitialObsEnv())
+
+    config = {
+        "env": args.env,
+        "env_config": {
+            "repeat_delay": 2,
+        },
+        "gamma": 0.9,
+        "num_workers": 0,
+        "num_envs_per_worker": 20,
+        "entropy_coeff": 0.001,
+        "num_sgd_iter": 5,
+        "vf_loss_coeff": 1e-5,
+        "model": {
+            "custom_model": "rnn",
+            "max_seq_len": 20,
+        },
+        "use_pytorch": args.torch,
+    }
+
+    stop = {
+        "training_iteration": args.stop_iters,
+        "timesteps_total": args.stop_timesteps,
+        "episode_reward_mean": args.stop_reward,
+    }
+
+    results = tune.run(args.run, config=config, stop=stop)
+
+    if args.as_test:
+        check_learning_achieved(results, args.stop_reward)
+    ray.shutdown()
@@ -10,7 +10,7 @@ from ray.rllib.utils import try_import_tf
 tf = try_import_tf()

 parser = argparse.ArgumentParser()
-parser.add_argument("--iters", type=int, default=200)
+parser.add_argument("--stop-iters", type=int, default=200)
 parser.add_argument("--num-cpus", type=int, default=0)


@@ -47,7 +47,7 @@ if __name__ == "__main__":
    ray.init(num_cpus=args.num_cpus or None)
    tune.run(
        MyTrainer,
-        stop={"training_iteration": args.iters},
+        stop={"training_iteration": args.stop_iters},
        config={
            "env": "CartPole-v0",
            "num_workers": 2,
@@ -7,7 +7,7 @@ from ray.rllib.policy.sample_batch import SampleBatch
 from ray.rllib.policy.torch_policy_template import build_torch_policy

 parser = argparse.ArgumentParser()
-parser.add_argument("--iters", type=int, default=200)
+parser.add_argument("--stop-iters", type=int, default=200)
 parser.add_argument("--num-cpus", type=int, default=0)


@@ -33,7 +33,7 @@ if __name__ == "__main__":
    ray.init(num_cpus=args.num_cpus or None)
    tune.run(
        MyTrainer,
-        stop={"training_iteration": args.iters},
+        stop={"training_iteration": args.stop_iters},
        config={
            "env": "CartPole-v0",
            "num_workers": 2,
@@ -1,128 +0,0 @@
-import argparse
-
-import ray
-from ray.rllib.examples.env.repeat_initial_obs_env import RepeatInitialObsEnv
-from ray.rllib.examples.env.repeat_after_me_env import RepeatAfterMeEnv
-from ray.rllib.examples.env.stateless_cartpole import StatelessCartPole
-from ray.rllib.models.preprocessors import get_preprocessor
-from ray.rllib.models.torch.recurrent_torch_model import RecurrentTorchModel
-from ray.rllib.models.modelv2 import ModelV2
-from ray.rllib.utils.annotations import override
-from ray.rllib.utils import try_import_torch
-from ray.rllib.models import ModelCatalog
-import ray.tune as tune
-
-torch, nn = try_import_torch()
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--run", type=str, default="PPO")
-parser.add_argument("--env", type=str, default="repeat_initial")
-parser.add_argument("--stop", type=int, default=90)
-parser.add_argument("--num-cpus", type=int, default=0)
-parser.add_argument("--fc-size", type=int, default=64)
-parser.add_argument("--lstm-cell-size", type=int, default=256)
-
-
-class RNNModel(RecurrentTorchModel):
-    def __init__(self,
-                 obs_space,
-                 action_space,
-                 num_outputs,
-                 model_config,
-                 name,
-                 fc_size=64,
-                 lstm_state_size=256):
-        super().__init__(obs_space, action_space, num_outputs, model_config,
-                         name)
-
-        self.obs_size = get_preprocessor(obs_space)(obs_space).size
-        self.fc_size = fc_size
-        self.lstm_state_size = lstm_state_size
-
-        # Build the Module from fc + LSTM + 2xfc (action + value outs).
-        self.fc1 = nn.Linear(self.obs_size, self.fc_size)
-        self.lstm = nn.LSTM(
-            self.fc_size, self.lstm_state_size, batch_first=True)
-        self.action_branch = nn.Linear(self.lstm_state_size, num_outputs)
-        self.value_branch = nn.Linear(self.lstm_state_size, 1)
-        # Store the value output to save an extra forward pass.
-        self._cur_value = None
-
-    @override(ModelV2)
-    def get_initial_state(self):
-        # make hidden states on same device as model
-        h = [
-            self.fc1.weight.new(1, self.lstm_state_size).zero_().squeeze(0),
-            self.fc1.weight.new(1, self.lstm_state_size).zero_().squeeze(0)
-        ]
-        return h
-
-    @override(ModelV2)
-    def value_function(self):
-        assert self._cur_value is not None, "must call forward() first"
-        return self._cur_value
-
-    @override(RecurrentTorchModel)
-    def forward_rnn(self, inputs, state, seq_lens):
-        """Feeds `inputs` (B x T x ..) through the Gru Unit.
-
-        Returns the resulting outputs as a sequence (B x T x ...).
-        Values are stored in self._cur_value in simple (B) shape (where B
-        contains both the B and T dims!).
-
-        Returns:
-            NN Outputs (B x T x ...) as sequence.
-            The state batches as a List of two items (c- and h-states).
-        """
-        x = nn.functional.relu(self.fc1(inputs))
-        lstm_out = self.lstm(
-            x, [torch.unsqueeze(state[0], 0),
-                torch.unsqueeze(state[1], 0)])
-        action_out = self.action_branch(lstm_out[0])
-        self._cur_value = torch.reshape(self.value_branch(lstm_out[0]), [-1])
-        return action_out, [
-            torch.squeeze(lstm_out[1][0], 0),
-            torch.squeeze(lstm_out[1][1], 0)
-        ]
-
-
-if __name__ == "__main__":
-    args = parser.parse_args()
-
-    ray.init(num_cpus=args.num_cpus or None)
-    ModelCatalog.register_custom_model("rnn", RNNModel)
-    tune.register_env(
-        "repeat_initial", lambda _: RepeatInitialObsEnv(episode_len=100))
-    tune.register_env(
-        "repeat_after_me", lambda _: RepeatAfterMeEnv({"repeat_delay": 1}))
-    tune.register_env("stateless_cartpole", lambda _: StatelessCartPole())
-
-    config = {
-        "env": args.env,
-        "use_pytorch": True,
-        "num_workers": 0,
-        "num_envs_per_worker": 20,
-        "gamma": 0.9,
-        "entropy_coeff": 0.0001,
-        "model": {
-            "custom_model": "rnn",
-            "max_seq_len": 20,
-            "lstm_use_prev_action_reward": "store_true",
-            "custom_options": {
-                "fc_size": args.fc_size,
-                "lstm_state_size": args.lstm_cell_size,
-            }
-        },
-        "lr": 3e-4,
-        "num_sgd_iter": 5,
-        "vf_loss_coeff": 0.0003,
-    }
-
-    tune.run(
-        args.run,
-        stop={
-            "episode_reward_mean": args.stop,
-            "timesteps_total": 100000
-        },
-        config=config,
-    )
@@ -4,70 +4,20 @@ import random
 import ray
 from ray import tune
 from ray.rllib.agents.trainer_template import build_trainer
+from ray.rllib.examples.models.eager_model import EagerModel
 from ray.rllib.models import ModelCatalog
-from ray.rllib.models.modelv2 import ModelV2
-from ray.rllib.models.tf.fcnet_v2 import FullyConnectedNetwork
-from ray.rllib.models.tf.tf_modelv2 import TFModelV2
 from ray.rllib.policy.sample_batch import SampleBatch
 from ray.rllib.policy.tf_policy_template import build_tf_policy
-from ray.rllib.utils import try_import_tf
-from ray.rllib.utils.annotations import override
+from ray.rllib.utils.framework import try_import_tf
+from ray.rllib.utils.test_utils import check_learning_achieved

 tf = try_import_tf()

 parser = argparse.ArgumentParser()
-parser.add_argument("--iters", type=int, default=200)
-
-
-class EagerModel(TFModelV2):
-    """Example of using embedded eager execution in a custom model.
-
-    This shows how to use tf.py_function() to execute a snippet of TF code
-    in eager mode. Here the `self.forward_eager` method just prints out
-    the intermediate tensor for debug purposes, but you can in general
-    perform any TF eager operation in tf.py_function().
-    """
-
-    def __init__(self, observation_space, action_space, num_outputs,
-                 model_config, name):
-        super().__init__(observation_space, action_space, num_outputs,
-                         model_config, name)
-
-        inputs = tf.keras.layers.Input(shape=observation_space.shape)
-        self.fcnet = FullyConnectedNetwork(
-            obs_space=self.obs_space,
-            action_space=self.action_space,
-            num_outputs=self.num_outputs,
-            model_config=self.model_config,
-            name="fc1")
-        out, value_out = self.fcnet.base_model(inputs)
-
-        def lambda_(x):
-            eager_out = tf.py_function(self.forward_eager, [x], tf.float32)
-            with tf.control_dependencies([eager_out]):
-                eager_out.set_shape(x.shape)
-                return eager_out
-
-        out = tf.keras.layers.Lambda(lambda_)(out)
-        self.base_model = tf.keras.models.Model(inputs, [out, value_out])
-        self.register_variables(self.base_model.variables)
-
-    @override(ModelV2)
-    def forward(self, input_dict, state, seq_lens):
-        out, self._value_out = self.base_model(input_dict["obs"], state,
-                                               seq_lens)
-        return out, []
-
-    @override(ModelV2)
-    def value_function(self):
-        return tf.reshape(self._value_out, [-1])
-
-    def forward_eager(self, feature_layer):
-        assert tf.executing_eagerly()
-        if random.random() > 0.99:
-            print("Eagerly printing the feature layer mean value",
-                  tf.reduce_mean(feature_layer))
-        return feature_layer
+parser.add_argument("--stop-iters", type=int, default=200)
+parser.add_argument("--stop-timesteps", type=int, default=100000)
+parser.add_argument("--stop-reward", type=float, default=150)
+parser.add_argument("--as-test", action="store_true")


 def policy_gradient_loss(policy, model, dist_class, train_batch):
@@ -119,5 +69,14 @@ if __name__ == "__main__":
            "custom_model": "eager_model"
        },
    }
+    stop = {
+        "timesteps_total": args.stop_timesteps,
+        "training_iteration": args.stop_iters,
+        "episode_reward_mean": args.stop_reward,
+    }

-    tune.run(MyTrainer, stop={"training_iteration": args.iters}, config=config)
+    results = tune.run(MyTrainer, stop=stop, config=config)
+
+    if args.as_test:
+        check_learning_achieved(results, args.stop_reward)
+    ray.shutdown()
@@ -15,9 +15,9 @@ class RockPaperScissors(MultiAgentEnv):
    SPOCK = 4

    def __init__(self, config):
-        self.action_space = Discrete(3)
-        self.observation_space = Discrete(3)
        self.sheldon_cooper = config.get("sheldon_cooper", False)
+        self.action_space = Discrete(5 if self.sheldon_cooper else 3)
+        self.observation_space = Discrete(5 if self.sheldon_cooper else 3)
        self.player1 = "player1"
        self.player2 = "player2"
        self.last_move = None
@@ -28,13 +28,16 @@ import logging

 import ray
 from ray import tune
+from ray.tune import function
 from ray.rllib.examples.env.windy_maze_env import WindyMazeEnv, \
    HierarchicalWindyMazeEnv
-from ray.tune import function
+from ray.rllib.utils.test_utils import check_learning_achieved

 parser = argparse.ArgumentParser()
 parser.add_argument("--flat", action="store_true")
+parser.add_argument("--as-test", action="store_true")
 parser.add_argument("--torch", action="store_true")
+parser.add_argument("--stop-iters", type=int, default=200)
 parser.add_argument("--stop-reward", type=float, default=0.0)
 parser.add_argument("--stop-timesteps", type=int, default=100000)

@@ -45,8 +48,9 @@ if __name__ == "__main__":
    ray.init()

    stop = {
-        "episode_reward_mean": args.stop_reward,
+        "training_iteration": args.stop_iters,
        "timesteps_total": args.stop_timesteps,
+        "episode_reward_mean": args.stop_reward,
    }

    if args.flat:
@@ -92,15 +96,9 @@ if __name__ == "__main__":
            "use_pytorch": args.torch,
        }

-        results = tune.run(
-            "PPO",
-            stop=stop,
-            config=config,
-        )
+        results = tune.run("PPO", stop=stop, config=config)

-    # Error if stop-reward not reached.
-    if results.trials[0].last_result["episode_reward_mean"] < \
-            args.stop_reward:
-        raise ValueError("`stop-reward` of {} not reached!".format(
-            args.stop_reward))
-    print("ok")
+    if args.as_test:
+        check_learning_achieved(results, args.stop_reward)
+
+    ray.shutdown()
@@ -0,0 +1,58 @@
+# Explains/tests Issues:
+# https://github.com/ray-project/ray/issues/6928
+# https://github.com/ray-project/ray/issues/6732
+
+import argparse
+from gym.spaces import Discrete, Box
+import numpy as np
+
+from ray.rllib.agents.ppo import PPOTrainer
+from ray.rllib.examples.env.random_env import RandomEnv
+from ray.rllib.examples.models.mobilenet_v2_with_lstm_models import \
+    MobileV2PlusRNNModel, TorchMobileV2PlusRNNModel
+from ray.rllib.models import ModelCatalog
+from ray.rllib.utils import try_import_tf
+
+tf = try_import_tf()
+
+cnn_shape = (4, 4, 3)
+# The torch version of MobileNetV2 does channels first.
+cnn_shape_torch = (3, 224, 224)
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--torch", action="store_true")
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+
+    # Register the framework-matching model class under the name "my_model".
+    ModelCatalog.register_custom_model(
+        "my_model", TorchMobileV2PlusRNNModel
+        if args.torch else MobileV2PlusRNNModel)
+
+    # Trainer config: custom model plus the spaces passed via env_config.
+    config = {
+        "use_pytorch": args.torch,
+        "model": {
+            "custom_model": "my_model",
+            # Extra config passed to the custom model's constructor as kwargs.
+            "custom_options": {
+                "cnn_shape": cnn_shape_torch if args.torch else cnn_shape,
+            },
+            "max_seq_len": 20,
+        },
+        "vf_share_layers": True,
+        "num_workers": 0,  # no parallelism
+        "env_config": {
+            "action_space": Discrete(2),
+            # Image-shaped Box observations with values in [0.0, 1.0].
+            "observation_space": Box(
+                0.0,
+                1.0,
+                shape=cnn_shape_torch if args.torch else cnn_shape,
+                dtype=np.float32)
+        },
+    }
+
+    trainer = PPOTrainer(config=config, env=RandomEnv)
+    print(trainer.train())
@@ -111,8 +111,10 @@ class TorchCentralizedCriticModel(TorchModelV2, nn.Module):

        # Central VF maps (obs, opp_obs, opp_act) -> vf_pred
        input_size = 6 + 6 + 2  # obs + opp_obs + opp_act
-        self.central_vf_dense = SlimFC(input_size, 16, activation_fn=nn.Tanh)
-        self.central_vf_out = SlimFC(16, 1)
+        self.central_vf = nn.Sequential(
+            SlimFC(input_size, 16, activation_fn=nn.Tanh),
+            SlimFC(16, 1),
+        )

    @override(ModelV2)
    def forward(self, input_dict, state, seq_lens):
@@ -122,10 +124,9 @@ class TorchCentralizedCriticModel(TorchModelV2, nn.Module):
    def central_value_function(self, obs, opponent_obs, opponent_actions):
        input_ = torch.cat([
            obs, opponent_obs,
-            torch.nn.functional.one_hot(opponent_actions, 2)
+            torch.nn.functional.one_hot(opponent_actions, 2).float()
        ], 1)
-        return torch.reshape(
-            self.central_vf_out(self.central_vf_dense(input_)), [-1])
+        return torch.reshape(self.central_vf(input_), [-1])

    @override(ModelV2)
    def value_function(self):
@@ -0,0 +1,164 @@
+from ray.rllib.models.model import Model, restore_original_dimensions
+from ray.rllib.models.modelv2 import ModelV2
+from ray.rllib.models.tf.tf_action_dist import Categorical
+from ray.rllib.models.tf.tf_modelv2 import TFModelV2
+from ray.rllib.models.tf.fcnet_v2 import FullyConnectedNetwork
+from ray.rllib.models.torch.torch_action_dist import TorchCategorical
+from ray.rllib.models.torch.torch_modelv2 import TorchModelV2
+from ray.rllib.models.torch.fcnet import FullyConnectedNetwork as TorchFC
+from ray.rllib.utils.annotations import override
+from ray.rllib.utils.framework import try_import_tf, try_import_torch
+from ray.rllib.offline import JsonReader
+
+tf = try_import_tf()
+torch, nn = try_import_torch()
+
+
+class CustomLossModel(TFModelV2):
+    """Custom model that adds an imitation loss on top of the policy loss."""
+
+    def __init__(self, obs_space, action_space, num_outputs, model_config,
+                 name):
+        super().__init__(obs_space, action_space, num_outputs, model_config,
+                         name)
+
+        self.fcnet = FullyConnectedNetwork(
+            self.obs_space,
+            self.action_space,
+            num_outputs,
+            model_config,
+            name="fcnet")
+        self.register_variables(self.fcnet.variables())
+
+    @override(ModelV2)
+    def forward(self, input_dict, state, seq_lens):
+        # Delegate the forward pass to the wrapped FCNet.
+        return self.fcnet(input_dict, state, seq_lens)
+
+    @override(ModelV2)
+    def custom_loss(self, policy_loss, loss_inputs):
+        # Build a JsonReader over the `input_files` given in custom_options.
+        reader = JsonReader(self.model_config["custom_options"]["input_files"])
+        input_ops = reader.tf_input_ops()
+
+        # Secondary loss: run the reader's observations through this model.
+        obs = restore_original_dimensions(
+            tf.cast(input_ops["obs"], tf.float32), self.obs_space)
+        logits, _ = self.forward({"obs": obs}, [], None)
+
+        # You can also add self-supervised losses easily by referencing tensors
+        # created by `self.fcnet`. For example, an autoencoder-style
+        # loss can be added as follows:
+        # ae_loss = squared_diff(
+        #     loss_inputs["obs"], Decoder(self.fcnet.last_layer))
+        print("FYI: You can also use these tensors: {}, ".format(loss_inputs))
+
+        # Imitation loss: mean negative log-prob of the reader's actions.
+        action_dist = Categorical(logits, self.model_config)
+        self.policy_loss = policy_loss
+        self.imitation_loss = tf.reduce_mean(
+            -action_dist.logp(input_ops["actions"]))
+        return policy_loss + 10 * self.imitation_loss
+
+    def custom_stats(self):
+        return {
+            "policy_loss": self.policy_loss,
+            "imitation_loss": self.imitation_loss,
+        }
+
+
+class DeprecatedCustomLossModelV1(Model):
+    """Model(V1) version of above custom-loss model."""
+
+    def _build_layers_v2(self, input_dict, num_outputs, options):
+        self.obs_in = input_dict["obs"]
+        with tf.variable_scope("shared", reuse=tf.AUTO_REUSE):
+            self.fcnet = FullyConnectedNetwork(input_dict, self.obs_space,
+                                               self.action_space, num_outputs,
+                                               options)
+        return self.fcnet.outputs, self.fcnet.last_layer
+
+    def custom_loss(self, policy_loss, loss_inputs):
+        # Build a JsonReader over the `input_files` given in custom_options.
+        reader = JsonReader(self.options["custom_options"]["input_files"])
+        input_ops = reader.tf_input_ops()
+
+        # Secondary loss: rebuild the layers (AUTO_REUSE scope) on reader obs.
+        obs = tf.cast(input_ops["obs"], tf.float32)
+        logits, _ = self._build_layers_v2({
+            "obs": restore_original_dimensions(obs, self.obs_space)
+        }, self.num_outputs, self.options)
+
+        # Self-supervised losses can be added by referencing tensors created in
+        # _build_layers_v2(), e.g. an autoencoder-style loss:
+        # ae_loss = squared_diff(
+        #     loss_inputs["obs"], Decoder(self.fcnet.last_layer))
+        # NOTE(review): _build_layers_v2() calls fcnet V1-style; check import.
+        print("FYI: You can also use these tensors: {}, ".format(loss_inputs))
+
+        # Imitation loss: mean negative log-prob of the reader's actions.
+        action_dist = Categorical(logits, self.options)
+        self.policy_loss = policy_loss
+        self.imitation_loss = tf.reduce_mean(
+            -action_dist.logp(input_ops["actions"]))
+        return policy_loss + 10 * self.imitation_loss
+
+    def custom_stats(self):
+        return {
+            "policy_loss": self.policy_loss,
+            "imitation_loss": self.imitation_loss,
+        }
+
+
+class TorchCustomLossModel(TorchModelV2, nn.Module):
+    """PyTorch version of the CustomLossModel above."""
+
+    def __init__(self, obs_space, action_space, num_outputs, model_config,
+                 name, input_files):
+        super().__init__(obs_space, action_space, num_outputs, model_config,
+                         name)
+        nn.Module.__init__(self)
+
+        self.input_files = input_files
+        self.fcnet = TorchFC(
+            self.obs_space,
+            self.action_space,
+            num_outputs,
+            model_config,
+            name="fcnet")
+
+    @override(ModelV2)
+    def forward(self, input_dict, state, seq_lens):
+        # Delegate the forward pass to the wrapped FCNet.
+        return self.fcnet(input_dict, state, seq_lens)
+
+    @override(ModelV2)
+    def custom_loss(self, policy_loss, loss_inputs):
+        # Build a JsonReader over the `input_files` passed to the constructor.
+        reader = JsonReader(self.input_files)
+        input_ops = reader.tf_input_ops()
+
+        # Secondary loss on file data. NOTE(review): TF ops in a torch model?
+        obs = restore_original_dimensions(
+            tf.cast(input_ops["obs"], tf.float32), self.obs_space)
+        logits, _ = self.forward({"obs": obs}, [], None)
+
+        # You can also add self-supervised losses easily by referencing tensors
+        # created by `self.fcnet`. For example, an autoencoder-style
+        # loss can be added as follows:
+        # ae_loss = squared_diff(
+        #     loss_inputs["obs"], Decoder(self.fcnet.last_layer))
+        print("FYI: You can also use these tensors: {}, ".format(loss_inputs))
+
+        # Imitation loss: mean negative log-prob of the reader's actions.
+        action_dist = TorchCategorical(logits, self.model_config)
+        self.policy_loss = policy_loss
+        self.imitation_loss = torch.mean(
+            -action_dist.logp(input_ops["actions"]))
+        return policy_loss + 10 * self.imitation_loss
+
+    def custom_stats(self):
+        return {
+            "policy_loss": self.policy_loss,
+            "imitation_loss": self.imitation_loss,
+        }
@@ -16,11 +16,11 @@ import random
 import ray
 from ray import tune
 from ray.rllib.examples.env.multi_agent import MultiAgentCartPole
+from ray.rllib.examples.models.shared_weights_model import \
+    SharedWeightsModel1, SharedWeightsModel2, TorchSharedWeightsModel
 from ray.rllib.models import ModelCatalog
-from ray.rllib.models.modelv2 import ModelV2
-from ray.rllib.models.tf.tf_modelv2 import TFModelV2
-from ray.rllib.utils import try_import_tf
-from ray.rllib.utils.annotations import override
+from ray.rllib.utils.framework import try_import_tf
+from ray.rllib.utils.test_utils import check_learning_achieved

 tf = try_import_tf()

@@ -28,89 +28,31 @@ parser = argparse.ArgumentParser()

 parser.add_argument("--num-agents", type=int, default=4)
 parser.add_argument("--num-policies", type=int, default=2)
-parser.add_argument("--num-iters", type=int, default=20)
+parser.add_argument("--stop-iters", type=int, default=20)
+parser.add_argument("--stop-reward", type=float, default=150)
+parser.add_argument("--stop-timesteps", type=int, default=100000)
 parser.add_argument("--simple", action="store_true")
 parser.add_argument("--num-cpus", type=int, default=0)
-
-
-class CustomModel1(TFModelV2):
-    def __init__(self, observation_space, action_space, num_outputs,
-                 model_config, name):
-        super().__init__(observation_space, action_space, num_outputs,
-                         model_config, name)
-
-        inputs = tf.keras.layers.Input(observation_space.shape)
-        # Example of (optional) weight sharing between two different policies.
-        # Here, we share the variables defined in the 'shared' variable scope
-        # by entering it explicitly with tf.AUTO_REUSE. This creates the
-        # variables for the 'fc1' layer in a global scope called 'shared'
-        # outside of the policy's normal variable scope.
-        with tf.variable_scope(
-                tf.VariableScope(tf.AUTO_REUSE, "shared"),
-                reuse=tf.AUTO_REUSE,
-                auxiliary_name_scope=False):
-            last_layer = tf.keras.layers.Dense(
-                units=64, activation=tf.nn.relu, name="fc1")(inputs)
-        output = tf.keras.layers.Dense(
-            units=num_outputs, activation=None, name="fc_out")(last_layer)
-        vf = tf.keras.layers.Dense(
-            units=1, activation=None, name="value_out")(last_layer)
-        self.base_model = tf.keras.models.Model(inputs, [output, vf])
-        self.register_variables(self.base_model.variables)
-
-    @override(ModelV2)
-    def forward(self, input_dict, state, seq_lens):
-        out, self._value_out = self.base_model(input_dict["obs"])
-        return out, []
-
-    @override(ModelV2)
-    def value_function(self):
-        return tf.reshape(self._value_out, [-1])
-
-
-class CustomModel2(TFModelV2):
-    def __init__(self, observation_space, action_space, num_outputs,
-                 model_config, name):
-        super().__init__(observation_space, action_space, num_outputs,
-                         model_config, name)
-
-        inputs = tf.keras.layers.Input(observation_space.shape)
-
-        # Weights shared with CustomModel1.
-        with tf.variable_scope(
-                tf.VariableScope(tf.AUTO_REUSE, "shared"),
-                reuse=tf.AUTO_REUSE,
-                auxiliary_name_scope=False):
-            last_layer = tf.keras.layers.Dense(
-                units=64, activation=tf.nn.relu, name="fc1")(inputs)
-        output = tf.keras.layers.Dense(
-            units=num_outputs, activation=None, name="fc_out")(last_layer)
-        vf = tf.keras.layers.Dense(
-            units=1, activation=None, name="value_out")(last_layer)
-        self.base_model = tf.keras.models.Model(inputs, [output, vf])
-        self.register_variables(self.base_model.variables)
-
-    @override(ModelV2)
-    def forward(self, input_dict, state, seq_lens):
-        out, self._value_out = self.base_model(input_dict["obs"])
-        return out, []
-
-    @override(ModelV2)
-    def value_function(self):
-        return tf.reshape(self._value_out, [-1])
-
+parser.add_argument("--as-test", action="store_true")
+parser.add_argument("--torch", action="store_true")

 if __name__ == "__main__":
    args = parser.parse_args()
+
    ray.init(num_cpus=args.num_cpus or None)

-    ModelCatalog.register_custom_model("model1", CustomModel1)
-    ModelCatalog.register_custom_model("model2", CustomModel2)
+    # Register the models to use.
+    mod1 = TorchSharedWeightsModel if args.torch else SharedWeightsModel1
+    mod2 = TorchSharedWeightsModel if args.torch else SharedWeightsModel2
+    ModelCatalog.register_custom_model("model1", mod1)
+    ModelCatalog.register_custom_model("model2", mod2)
+
+    # Get obs- and action Spaces.
    single_env = gym.make("CartPole-v0")
    obs_space = single_env.observation_space
    act_space = single_env.action_space

-    # Each policy can have a different configuration (including custom model)
+    # Each policy can have a different configuration (including custom model).
    def gen_policy(i):
        config = {
            "model": {
@@ -120,28 +62,35 @@ if __name__ == "__main__":
        }
        return (None, obs_space, act_space, config)

-    # Setup PPO with an ensemble of `num_policies` different policies
+    # Setup PPO with an ensemble of `num_policies` different policies.
    policies = {
        "policy_{}".format(i): gen_policy(i)
        for i in range(args.num_policies)
    }
    policy_ids = list(policies.keys())

-    tune.run(
-        "PPO",
-        stop={"training_iteration": args.num_iters},
-        config={
-            "env": MultiAgentCartPole,
-            "env_config": {
-                "num_agents": args.num_agents,
-            },
-            "log_level": "DEBUG",
-            "simple_optimizer": args.simple,
-            "num_sgd_iter": 10,
-            "multiagent": {
-                "policies": policies,
-                "policy_mapping_fn": (
-                    lambda agent_id: random.choice(policy_ids)),
-            },
+    config = {
+        "env": MultiAgentCartPole,
+        "env_config": {
+            "num_agents": args.num_agents,
        },
-    )
+        "log_level": "DEBUG",
+        "simple_optimizer": args.simple,
+        "num_sgd_iter": 10,
+        "multiagent": {
+            "policies": policies,
+            "policy_mapping_fn": (lambda agent_id: random.choice(policy_ids)),
+        },
+        "use_pytorch": args.torch,
+    }
+    stop = {
+        "episode_reward_mean": args.stop_reward,
+        "timesteps_total": args.stop_timesteps,
+        "training_iteration": args.stop_iters,
+    }
+
+    results = tune.run("PPO", stop=stop, config=config)
+
+    if args.as_test:
+        check_learning_achieved(results, args.stop_reward)
+    ray.shutdown()
@@ -18,32 +18,17 @@ import gym

 import ray
 from ray import tune
-from ray.rllib.examples.env.multi_agent import MultiAgentCartPole
-from ray.rllib.policy import Policy
 from ray.tune.registry import register_env
+from ray.rllib.examples.env.multi_agent import MultiAgentCartPole
+from ray.rllib.examples.policy.random_policy import RandomPolicy
+from ray.rllib.utils.test_utils import check_learning_achieved

 parser = argparse.ArgumentParser()
-parser.add_argument("--num-iters", type=int, default=20)
-
-
-class RandomPolicy(Policy):
-    """Hand-coded policy that returns random actions."""
-
-    def compute_actions(self,
-                        obs_batch,
-                        state_batches=None,
-                        prev_action_batch=None,
-                        prev_reward_batch=None,
-                        info_batch=None,
-                        episodes=None,
-                        **kwargs):
-        """Compute actions on a batch of observations."""
-        return [self.action_space.sample() for _ in obs_batch], [], {}
-
-    def learn_on_batch(self, samples):
-        """No learning."""
-        return {}
-
+parser.add_argument("--torch", action="store_true")
+parser.add_argument("--as-test", action="store_true")
+parser.add_argument("--stop-iters", type=int, default=20)
+parser.add_argument("--stop-reward", type=float, default=150)
+parser.add_argument("--stop-timesteps", type=int, default=100000)

 if __name__ == "__main__":
    args = parser.parse_args()
@@ -56,18 +41,32 @@ if __name__ == "__main__":
    obs_space = single_env.observation_space
    act_space = single_env.action_space

-    tune.run(
+    stop = {
+        "training_iteration": args.stop_iters,
+        "episode_reward_mean": args.stop_reward,
+        "timesteps_total": args.stop_timesteps,
+    }
+
+    results = tune.run(
        "PG",
-        stop={"training_iteration": args.num_iters},
+        stop=stop,
        config={
            "env": "multi_agent_cartpole",
            "multiagent": {
                "policies": {
-                    "pg_policy": (None, obs_space, act_space, {}),
+                    "pg_policy": (None, obs_space, act_space, {
+                        "use_pytorch": args.torch
+                    }),
                    "random": (RandomPolicy, obs_space, act_space, {}),
                },
                "policy_mapping_fn": (
                    lambda agent_id: ["pg_policy", "random"][agent_id % 2]),
            },
+            "use_pytorch": args.torch,
        },
    )
+
+    if args.as_test:
+        check_learning_achieved(results, args.stop_reward)
+
+    ray.shutdown()
@@ -12,19 +12,27 @@ import argparse
 import gym

 import ray
-from ray.rllib.agents.dqn.dqn import DQNTrainer
-from ray.rllib.agents.dqn.dqn_tf_policy import DQNTFPolicy
-from ray.rllib.agents.ppo.ppo import PPOTrainer
-from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy
+from ray.rllib.agents.dqn import DQNTrainer, DQNTFPolicy, DQNTorchPolicy
+from ray.rllib.agents.ppo import PPOTrainer, PPOTFPolicy, PPOTorchPolicy
 from ray.rllib.examples.env.multi_agent import MultiAgentCartPole
 from ray.tune.logger import pretty_print
 from ray.tune.registry import register_env

 parser = argparse.ArgumentParser()
-parser.add_argument("--num-iters", type=int, default=20)
+# Use torch for both policies.
+parser.add_argument("--torch", action="store_true")
+# Mix PPO=tf and DQN=torch if set.
+parser.add_argument("--mixed-torch-tf", action="store_true")
+parser.add_argument("--as-test", action="store_true")
+parser.add_argument("--stop-iters", type=int, default=20)
+parser.add_argument("--stop-reward", type=float, default=50)
+parser.add_argument("--stop-timesteps", type=int, default=100000)

 if __name__ == "__main__":
    args = parser.parse_args()
+    assert not (args.torch and args.mixed_torch_tf),\
+        "Use either --torch or --mixed-torch-tf, not both!"
+
    ray.init()

    # Simple environment with 4 independent cartpole entities
@@ -37,8 +45,10 @@ if __name__ == "__main__":
    # You can also have multiple policies per trainer, but here we just
    # show one each for PPO and DQN.
    policies = {
-        "ppo_policy": (PPOTFPolicy, obs_space, act_space, {}),
-        "dqn_policy": (DQNTFPolicy, obs_space, act_space, {}),
+        "ppo_policy": (PPOTorchPolicy if args.torch else PPOTFPolicy,
+                       obs_space, act_space, {}),
+        "dqn_policy": (DQNTorchPolicy if args.torch or args.mixed_torch_tf else
+                       DQNTFPolicy, obs_space, act_space, {}),
    }

    def policy_mapping_fn(agent_id):
@@ -59,6 +69,7 @@ if __name__ == "__main__":
            # disable filters, otherwise we would need to synchronize those
            # as well to the DQN agent
            "observation_filter": "NoFilter",
+            "use_pytorch": args.torch,
        })

    dqn_trainer = DQNTrainer(
@@ -71,6 +82,7 @@ if __name__ == "__main__":
            },
            "gamma": 0.95,
            "n_step": 3,
+            "use_pytorch": args.torch or args.mixed_torch_tf,
        })

    # You should see both the printed X and Y approach 200 as this trains:
@@ -78,17 +90,31 @@ if __name__ == "__main__":
    #   policy_reward_mean:
    #     dqn_policy: X
    #     ppo_policy: Y
-    for i in range(args.num_iters):
+    for i in range(args.stop_iters):
        print("== Iteration", i, "==")

        # improve the DQN policy
        print("-- DQN --")
-        print(pretty_print(dqn_trainer.train()))
+        result_dqn = dqn_trainer.train()
+        print(pretty_print(result_dqn))

        # improve the PPO policy
        print("-- PPO --")
-        print(pretty_print(ppo_trainer.train()))
+        result_ppo = ppo_trainer.train()
+        print(pretty_print(result_ppo))
+
+        # Test passed: both trainers exceeded the requested mean reward.
+        if args.as_test and \
+                result_dqn["episode_reward_mean"] > args.stop_reward and \
+                result_ppo["episode_reward_mean"] > args.stop_reward:
+            print("test passed (both agents above requested reward)")
+            raise SystemExit(0)  # quit() is REPL-only (site).

        # swap weights to synchronize
        dqn_trainer.set_weights(ppo_trainer.get_weights(["ppo_policy"]))
        ppo_trainer.set_weights(dqn_trainer.get_weights(["dqn_policy"]))
+
+    # Desired reward not reached.
+    if args.as_test:
+        raise ValueError("Desired reward ({}) not reached!".format(
+            args.stop_reward))
@@ -1,22 +1,20 @@
 import argparse
 from gym.spaces import Dict, Tuple, Box, Discrete
-import sys

 import ray
+import ray.tune as tune
 from ray.tune.registry import register_env
 from ray.rllib.examples.env.nested_space_repeat_after_me_env import \
    NestedSpaceRepeatAfterMeEnv
-from ray.rllib.utils import try_import_tree
-from ray.rllib.utils.framework import try_import_tf
-
-tf = try_import_tf()
-tree = try_import_tree()
+from ray.rllib.utils.test_utils import check_learning_achieved

 parser = argparse.ArgumentParser()
 parser.add_argument("--run", type=str, default="PPO")
 parser.add_argument("--torch", action="store_true")
-parser.add_argument("--stop", type=int, default=90)
-parser.add_argument("--max-trainstop", type=int, default=90)
+parser.add_argument("--as-test", action="store_true")
+parser.add_argument("--stop-reward", type=float, default=0.0)
+parser.add_argument("--stop-iters", type=int, default=100)
+parser.add_argument("--stop-timesteps", type=int, default=100000)
 parser.add_argument("--num-cpus", type=int, default=0)

 if __name__ == "__main__":
@@ -40,19 +38,23 @@ if __name__ == "__main__":
        },
        "entropy_coeff": 0.00005,  # We don't want high entropy in this Env.
        "gamma": 0.0,  # No history in Env (bandit problem).
-        "lr": 0.0003,
+        "lr": 0.0005,
        "num_envs_per_worker": 20,
-        "num_sgd_iter": 20,
+        "num_sgd_iter": 4,
        "num_workers": 0,
-        "use_pytorch": args.torch,
        "vf_loss_coeff": 0.01,
+        "use_pytorch": args.torch,
    }

-    import ray.rllib.agents.ppo as ppo
-    trainer = ppo.PPOTrainer(config=config)
-    for _ in range(100):
-        results = trainer.train()
-        print(results)
-        if results["episode_reward_mean"] > args.stop:
-            sys.exit(0)  # Learnt, exit gracefully.
-    sys.exit(1)  # Done, but did not learn, exit with error.
+    stop = {
+        "training_iteration": args.stop_iters,
+        "episode_reward_mean": args.stop_reward,
+        "timesteps_total": args.stop_timesteps,
+    }
+
+    results = tune.run(args.run, config=config, stop=stop)
+
+    if args.as_test:
+        check_learning_achieved(results, args.stop_reward)
+
+    ray.shutdown()
@@ -15,83 +15,34 @@ Working configurations are given below.
 """

 import argparse
-from gym.spaces import Box

 import ray
 from ray import tune
-from ray.rllib.agents.dqn.distributional_q_tf_model import \
-    DistributionalQTFModel
+from ray.tune.registry import register_env
 from ray.rllib.examples.env.parametric_actions_cartpole import \
    ParametricActionsCartPole
+from ray.rllib.examples.models.parametric_actions_model import \
+    ParametricActionsModel, TorchParametricActionsModel
 from ray.rllib.models import ModelCatalog
-from ray.rllib.models.tf.fcnet_v2 import FullyConnectedNetwork
-from ray.rllib.models.tf.tf_modelv2 import TFModelV2
-from ray.tune.registry import register_env
-from ray.rllib.utils import try_import_tf
-
-tf = try_import_tf()
+from ray.rllib.utils.test_utils import check_learning_achieved

 parser = argparse.ArgumentParser()
-parser.add_argument("--stop", type=int, default=200)
 parser.add_argument("--run", type=str, default="PPO")
-
-
-class ParametricActionsModel(DistributionalQTFModel, TFModelV2):
-    """Parametric action model that handles the dot product and masking.
-
-    This assumes the outputs are logits for a single Categorical action dist.
-    Getting this to work with a more complex output (e.g., if the action space
-    is a tuple of several distributions) is also possible but left as an
-    exercise to the reader.
-    """
-
-    def __init__(self,
-                 obs_space,
-                 action_space,
-                 num_outputs,
-                 model_config,
-                 name,
-                 true_obs_shape=(4, ),
-                 action_embed_size=2,
-                 **kw):
-        super(ParametricActionsModel, self).__init__(
-            obs_space, action_space, num_outputs, model_config, name, **kw)
-        self.action_embed_model = FullyConnectedNetwork(
-            Box(-1, 1, shape=true_obs_shape), action_space, action_embed_size,
-            model_config, name + "_action_embed")
-        self.register_variables(self.action_embed_model.variables())
-
-    def forward(self, input_dict, state, seq_lens):
-        # Extract the available actions tensor from the observation.
-        avail_actions = input_dict["obs"]["avail_actions"]
-        action_mask = input_dict["obs"]["action_mask"]
-
-        # Compute the predicted action embedding
-        action_embed, _ = self.action_embed_model({
-            "obs": input_dict["obs"]["cart"]
-        })
-
-        # Expand the model output to [BATCH, 1, EMBED_SIZE]. Note that the
-        # avail actions tensor is of shape [BATCH, MAX_ACTIONS, EMBED_SIZE].
-        intent_vector = tf.expand_dims(action_embed, 1)
-
-        # Batch dot product => shape of logits is [BATCH, MAX_ACTIONS].
-        action_logits = tf.reduce_sum(avail_actions * intent_vector, axis=2)
-
-        # Mask out invalid actions (use tf.float32.min for stability)
-        inf_mask = tf.maximum(tf.log(action_mask), tf.float32.min)
-        return action_logits + inf_mask, state
-
-    def value_function(self):
-        return self.action_embed_model.value_function()
-
+parser.add_argument("--torch", action="store_true")
+parser.add_argument("--as-test", action="store_true")
+parser.add_argument("--stop-iters", type=int, default=200)
+parser.add_argument("--stop-reward", type=float, default=150.0)
+parser.add_argument("--stop-timesteps", type=int, default=100000)

 if __name__ == "__main__":
    args = parser.parse_args()
    ray.init()

-    ModelCatalog.register_custom_model("pa_model", ParametricActionsModel)
    register_env("pa_cartpole", lambda _: ParametricActionsCartPole(10))
+    ModelCatalog.register_custom_model(
+        "pa_model", TorchParametricActionsModel
+        if args.torch else ParametricActionsModel)
+
    if args.run == "DQN":
        cfg = {
            # TODO(ekl) we need to set these to prevent the masked values
@@ -103,16 +54,25 @@ if __name__ == "__main__":
        }
    else:
        cfg = {}
-    tune.run(
-        args.run,
-        stop={
-            "episode_reward_mean": args.stop,
+
+    config = dict({
+        "env": "pa_cartpole",
+        "model": {
+            "custom_model": "pa_model",
        },
-        config=dict({
-            "env": "pa_cartpole",
-            "model": {
-                "custom_model": "pa_model",
-            },
-            "num_workers": 0,
-        }, **cfg),
-    )
+        "num_workers": 0,
+        "use_pytorch": args.torch,
+    }, **cfg)
+
+    stop = {
+        "training_iteration": args.stop_iters,
+        "timesteps_total": args.stop_timesteps,
+        "episode_reward_mean": args.stop_reward,
+    }
+
+    results = tune.run(args.run, stop=stop, config=config)
+
+    if args.as_test:
+        check_learning_achieved(results, args.stop_reward)
+
+    ray.shutdown()
@@ -8,99 +8,41 @@ This demonstrates running the following policies in competition:
 """

 import argparse
-import random
 from gym.spaces import Discrete
+import random

 from ray import tune
-from ray.rllib.agents.pg.pg import PGTrainer
-from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy
+from ray.rllib.agents.pg import PGTrainer, PGTFPolicy, PGTorchPolicy
+from ray.rllib.agents.registry import get_agent_class
 from ray.rllib.examples.env.rock_paper_scissors import RockPaperScissors
-from ray.rllib.policy.policy import Policy
-from ray.rllib.utils import try_import_tf
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--stop", type=int, default=1000)
+from ray.rllib.examples.policy.rock_paper_scissors_dummies import \
+    BeatLastHeuristic, AlwaysSameHeuristic
+from ray.rllib.utils.framework import try_import_tf, try_import_torch
+from ray.rllib.utils.test_utils import check_learning_achieved

 tf = try_import_tf()
+torch, _ = try_import_torch()
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--torch", action="store_true")
+parser.add_argument("--as-test", action="store_true")
+parser.add_argument("--stop-iters", type=int, default=150)
+parser.add_argument("--stop-reward", type=float, default=1000.0)
+parser.add_argument("--stop-timesteps", type=int, default=100000)


-class AlwaysSameHeuristic(Policy):
-    """Pick a random move and stick with it for the entire episode."""
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.exploration = self._create_exploration()
-
-    def get_initial_state(self):
-        return [
-            random.choice([
-                RockPaperScissors.ROCK, RockPaperScissors.PAPER,
-                RockPaperScissors.SCISSORS
-            ])
-        ]
-
-    def compute_actions(self,
-                        obs_batch,
-                        state_batches=None,
-                        prev_action_batch=None,
-                        prev_reward_batch=None,
-                        info_batch=None,
-                        episodes=None,
-                        **kwargs):
-        return state_batches[0], state_batches, {}
-
-    def learn_on_batch(self, samples):
-        pass
-
-    def get_weights(self):
-        pass
-
-    def set_weights(self, weights):
-        pass
-
-
-class BeatLastHeuristic(Policy):
-    """Play the move that would beat the last move of the opponent."""
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.exploration = self._create_exploration()
-
-    def compute_actions(self,
-                        obs_batch,
-                        state_batches=None,
-                        prev_action_batch=None,
-                        prev_reward_batch=None,
-                        info_batch=None,
-                        episodes=None,
-                        **kwargs):
-        def successor(x):
-            if x[RockPaperScissors.ROCK] == 1:
-                return RockPaperScissors.PAPER
-            elif x[RockPaperScissors.PAPER] == 1:
-                return RockPaperScissors.SCISSORS
-            elif x[RockPaperScissors.SCISSORS] == 1:
-                return RockPaperScissors.ROCK
-
-        return [successor(x) for x in obs_batch], [], {}
-
-    def learn_on_batch(self, samples):
-        pass
-
-    def get_weights(self):
-        pass
-
-    def set_weights(self, weights):
-        pass
-
-
-def run_same_policy(args):
+def run_same_policy(args, stop):
    """Use the same policy for both agents (trivial case)."""
+    config = {
+        "env": RockPaperScissors,
+        "use_pytorch": args.torch,
+    }

-    tune.run(
-        "PG",
-        stop={"timesteps_total": args.stop},
-        config={"env": RockPaperScissors})
+    results = tune.run("PG", config=config, stop=stop)
+
+    if args.as_test:
+        # Check vs 0.0 as we are playing a zero-sum game.
+        check_learning_achieved(results, 0.0)


 def run_heuristic_vs_learned(args, use_lstm=False, trainer="PG"):
@@ -134,16 +76,35 @@ def run_heuristic_vs_learned(args, use_lstm=False, trainer="PG"):
                "learned": (None, Discrete(3), Discrete(3), {
                    "model": {
                        "use_lstm": use_lstm
-                    }
+                    },
+                    "use_pytorch": args.torch,
                }),
            },
            "policy_mapping_fn": select_policy,
        },
+        "use_pytorch": args.torch,
    }
-    tune.run(trainer, stop={"timesteps_total": args.stop}, config=config)
+    cls = get_agent_class(trainer) if isinstance(trainer, str) else trainer
+    trainer_obj = cls(config=config)
+    env = trainer_obj.workers.local_worker().env
+    for _ in range(args.stop_iters):
+        results = trainer_obj.train()
+        print(results)
+        # Reward (difference) reached -> all good, return.
+        if env.player1_score - env.player2_score > args.stop_reward:
+            return
+        # Timesteps exhausted without reaching the reward -> stop training.
+        elif results["timesteps_total"] > args.stop_timesteps:
+            break
+
+    # Reward (difference) not reached: Error if `as_test`.
+    if args.as_test:
+        raise ValueError(
+            "Desired reward difference ({}) not reached! Only got to {}.".
+            format(args.stop_reward, env.player1_score - env.player2_score))


-def run_with_custom_entropy_loss(args):
+def run_with_custom_entropy_loss(args, stop):
    """Example of customizing the loss function of an existing policy.

    This performs about the same as the default loss does."""
@@ -151,24 +112,44 @@ def run_with_custom_entropy_loss(args):
    def entropy_policy_gradient_loss(policy, model, dist_class, train_batch):
        logits, _ = model.from_batch(train_batch)
        action_dist = dist_class(logits, model)
-        return (-0.1 * action_dist.entropy() - tf.reduce_mean(
-            action_dist.logp(train_batch["actions"]) *
-            train_batch["advantages"]))
+        if args.torch:
+            # required by PGTorchPolicy's stats fn.
+            policy.pi_err = torch.tensor([0.0])
+            return torch.mean(-0.1 * action_dist.entropy() -
+                              (action_dist.logp(train_batch["actions"]) *
+                               train_batch["advantages"]))
+        else:
+            return (-0.1 * action_dist.entropy() - tf.reduce_mean(
+                action_dist.logp(train_batch["actions"]) *
+                train_batch["advantages"]))

-    EntropyPolicy = PGTFPolicy.with_updates(
+    policy_cls = PGTorchPolicy if args.torch else PGTFPolicy
+    EntropyPolicy = policy_cls.with_updates(
        loss_fn=entropy_policy_gradient_loss)
+
    EntropyLossPG = PGTrainer.with_updates(
        name="EntropyPG", get_policy_class=lambda _: EntropyPolicy)
+
    run_heuristic_vs_learned(args, use_lstm=True, trainer=EntropyLossPG)


 if __name__ == "__main__":
    args = parser.parse_args()
+
+    stop = {
+        "training_iteration": args.stop_iters,
+        "timesteps_total": args.stop_timesteps,
+        "episode_reward_mean": args.stop_reward,
+    }
+
+    run_same_policy(args, stop=stop)
+    print("run_same_policy: ok.")
+
    run_heuristic_vs_learned(args, use_lstm=False)
    print("run_heuristic_vs_learned(w/o lstm): ok.")
-    run_same_policy(args)
-    print("run_same_policy: ok.")
+
    run_heuristic_vs_learned(args, use_lstm=True)
    print("run_heuristic_vs_learned (w/ lstm): ok.")
-    run_with_custom_entropy_loss(args)
+
+    run_with_custom_entropy_loss(args, stop=stop)
    print("run_with_custom_entropy_loss: ok.")
@@ -13,8 +13,8 @@ import ray
 from ray import tune
 from ray.rllib.evaluation import RolloutWorker
 from ray.rllib.evaluation.metrics import collect_metrics
+from ray.rllib.policy.policy import Policy
 from ray.rllib.policy.sample_batch import SampleBatch
-from ray.rllib.policy.tests.test_policy import TestPolicy

 parser = argparse.ArgumentParser()
 parser.add_argument("--gpu", action="store_true")
@@ -23,7 +23,7 @@ parser.add_argument("--num-workers", type=int, default=2)
 parser.add_argument("--num-cpus", type=int, default=0)


-class CustomPolicy(TestPolicy):
+class CustomPolicy(Policy):
    """Example of a custom policy written from scratch.

    You might find it more convenient to extend TF/TorchPolicy instead
@@ -13,8 +13,10 @@ from ray import tune
 from ray.rllib.agents.trainer_template import build_trainer
 from ray.rllib.agents.dqn.dqn import DEFAULT_CONFIG as DQN_CONFIG
 from ray.rllib.agents.dqn.dqn_tf_policy import DQNTFPolicy
+from ray.rllib.agents.dqn.dqn_torch_policy import DQNTorchPolicy
 from ray.rllib.agents.ppo.ppo import DEFAULT_CONFIG as PPO_CONFIG
 from ray.rllib.agents.ppo.ppo_tf_policy import PPOTFPolicy
+from ray.rllib.agents.ppo.ppo_torch_policy import PPOTorchPolicy
 from ray.rllib.evaluation.worker_set import WorkerSet
 from ray.rllib.execution.common import _get_shared_metrics
 from ray.rllib.execution.concurrency_ops import Concurrently
@@ -25,10 +27,16 @@ from ray.rllib.execution.replay_ops import StoreToReplayBuffer, Replay
 from ray.rllib.execution.train_ops import TrainOneStep, UpdateTargetNetwork
 from ray.rllib.examples.env.multi_agent import MultiAgentCartPole
 from ray.rllib.optimizers.async_replay_optimizer import LocalReplayBuffer
+from ray.rllib.utils.test_utils import check_learning_achieved
 from ray.tune.registry import register_env

 parser = argparse.ArgumentParser()
-parser.add_argument("--num-iters", type=int, default=20)
+parser.add_argument("--as-test", action="store_true")
+parser.add_argument("--torch", action="store_true")
+parser.add_argument("--mixed-torch-tf", action="store_true")
+parser.add_argument("--stop-iters", type=int, default=20)
+parser.add_argument("--stop-reward", type=float, default=150.0)
+parser.add_argument("--stop-timesteps", type=int, default=100000)


 def custom_training_workflow(workers: WorkerSet, config: dict):
@@ -90,6 +98,9 @@ def custom_training_workflow(workers: WorkerSet, config: dict):

 if __name__ == "__main__":
    args = parser.parse_args()
+    if args.torch and args.mixed_torch_tf:
+        parser.error("Use either --torch or --mixed-torch-tf, not both!")
+
    ray.init()

    # Simple environment with 4 independent cartpole entities
@@ -102,8 +113,10 @@ if __name__ == "__main__":
    # Note that since the trainer below does not include a default policy or
    # policy configs, we have to explicitly set it in the multiagent config:
    policies = {
-        "ppo_policy": (PPOTFPolicy, obs_space, act_space, PPO_CONFIG),
-        "dqn_policy": (DQNTFPolicy, obs_space, act_space, DQN_CONFIG),
+        "ppo_policy": (PPOTorchPolicy if args.torch or args.mixed_torch_tf else
+                       PPOTFPolicy, obs_space, act_space, PPO_CONFIG),
+        "dqn_policy": (DQNTorchPolicy if args.torch else DQNTFPolicy,
+                       obs_space, act_space, DQN_CONFIG),
    }

    def policy_mapping_fn(agent_id):
@@ -117,16 +130,27 @@ if __name__ == "__main__":
        default_policy=None,
        execution_plan=custom_training_workflow)

-    tune.run(
-        MyTrainer,
-        stop={"training_iteration": args.num_iters},
-        config={
-            "rollout_fragment_length": 50,
-            "num_workers": 0,
-            "env": "multi_agent_cartpole",
-            "multiagent": {
-                "policies": policies,
-                "policy_mapping_fn": policy_mapping_fn,
-                "policies_to_train": ["dqn_policy", "ppo_policy"],
-            },
-        })
+    config = {
+        "rollout_fragment_length": 50,
+        "num_workers": 0,
+        "env": "multi_agent_cartpole",
+        "multiagent": {
+            "policies": policies,
+            "policy_mapping_fn": policy_mapping_fn,
+            "policies_to_train": ["dqn_policy", "ppo_policy"],
+        },
+        "use_pytorch": args.torch,
+    }
+
+    stop = {
+        "training_iteration": args.stop_iters,
+        "timesteps_total": args.stop_timesteps,
+        "episode_reward_mean": args.stop_reward,
+    }
+
+    results = tune.run(MyTrainer, config=config, stop=stop)
+
+    if args.as_test:
+        check_learning_achieved(results, args.stop_reward)
+
+    ray.shutdown()
@@ -17,11 +17,15 @@ from ray import tune
 from ray.tune import register_env, grid_search
 from ray.rllib.env.multi_agent_env import ENV_STATE
 from ray.rllib.examples.env.two_step_game import TwoStepGame
+from ray.rllib.utils.test_utils import check_learning_achieved

 parser = argparse.ArgumentParser()
-parser.add_argument("--stop", type=int, default=50000)
 parser.add_argument("--run", type=str, default="PG")
 parser.add_argument("--num-cpus", type=int, default=0)
+parser.add_argument("--as-test", action="store_true")
+parser.add_argument("--torch", action="store_true")
+parser.add_argument("--stop-reward", type=float, default=7.0)
+parser.add_argument("--stop-timesteps", type=int, default=50000)

 if __name__ == "__main__":
    args = parser.parse_args()
@@ -73,6 +77,7 @@ if __name__ == "__main__":
                },
                "policy_mapping_fn": lambda x: "pol1" if x == 0 else "pol2",
            },
+            "use_pytorch": args.torch,
        }
        group = False
    elif args.run == "QMIX":
@@ -87,6 +92,7 @@ if __name__ == "__main__":
                "separate_state_space": True,
                "one_hot_state_encoding": True
            },
+            "use_pytorch": args.torch,
        }
        group = True
    elif args.run == "APEX_QMIX":
@@ -107,6 +113,7 @@ if __name__ == "__main__":
                "separate_state_space": True,
                "one_hot_state_encoding": True
            },
+            "use_pytorch": args.torch,
        }
        group = True
    else:
@@ -114,12 +121,19 @@ if __name__ == "__main__":
        group = False

    ray.init(num_cpus=args.num_cpus or None)
-    tune.run(
-        args.run,
-        stop={
-            "timesteps_total": args.stop,
-        },
-        config=dict(config, **{
-            "env": "grouped_twostep" if group else TwoStepGame,
-        }),
-    )
+
+    stop = {
+        "episode_reward_mean": args.stop_reward,
+        "timesteps_total": args.stop_timesteps,
+    }
+
+    config = dict(config, **{
+        "env": "grouped_twostep" if group else TwoStepGame,
+    })
+
+    results = tune.run(args.run, stop=stop, config=config)
+
+    if args.as_test:
+        check_learning_achieved(results, args.stop_reward)
+
+    ray.shutdown()
@@ -7,16 +7,16 @@ from ray.tune.registry import register_env
 from ray.rllib.agents.pg import PGTrainer
 from ray.rllib.agents.pg.pg_tf_policy import PGTFPolicy
 from ray.rllib.agents.dqn.dqn_tf_policy import DQNTFPolicy
+from ray.rllib.env.base_env import _MultiAgentEnvToBaseEnv
+from ray.rllib.evaluation.rollout_worker import RolloutWorker
+from ray.rllib.evaluation.metrics import collect_metrics
+from ray.rllib.evaluation.worker_set import WorkerSet
 from ray.rllib.examples.env.multi_agent import MultiAgentCartPole, \
    BasicMultiAgent, EarlyDoneMultiAgent, RoundRobinMultiAgent
+from ray.rllib.examples.policy.random_policy import RandomPolicy
 from ray.rllib.optimizers import (SyncSamplesOptimizer, SyncReplayOptimizer,
                                  AsyncGradientsOptimizer)
 from ray.rllib.tests.test_rollout_worker import MockPolicy
-from ray.rllib.evaluation.rollout_worker import RolloutWorker
-from ray.rllib.policy.tests.test_policy import TestPolicy
-from ray.rllib.evaluation.metrics import collect_metrics
-from ray.rllib.evaluation.worker_set import WorkerSet
-from ray.rllib.env.base_env import _MultiAgentEnvToBaseEnv


 def one_hot(i, n):
@@ -297,7 +297,7 @@ class TestMultiAgentEnv(unittest.TestCase):
    def test_custom_rnn_state_values(self):
        h = {"some": {"arbitrary": "structure", "here": [1, 2, 3]}}

-        class StatefulPolicy(TestPolicy):
+        class StatefulPolicy(RandomPolicy):
            def compute_actions(self,
                                obs_batch,
                                state_batches=None,
@@ -13,13 +13,13 @@ from ray.rllib.env.vector_env import VectorEnv
 from ray.rllib.evaluation.rollout_worker import RolloutWorker
 from ray.rllib.evaluation.metrics import collect_metrics
 from ray.rllib.evaluation.postprocessing import compute_advantages
-from ray.rllib.policy.tests.test_policy import TestPolicy
+from ray.rllib.examples.policy.random_policy import RandomPolicy
 from ray.rllib.policy.sample_batch import DEFAULT_POLICY_ID, SampleBatch
 from ray.rllib.utils.test_utils import check
 from ray.tune.registry import register_env


-class MockPolicy(TestPolicy):
+class MockPolicy(RandomPolicy):
    def compute_actions(self,
                        obs_batch,
                        state_batches=None,
@@ -40,7 +40,7 @@ class MockPolicy(TestPolicy):
            batch, 100.0, 0.9, use_gae=False, use_critic=False)


-class BadPolicy(MockPolicy):
+class BadPolicy(RandomPolicy):
    def compute_actions(self,
                        obs_batch,
                        state_batches=None,