ci: Redo format.sh --all script & backfill lint fixes (#9956)

2026-06-27 17:49:47 +08:00 · 2020-08-07 16:49:49 -07:00
parent 1d01c668f0
commit 8e76796fd0
147 changed files with 702 additions and 636 deletions
@@ -1,6 +1,5 @@
 import ray

-
 ray.init()


@@ -1,6 +1,5 @@
 import ray

-
 ray.init()


@@ -313,7 +313,7 @@ lint_readme() {
 }

 lint_scripts() {
-  "${ROOT_DIR}"/format.sh --all
+  FORMAT_SH_PRINT_DIFF=1 "${ROOT_DIR}"/format.sh --all
 }

 lint_bazel() {
@@ -46,11 +46,6 @@ builtin cd "$(dirname "${BASH_SOURCE:-$0}")"
 ROOT="$(git rev-parse --show-toplevel)"
 builtin cd "$ROOT" || exit 1

-# Add the upstream remote if it doesn't exist
-if ! git remote -v | grep -q upstream; then
-    git remote add 'upstream' 'https://github.com/ray-project/ray.git'
-fi
-
 FLAKE8_VERSION=$(flake8 --version | awk '{print $1}')
 YAPF_VERSION=$(yapf --version | awk '{print $2}')
 SHELLCHECK_VERSION=$(shellcheck --version | awk '/^version:/ {print $2}')
@@ -73,9 +68,6 @@ else
    echo "WARNING: clang-format is not installed!"
 fi

-# Only fetch master since that's the branch we're diffing against.
-git fetch upstream master || true
-
 SHELLCHECK_FLAGS=(
  --exclude=1090  # "Can't follow non-constant source. Use a directive to specify location."
  --exclude=1091  # "Not following {file} due to some error"
@@ -99,7 +91,16 @@ YAPF_EXCLUDES=(
    '--exclude' 'python/ray/thirdparty_files/*'
 )

-FLAKE8_EXCLUDES="python/ray/core/generated/,streaming/python/generated,doc/source/conf.py,python/ray/cloudpickle/,python/ray/thirdparty_files/"
+GIT_LS_EXCLUDES=(
+  ':(exclude)python/ray/cloudpickle/'
+)
+
+# TODO(barakmich): This should be cleaned up. I've at least consolidated the
+# copies of these arguments into this one location, but the long-term answer
+# is to actually make a flake8 config file.
+FLAKE8_EXCLUDE="--exclude=python/ray/core/generated/,streaming/python/generated,doc/source/conf.py,python/ray/cloudpickle/,python/ray/thirdparty_files/,python/build/,python/.eggs/"
+FLAKE8_IGNORES="--ignore=C408,E121,E123,E126,E226,E24,E704,W503,W504,W605"
+FLAKE8_PYX_IGNORES="--ignore=C408,E121,E123,E126,E211,E225,E226,E227,E24,E704,E999,W503,W504,W605"

 shellcheck_scripts() {
  shellcheck "${SHELLCHECK_FLAGS[@]}" "$@"
@@ -110,7 +111,7 @@ shellcheck_bazel() {
 }

 # Format specified files
-format() {
+format_files() {
    local shell_files=() python_files=() bazel_files=()

    local name
@@ -162,6 +163,46 @@ format() {
    fi
 }

+# Format and lint all tracked files. The diff itself is printed by the caller
+# (the `--all` branch) when FORMAT_SH_PRINT_DIFF is set, e.g. for travis.
+format_all() {
+    echo "$(date)" "YAPF...."
+    git ls-files -- '*.py' "${GIT_LS_EXCLUDES[@]}" | xargs -P 10 \
+      yapf --in-place "${YAPF_EXCLUDES[@]}" "${YAPF_FLAGS[@]}"
+    # Test the command directly: `[ $status ]` is true for any non-empty
+    # string (including "1"), so a saved exit status never skips flake8.
+    if command -v flake8 >/dev/null 2>&1; then
+      echo "$(date)" "Flake8...."
+      git ls-files -- '*.py' "${GIT_LS_EXCLUDES[@]}" | xargs -P 5 \
+        flake8 --inline-quotes '"' --no-avoid-escape  "$FLAKE8_EXCLUDE" "$FLAKE8_IGNORES"
+
+      git ls-files -- '*.pyx' '*.pxd' '*.pxi' "${GIT_LS_EXCLUDES[@]}" | xargs -P 5 \
+        flake8 --inline-quotes '"' --no-avoid-escape "$FLAKE8_EXCLUDE" "$FLAKE8_PYX_IGNORES"
+    fi
+
+    echo "$(date)" "clang-format...."
+    if command -v clang-format >/dev/null; then
+      git ls-files -- '*.cc' '*.h' "${GIT_LS_EXCLUDES[@]}" | xargs -P 5 clang-format -i
+    fi
+
+    if command -v shellcheck >/dev/null; then
+      echo "$(date)" "shellcheck bazel...."
+      shellcheck_bazel
+
+      local shell_files non_shell_files
+      non_shell_files=($(git ls-files -- ':(exclude)*.sh'))
+      shell_files=($(git ls-files -- '*.sh'))
+      if [ 0 -lt "${#non_shell_files[@]}" ]; then
+        shell_files+=($(git --no-pager grep -l -- '^#!\(/usr\)\?/bin/\(env \+\)\?\(ba\)\?sh' "${non_shell_files[@]}" || true))
+      fi
+      if [ 0 -lt "${#shell_files[@]}" ]; then
+        echo "$(date)" "shellcheck scripts...."
+        shellcheck_scripts "${shell_files[@]}"
+      fi
+    fi
+    echo "$(date)" "done!"
+}
+
 # Format files that differ from main branch. Ignores dirs that are not slated
 # for autoformat yet.
 format_changed() {
@@ -178,17 +219,14 @@ format_changed() {
             yapf --in-place "${YAPF_EXCLUDES[@]}" "${YAPF_FLAGS[@]}"
        if which flake8 >/dev/null; then
            git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- '*.py' | xargs -P 5 \
-                 flake8 --inline-quotes '"' --no-avoid-escape --exclude="$FLAKE8_EXCLUDES,rllib/" --ignore=C408,E121,E123,E126,E226,E24,E704,W503,W504,W605
-            # Ignore F821 for rllib flake8 checking (produces errors for type annotations using quotes (non-imported classes)).
-            git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- '*.py' | xargs -P 5 \
-                 flake8 --inline-quotes '"' --no-avoid-escape --exclude="$FLAKE8_EXCLUDES" --filename="rllib/" --ignore=C408,E121,E123,E126,E226,E24,E704,W503,W504,W605,F821
+                 flake8 --inline-quotes '"' --no-avoid-escape "$FLAKE8_EXCLUDE" "$FLAKE8_IGNORES"
        fi
    fi

    if ! git diff --diff-filter=ACRM --quiet --exit-code "$MERGEBASE" -- '*.pyx' '*.pxd' '*.pxi' &>/dev/null; then
        if which flake8 >/dev/null; then
            git diff --name-only --diff-filter=ACRM "$MERGEBASE" -- '*.pyx' '*.pxd' '*.pxi' | xargs -P 5 \
-                 flake8 --inline-quotes '"' --no-avoid-escape --exclude="$FLAKE8_EXCLUDES" --ignore=C408,E121,E123,E126,E211,E225,E226,E227,E24,E704,E999,W503,W504,W605
+                 flake8 --inline-quotes '"' --no-avoid-escape "$FLAKE8_EXCLUDE" "$FLAKE8_PYX_IGNORES"
        fi
    fi

@@ -216,35 +254,25 @@ format_changed() {
    fi
 }

-# Format all files, and print the diff to stdout for travis.
-format_all() {
-    # Ignore F821 for rllib flake8 checking (produces errors for type annotations using quotes (non-imported classes)).
-    flake8 --inline-quotes '"' --no-avoid-escape --exclude="$FLAKE8_EXCLUDES,rllib/" --ignore=C408,E121,E123,E126,E226,E24,E704,W503,W504,W605
-    flake8 --inline-quotes '"' --no-avoid-escape --exclude="$FLAKE8_EXCLUDES" --filename="rllib/" --ignore=C408,E121,E123,E126,E226,E24,E704,W503,W504,W605,F821
-
-    yapf --diff "${YAPF_FLAGS[@]}" "${YAPF_EXCLUDES[@]}" test python
-
-    local shell_files
-    # shellcheck disable=SC2207
-    shell_files=($(
-      git -C "${ROOT}" ls-files --exclude-standard HEAD -- "*.sh" &&
-      { git -C "${ROOT}" --no-pager grep -l '^#!\(/usr\)\?/bin/\(env \+\)\?\(ba\)\?sh' ":(exclude)*.sh" || true; }
-    ))
-    if [ 0 -lt "${#shell_files[@]}" ]; then
-      shellcheck_scripts "${shell_files[@]}"
-    fi
-    shellcheck_bazel
-}

 # This flag formats individual files. --files *must* be the first command line
 # arg to use this option.
 if [ "${1-}" == '--files' ]; then
-    format "${@:2}"
+    format_files "${@:2}"
    # If `--all` is passed, then any further arguments are ignored and the
    # entire python directory is formatted.
 elif [ "${1-}" == '--all' ]; then
-    format_all
+    format_all "${@}"
+    if [ -n "${FORMAT_SH_PRINT_DIFF-}" ]; then git --no-pager diff; fi
 else
+    # Add the upstream remote if it doesn't exist
+    if ! git remote -v | grep -q upstream; then
+        git remote add 'upstream' 'https://github.com/ray-project/ray.git'
+    fi
+
+    # Only fetch master since that's the branch we're diffing against.
+    git fetch upstream master || true
+
    # Format only the files that changed in last commit.
    format_changed
 fi
@@ -1,13 +1,14 @@

 #pragma once

-#include <memory>
-
 #include <ray/api/generated/actor_funcs.generated.h>
 #include <ray/api/generated/create_funcs.generated.h>
 #include <ray/api/generated/funcs.generated.h>
 #include <ray/api/ray_runtime.h>
+
+#include <memory>
 #include <msgpack.hpp>
+
 #include "ray/core.h"
 namespace ray {
 namespace api {
@@ -232,13 +233,10 @@ inline ActorTaskCaller<ReturnType> Ray::CallActorInternal(FuncType &actor_func,
  return ActorTaskCaller<ReturnType>(runtime_, actor.ID(), ptr, buffer);
 }

-#include <ray/api/generated/exec_funcs.generated.h>
-
-#include <ray/api/generated/call_funcs_impl.generated.h>
-
-#include <ray/api/generated/create_actors_impl.generated.h>
-
 #include <ray/api/generated/call_actors_impl.generated.h>
+#include <ray/api/generated/call_funcs_impl.generated.h>
+#include <ray/api/generated/create_actors_impl.generated.h>
+#include <ray/api/generated/exec_funcs.generated.h>

 }  // namespace api
 }  // namespace ray
@@ -2,6 +2,7 @@
 #pragma once

 #include <ray/api/serializer.h>
+
 #include <msgpack.hpp>

 namespace ray {
@@ -2,9 +2,8 @@
 #pragma once

 #include <memory>
-#include <utility>
-
 #include <msgpack.hpp>
+#include <utility>

 #include "ray/core.h"

@@ -1,13 +1,14 @@

 #pragma once

+#include <ray/api/wait_result.h>
+
 #include <cstdint>
 #include <memory>
 #include <msgpack.hpp>
 #include <typeinfo>
 #include <vector>

-#include <ray/api/wait_result.h>
 #include "ray/core.h"

 namespace ray {
@@ -2,6 +2,7 @@
 #pragma once

 #include <ray/api/ray_exception.h>
+
 #include <msgpack.hpp>

 namespace ray {
@@ -2,6 +2,7 @@
 #pragma once

 #include <vector>
+
 #include "ray/core.h"

 namespace ray {
@@ -1,7 +1,7 @@

 #include <ray/api.h>
-
 #include <ray/api/ray_config.h>
+
 #include "runtime/abstract_ray_runtime.h"

 namespace ray {
@@ -1,11 +1,12 @@

 #include "abstract_ray_runtime.h"

-#include <cassert>
-
 #include <ray/api.h>
 #include <ray/api/ray_config.h>
 #include <ray/api/ray_exception.h>
+
+#include <cassert>
+
 #include "../util/address_helper.h"
 #include "../util/process_helper.h"
 #include "local_mode_ray_runtime.h"
@@ -1,11 +1,12 @@

 #pragma once

-#include <mutex>
-
 #include <ray/api/ray_config.h>
 #include <ray/api/ray_runtime.h>
+
 #include <msgpack.hpp>
+#include <mutex>
+
 #include "./object/object_store.h"
 #include "./task/task_executor.h"
 #include "./task/task_submitter.h"
@@ -2,6 +2,7 @@
 #include "local_mode_ray_runtime.h"

 #include <ray/api.h>
+
 #include "../util/address_helper.h"
 #include "./object/local_mode_object_store.h"
 #include "./object/object_store.h"
@@ -2,6 +2,7 @@
 #pragma once

 #include <unordered_map>
+
 #include "abstract_ray_runtime.h"
 #include "ray/core.h"

@@ -2,6 +2,7 @@
 #include "native_ray_runtime.h"

 #include <ray/api.h>
+
 #include "../util/address_helper.h"
 #include "./object/native_object_store.h"
 #include "./object/object_store.h"
@@ -2,6 +2,7 @@
 #pragma once

 #include <unordered_map>
+
 #include "abstract_ray_runtime.h"
 #include "ray/core.h"

@@ -1,12 +1,14 @@

+#include "local_mode_object_store.h"
+
+#include <ray/api/ray_exception.h>
+
 #include <algorithm>
 #include <chrono>
 #include <list>
 #include <thread>

-#include <ray/api/ray_exception.h>
 #include "../abstract_ray_runtime.h"
-#include "local_mode_object_store.h"

 namespace ray {
 namespace api {
@@ -2,10 +2,10 @@
 #pragma once

 #include <unordered_map>
-#include "ray/core.h"

 #include "../local_mode_ray_runtime.h"
 #include "object_store.h"
+#include "ray/core.h"

 namespace ray {
 namespace api {
@@ -1,12 +1,14 @@

+#include "native_object_store.h"
+
+#include <ray/api/ray_exception.h>
+
 #include <algorithm>
 #include <chrono>
 #include <list>
 #include <thread>

-#include <ray/api/ray_exception.h>
 #include "../abstract_ray_runtime.h"
-#include "native_object_store.h"

 namespace ray {
 namespace api {
@@ -2,10 +2,10 @@
 #pragma once

 #include <unordered_map>
-#include "ray/core.h"

 #include "../native_ray_runtime.h"
 #include "object_store.h"
+#include "ray/core.h"

 namespace ray {
 namespace api {
@@ -1,9 +1,9 @@

 #pragma once

-#include <memory>
-
 #include <ray/api/wait_result.h>
+
+#include <memory>
 #include <msgpack.hpp>

 namespace ray {
@@ -2,6 +2,7 @@
 #pragma once

 #include <msgpack.hpp>
+
 #include "ray/core.h"

 namespace ray {
@@ -1,11 +1,13 @@

+#include "local_mode_task_submitter.h"
+
+#include <ray/api/ray_exception.h>
+
 #include <boost/asio/post.hpp>
 #include <memory>

-#include <ray/api/ray_exception.h>
 #include "../../util/address_helper.h"
 #include "../abstract_ray_runtime.h"
-#include "local_mode_task_submitter.h"

 namespace ray {
 namespace api {
@@ -3,6 +3,7 @@
 #include <boost/asio/thread_pool.hpp>
 #include <memory>
 #include <queue>
+
 #include "../local_mode_ray_runtime.h"
 #include "absl/synchronization/mutex.h"
 #include "invocation_spec.h"
@@ -1,5 +1,7 @@
 #include "native_task_submitter.h"
+
 #include <ray/api/ray_exception.h>
+
 #include "../../util/address_helper.h"
 #include "../abstract_ray_runtime.h"

@@ -3,6 +3,7 @@
 #include <boost/asio/thread_pool.hpp>
 #include <memory>
 #include <queue>
+
 #include "../native_ray_runtime.h"
 #include "invocation_spec.h"
 #include "ray/core.h"
@@ -1,9 +1,10 @@

+#include "task_executor.h"
+
 #include <memory>

 #include "../../util/address_helper.h"
 #include "../abstract_ray_runtime.h"
-#include "task_executor.h"

 namespace ray {
 namespace api {
@@ -1,6 +1,7 @@
 #pragma once

 #include <memory>
+
 #include "absl/synchronization/mutex.h"
 #include "invocation_spec.h"
 #include "ray/core.h"
@@ -1,8 +1,9 @@
 #pragma once

+#include <ray/api/ray_runtime.h>
+
 #include <memory>

-#include <ray/api/ray_runtime.h>
 #include "invocation_spec.h"

 namespace ray {
@@ -1,6 +1,7 @@

 #include <gtest/gtest.h>
 #include <ray/api.h>
+
 #include <future>
 #include <thread>

@@ -1,6 +1,7 @@

 #include <gtest/gtest.h>
 #include <ray/api.h>
+
 #include <chrono>
 #include <thread>

@@ -9,7 +9,6 @@ import ray

@ray.remote
 class NewsServer(object):
-
    def __init__(self):
        self.conn = sqlite3.connect("newsreader.db")
        c = self.conn.cursor()
@@ -25,29 +24,36 @@ class NewsServer(object):
        items = []
        c = self.conn.cursor()
        for item in feed.items:
-            items.append({"title": item.title,
-                          "link": item.link,
-                          "description": item.description,
-                          "description_text": item.description,
-                          "pubDate": str(item.pub_date)})
-            c.execute("""INSERT INTO news (title, link, description,
+            items.append({
+                "title": item.title,
+                "link": item.link,
+                "description": item.description,
+                "description_text": item.description,
+                "pubDate": str(item.pub_date)
+            })
+            c.execute(
+                """INSERT INTO news (title, link, description,
                         published, feed, liked) values
-                         (?, ?, ?, ?, ?, ?)""", (
-                         item.title, item.link, item.description,
-                         item.pub_date, feed.link, False))
+                         (?, ?, ?, ?, ?, ?)""",
+                (item.title, item.link, item.description, item.pub_date,
+                 feed.link, False))
        self.conn.commit()

-        return {"channel": {"title": feed.title,
-                            "link": feed.link,
-                            "url": feed.link},
-                "items": items}
+        return {
+            "channel": {
+                "title": feed.title,
+                "link": feed.link,
+                "url": feed.link
+            },
+            "items": items
+        }

    def like_item(self, url, is_faved):
        c = self.conn.cursor()
        if is_faved:
-            c.execute("UPDATE news SET liked = 1 WHERE link = ?", (url,))
+            c.execute("UPDATE news SET liked = 1 WHERE link = ?", (url, ))
        else:
-            c.execute("UPDATE news SET liked = 0 WHERE link = ?", (url,))
+            c.execute("UPDATE news SET liked = 0 WHERE link = ?", (url, ))
        self.conn.commit()


@@ -71,8 +77,9 @@ def dispatcher():
        result = ray.get(method.remote(*method_args))
        return jsonify(result)
    else:
-        return jsonify(
-            {"error": "method_name '" + method_name + "' not found"})
+        return jsonify({
+            "error": "method_name '" + method_name + "' not found"
+        })


 if __name__ == "__main__":
@@ -7,10 +7,13 @@ import ray
 import wikipedia

 parser = argparse.ArgumentParser()
-parser.add_argument("--num-mappers",
-                    help="number of mapper actors used", default=3, type=int)
-parser.add_argument("--num-reducers",
-                    help="number of reducer actors used", default=4, type=int)
+parser.add_argument(
+    "--num-mappers", help="number of mapper actors used", default=3, type=int)
+parser.add_argument(
+    "--num-reducers",
+    help="number of reducer actors used",
+    default=4,
+    type=int)


@ray.remote
@@ -47,8 +50,10 @@ class Reducer(object):
        word_count_sum = defaultdict(lambda: 0)
        # Get the word counts for this Reducer's keys from all of the Mappers
        # and aggregate the results.
-        count_ids = [mapper.get_range.remote(article_index, self.keys)
-                     for mapper in self.mappers]
+        count_ids = [
+            mapper.get_range.remote(article_index, self.keys)
+            for mapper in self.mappers
+        ]
        # TODO(rkn): We should process these out of order using ray.wait.
        for count_id in count_ids:
            for k, v in ray.get(count_id):
@@ -78,8 +83,9 @@ if __name__ == "__main__":
            streams.append(Stream([line.strip() for line in f.readlines()]))

    # Partition the keys among the reducers.
-    chunks = np.array_split([chr(i) for i in range(ord("a"), ord("z") + 1)],
-                            args.num_reducers)
+    chunks = np.array_split([chr(i)
+                             for i in range(ord("a"),
+                                            ord("z") + 1)], args.num_reducers)
    keys = [[chunk[0], chunk[-1]] for chunk in chunks]

    # Create a number of mappers.
@@ -93,12 +99,14 @@ if __name__ == "__main__":
    while True:
        print("article index = {}".format(article_index))
        wordcounts = {}
-        counts = ray.get([reducer.next_reduce_result.remote(article_index)
-                          for reducer in reducers])
+        counts = ray.get([
+            reducer.next_reduce_result.remote(article_index)
+            for reducer in reducers
+        ])
        for count in counts:
            wordcounts.update(count)
-        most_frequent_words = heapq.nlargest(10, wordcounts,
-                                             key=wordcounts.get)
+        most_frequent_words = heapq.nlargest(
+            10, wordcounts, key=wordcounts.get)
        for word in most_frequent_words:
            print("  ", word, wordcounts[word])
        article_index += 1
@@ -68,10 +68,7 @@ parameter_grid = {"alpha": [1e-4, 1e-1, 1], "epsilon": [0.01, 0.1]}
 # As you can see, the setup here is exactly how you would do it for Scikit-Learn. Now, let's try fitting a model.

 tune_search = TuneGridSearchCV(
-    SGDClassifier(),
-    parameter_grid,
-    early_stopping=True,
-    max_iters=10)
+    SGDClassifier(), parameter_grid, early_stopping=True, max_iters=10)

 import time  # Just to compare fit times
 start = time.time()
@@ -11,13 +11,13 @@ cdef extern from "opencensus/tags/tag_key.h" nogil:
 cdef extern from "ray/stats/metric.h" nogil:
    cdef cppclass CMetric "ray::stats::Metric":
        CMetric(const c_string &name,
-               const c_string &description,
-               const c_string &unit,
-               const c_vector[CTagKey] &tag_keys)
+                const c_string &description,
+                const c_string &unit,
+                const c_vector[CTagKey] &tag_keys)
        c_string GetName() const
        void Record(double value)
-        void Record(double value, 
-                unordered_map[c_string, c_string] &tags)
+        void Record(double value,
+                    unordered_map[c_string, c_string] &tags)

    cdef cppclass CGauge "ray::stats::Gauge":
        CGauge(const c_string &name,
@@ -42,4 +42,4 @@ cdef extern from "ray/stats/metric.h" nogil:
                   const c_string &description,
                   const c_string &unit,
                   const c_vector[double] &boundaries,
-                   const c_vector[CTagKey] &tag_keys)
+                   const c_vector[CTagKey] &tag_keys)
@@ -126,9 +126,8 @@ def ddpg_actor_critic_loss(policy, model, _, train_batch):
        target_noise_clip = policy.config["target_noise_clip"]
        clipped_normal_sample = tf.clip_by_value(
            tf.random.normal(
-                tf.shape(policy_tp1),
-                stddev=policy.config["target_noise"]), -target_noise_clip,
-            target_noise_clip)
+                tf.shape(policy_tp1), stddev=policy.config["target_noise"]),
+            -target_noise_clip, target_noise_clip)
        policy_tp1_smoothed = tf.clip_by_value(
            policy_tp1 + clipped_normal_sample,
            policy.action_space.low * tf.ones_like(policy_tp1),
@@ -146,8 +145,8 @@ def ddpg_actor_critic_loss(policy, model, _, train_batch):
    q_t_det_policy = model.get_q_values(model_out_t, policy_t)

    if twin_q:
-        twin_q_t = model.get_twin_q_values(
-            model_out_t, train_batch[SampleBatch.ACTIONS])
+        twin_q_t = model.get_twin_q_values(model_out_t,
+                                           train_batch[SampleBatch.ACTIONS])

    # Target q-net(s) evaluation.
    q_tp1 = policy.target_model.get_q_values(target_model_out_tp1,
@@ -278,11 +277,11 @@ def gradients_fn(policy, optimizer, loss):
    if policy.config["framework"] in ["tf2", "tfe"]:
        tape = optimizer.tape
        pol_weights = policy.model.policy_variables()
-        actor_grads_and_vars = list(zip(tape.gradient(
-            policy.actor_loss, pol_weights), pol_weights))
+        actor_grads_and_vars = list(
+            zip(tape.gradient(policy.actor_loss, pol_weights), pol_weights))
        q_weights = policy.model.q_variables()
-        critic_grads_and_vars = list(zip(tape.gradient(
-            policy.critic_loss, q_weights), q_weights))
+        critic_grads_and_vars = list(
+            zip(tape.gradient(policy.critic_loss, q_weights), q_weights))
    else:
        actor_grads_and_vars = policy._actor_optimizer.compute_gradients(
            policy.actor_loss, var_list=policy.model.policy_variables())
@@ -296,10 +295,12 @@ def gradients_fn(policy, optimizer, loss):
        clip_func = tf.identity

    # Save grads and vars for later use in `build_apply_op`.
-    policy._actor_grads_and_vars = [
-        (clip_func(g), v) for (g, v) in actor_grads_and_vars if g is not None]
-    policy._critic_grads_and_vars = [
-        (clip_func(g), v) for (g, v) in critic_grads_and_vars if g is not None]
+    policy._actor_grads_and_vars = [(clip_func(g), v)
+                                    for (g, v) in actor_grads_and_vars
+                                    if g is not None]
+    policy._critic_grads_and_vars = [(clip_func(g), v)
+                                     for (g, v) in critic_grads_and_vars
+                                     if g is not None]

    grads_and_vars = policy._actor_grads_and_vars + \
        policy._critic_grads_and_vars
@@ -65,8 +65,7 @@ def ddpg_actor_critic_loss(policy, model, _, train_batch):
            torch.normal(
                mean=torch.zeros(policy_tp1.size()),
                std=policy.config["target_noise"]).to(policy_tp1.device),
-            -target_noise_clip,
-            target_noise_clip)
+            -target_noise_clip, target_noise_clip)

        policy_tp1_smoothed = torch.min(
            torch.max(
@@ -405,7 +405,9 @@ class TestDDPG(unittest.TestCase):
        policy_t = sigmoid(2.0 * fc(
            relu(
                fc(model_out_t, weights[ks[1]], weights[ks[0]], framework=fw)),
-            weights[ks[5]], weights[ks[4]], framework=fw))
+            weights[ks[5]],
+            weights[ks[4]],
+            framework=fw))
        # Get policy output for t+1 (target model).
        policy_tp1 = sigmoid(2.0 * fc(
            relu(
@@ -413,7 +415,9 @@ class TestDDPG(unittest.TestCase):
                   weights[ks[3]],
                   weights[ks[2]],
                   framework=fw)),
-            weights[ks[7]], weights[ks[6]], framework=fw))
+            weights[ks[7]],
+            weights[ks[6]],
+            framework=fw))
        # Assume no smooth target policy.
        policy_tp1_smoothed = policy_tp1

@@ -74,8 +74,7 @@ class DistributionalQTFModel(TFModelV2):
                for i in range(len(q_hiddens)):
                    if use_noisy:
                        action_out = NoisyLayer(
-                            "{}hidden_{}".format(prefix, i),
-                            q_hiddens[i],
+                            "{}hidden_{}".format(prefix, i), q_hiddens[i],
                            sigma0)(action_out)
                    elif add_layer_norm:
                        action_out = tf.keras.layers.Dense(
@@ -135,8 +134,7 @@ class DistributionalQTFModel(TFModelV2):
            for i in range(len(q_hiddens)):
                if use_noisy:
                    state_out = NoisyLayer(
-                        "{}dueling_hidden_{}".format(prefix, i),
-                        q_hiddens[i],
+                        "{}dueling_hidden_{}".format(prefix, i), q_hiddens[i],
                        sigma0)(state_out)
                else:
                    state_out = tf.keras.layers.Dense(
@@ -160,8 +158,8 @@ class DistributionalQTFModel(TFModelV2):
        self.register_variables(self.q_value_head.variables)

        if dueling:
-            state_out = build_state_score(
-                name + "/state_value/", self.model_out)
+            state_out = build_state_score(name + "/state_value/",
+                                          self.model_out)
            self.state_value_head = tf.keras.Model(self.model_out, state_out)
            self.register_variables(self.state_value_head.variables)

@@ -231,8 +231,8 @@ def build_q_losses(policy, model, _, train_batch):
                train_batch[SampleBatch.NEXT_OBS],
                explore=False)
        q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1)
-        q_tp1_best_one_hot_selection = tf.one_hot(
-            q_tp1_best_using_online_net, policy.action_space.n)
+        q_tp1_best_one_hot_selection = tf.one_hot(q_tp1_best_using_online_net,
+                                                  policy.action_space.n)
        q_tp1_best = tf.reduce_sum(q_tp1 * q_tp1_best_one_hot_selection, 1)
        q_dist_tp1_best = tf.reduce_sum(
            q_dist_tp1 * tf.expand_dims(q_tp1_best_one_hot_selection, -1), 1)
@@ -246,9 +246,9 @@ def build_q_losses(policy, model, _, train_batch):
    policy.q_loss = QLoss(
        q_t_selected, q_logits_t_selected, q_tp1_best, q_dist_tp1_best,
        train_batch[PRIO_WEIGHTS], train_batch[SampleBatch.REWARDS],
-        tf.cast(train_batch[SampleBatch.DONES], tf.float32), config["gamma"],
-        config["n_step"], config["num_atoms"],
-        config["v_min"], config["v_max"])
+        tf.cast(train_batch[SampleBatch.DONES],
+                tf.float32), config["gamma"], config["n_step"],
+        config["num_atoms"], config["v_min"], config["v_max"])

    return policy.q_loss.loss

@@ -378,9 +378,8 @@ def postprocess_nstep_and_prio(policy, batch, other_agent=None, episode=None):
            batch[SampleBatch.CUR_OBS], batch[SampleBatch.ACTIONS],
            batch[SampleBatch.REWARDS], batch[SampleBatch.NEXT_OBS],
            batch[SampleBatch.DONES], batch[PRIO_WEIGHTS])
-        new_priorities = (
-            np.abs(convert_to_numpy(td_errors)) +
-            policy.config["prioritized_replay_eps"])
+        new_priorities = (np.abs(convert_to_numpy(td_errors)) +
+                          policy.config["prioritized_replay_eps"])
        batch.data[PRIO_WEIGHTS] = new_priorities

    return batch
@@ -72,12 +72,16 @@ class DQNTorchModel(TorchModelV2, nn.Module):
                advantage_module.add_module(
                    "dueling_A_{}".format(i),
                    NoisyLayer(
-                        ins, n, sigma0=self.sigma0,
+                        ins,
+                        n,
+                        sigma0=self.sigma0,
                        activation=dueling_activation))
                value_module.add_module(
                    "dueling_V_{}".format(i),
                    NoisyLayer(
-                        ins, n, sigma0=self.sigma0,
+                        ins,
+                        n,
+                        sigma0=self.sigma0,
                        activation=dueling_activation))
            else:
                advantage_module.add_module(
@@ -88,25 +92,26 @@ class DQNTorchModel(TorchModelV2, nn.Module):
                    SlimFC(ins, n, activation_fn=dueling_activation))
                # Add LayerNorm after each Dense.
                if add_layer_norm:
-                    advantage_module.add_module(
-                        "LayerNorm_A_{}".format(i), nn.LayerNorm(n))
-                    value_module.add_module(
-                        "LayerNorm_V_{}".format(i), nn.LayerNorm(n))
+                    advantage_module.add_module("LayerNorm_A_{}".format(i),
+                                                nn.LayerNorm(n))
+                    value_module.add_module("LayerNorm_V_{}".format(i),
+                                            nn.LayerNorm(n))
            ins = n

        # Actual Advantages layer (nodes=num-actions).
        if use_noisy:
-            advantage_module.add_module("A", NoisyLayer(
-                ins,
-                self.action_space.n * self.num_atoms,
-                sigma0,
-                activation=None))
+            advantage_module.add_module(
+                "A",
+                NoisyLayer(
+                    ins,
+                    self.action_space.n * self.num_atoms,
+                    sigma0,
+                    activation=None))
        elif q_hiddens:
            advantage_module.add_module(
                "A",
                SlimFC(
-                    ins, action_space.n * self.num_atoms,
-                    activation_fn=None))
+                    ins, action_space.n * self.num_atoms, activation_fn=None))

        self.advantage_module = advantage_module

@@ -212,8 +212,8 @@ def build_q_losses(policy, model, _, train_batch):
        is_training=True)

    # Q scores for actions which we know were selected in the given state.
-    one_hot_selection = F.one_hot(
-        train_batch[SampleBatch.ACTIONS], policy.action_space.n)
+    one_hot_selection = F.one_hot(train_batch[SampleBatch.ACTIONS],
+                                  policy.action_space.n)
    q_t_selected = torch.sum(
        torch.where(q_t > -float("inf"), q_t, torch.tensor(0.0)) *
        one_hot_selection, 1)
@@ -230,8 +230,8 @@ def build_q_losses(policy, model, _, train_batch):
                explore=False,
                is_training=True)
        q_tp1_best_using_online_net = torch.argmax(q_tp1_using_online_net, 1)
-        q_tp1_best_one_hot_selection = F.one_hot(
-            q_tp1_best_using_online_net, policy.action_space.n)
+        q_tp1_best_one_hot_selection = F.one_hot(q_tp1_best_using_online_net,
+                                                 policy.action_space.n)
        q_tp1_best = torch.sum(
            torch.where(q_tp1 > -float("inf"), q_tp1, torch.tensor(0.0)) *
            q_tp1_best_one_hot_selection, 1)
@@ -250,8 +250,8 @@ def build_q_losses(policy, model, _, train_batch):
        q_t_selected, q_logits_t_selected, q_tp1_best, q_probs_tp1_best,
        train_batch[PRIO_WEIGHTS], train_batch[SampleBatch.REWARDS],
        train_batch[SampleBatch.DONES].float(), config["gamma"],
-        config["n_step"], config["num_atoms"],
-        config["v_min"], config["v_max"])
+        config["n_step"], config["num_atoms"], config["v_min"],
+        config["v_max"])

    return policy.q_loss.loss

@@ -222,10 +222,12 @@ def multi_from_logits(behaviour_policy_logits,
        behaviour_policy_logits[i].shape.assert_has_rank(3)
        target_policy_logits[i].shape.assert_has_rank(3)

-    with tf1.name_scope(name, values=[
-        behaviour_policy_logits, target_policy_logits, actions,
-        discounts, rewards, values, bootstrap_value
-    ]):
+    with tf1.name_scope(
+            name,
+            values=[
+                behaviour_policy_logits, target_policy_logits, actions,
+                discounts, rewards, values, bootstrap_value
+            ]):
        target_action_log_probs = multi_log_probs_from_logits_and_actions(
            target_policy_logits, actions, dist_class, model)

@@ -330,16 +332,16 @@ def from_importance_weights(log_rhos,
    if clip_pg_rho_threshold is not None:
        clip_pg_rho_threshold.shape.assert_has_rank(0)

-    with tf1.name_scope(name, values=[
-        log_rhos, discounts, rewards, values, bootstrap_value
-    ]):
+    with tf1.name_scope(
+            name,
+            values=[log_rhos, discounts, rewards, values, bootstrap_value]):
        rhos = tf.math.exp(log_rhos)
        if clip_rho_threshold is not None:
            clipped_rhos = tf.minimum(
                clip_rho_threshold, rhos, name="clipped_rhos")

-            tf1.summary.histogram(
-                    "clipped_rhos_1000", tf.minimum(1000.0, rhos))
+            tf1.summary.histogram("clipped_rhos_1000", tf.minimum(
+                1000.0, rhos))
            tf1.summary.scalar(
                "num_of_clipped_rhos",
                tf.reduce_sum(
@@ -259,13 +259,13 @@ def choose_optimizer(policy, config):
            return tf1.train.AdamOptimizer(policy.cur_lr)
    else:
        if tfv == 2:
-            return tf.keras.optimizers.RMSprop(
-                policy.cur_lr, config["decay"], config["momentum"],
-                config["epsilon"])
+            return tf.keras.optimizers.RMSprop(policy.cur_lr, config["decay"],
+                                               config["momentum"],
+                                               config["epsilon"])
        else:
-            return tf1.train.RMSPropOptimizer(
-                policy.cur_lr, config["decay"], config["momentum"],
-                config["epsilon"])
+            return tf1.train.RMSPropOptimizer(policy.cur_lr, config["decay"],
+                                              config["momentum"],
+                                              config["epsilon"])


 def clip_gradients(policy, optimizer, loss):
@@ -40,23 +40,21 @@ class ReweightedImitationLoss:
        # update averaged advantage norm
        if policy.config["framework"] in ["tf2", "tfe"]:
            policy._ma_adv_norm.assign_add(
-                1e-6 * (tf.reduce_mean(
-                    tf.math.square(adv)) - policy._ma_adv_norm))
+                1e-6 *
+                (tf.reduce_mean(tf.math.square(adv)) - policy._ma_adv_norm))
            # Exponentially weighted advantages.
-            exp_advs = tf.math.exp(
-                beta * tf.math.divide(
-                    adv, 1e-8 + tf.math.sqrt(policy._ma_adv_norm)))
+            exp_advs = tf.math.exp(beta * tf.math.divide(
+                adv, 1e-8 + tf.math.sqrt(policy._ma_adv_norm)))
        else:
            update_adv_norm = tf1.assign_add(
                ref=policy._ma_adv_norm,
-                value=1e-6 * (
-                    tf.reduce_mean(tf.math.square(adv)) - policy._ma_adv_norm))
+                value=1e-6 *
+                (tf.reduce_mean(tf.math.square(adv)) - policy._ma_adv_norm))

            # exponentially weighted advantages
            with tf1.control_dependencies([update_adv_norm]):
-                exp_advs = tf.math.exp(
-                    beta * tf.math.divide(
-                        adv, 1e-8 + tf.math.sqrt(policy._ma_adv_norm)))
+                exp_advs = tf.math.exp(beta * tf.math.divide(
+                    adv, 1e-8 + tf.math.sqrt(policy._ma_adv_norm)))

        # log\pi_\theta(a|s)
        logprobs = action_dist.logp(actions)
@@ -28,8 +28,8 @@ class TestMARWIL(unittest.TestCase):
        rllib_dir = Path(__file__).parent.parent.parent.parent
        print("rllib dir={}".format(rllib_dir))
        data_file = os.path.join(rllib_dir, "tests/data/cartpole/large.json")
-        print("data_file={} exists={}".format(
-            data_file, os.path.isfile(data_file)))
+        print("data_file={} exists={}".format(data_file,
+                                              os.path.isfile(data_file)))

        config = marwil.DEFAULT_CONFIG.copy()
        config["num_workers"] = 0  # Run locally.
@@ -59,6 +59,7 @@ class TDModel(nn.Module):


 if torch:
+
    class TDDataset(torch.utils.data.Dataset):
        def __init__(self, dataset: SampleBatchType, norms):
            self.count = dataset.count
@@ -26,8 +26,8 @@ def pg_tf_loss(policy, model, dist_class, train_batch):
    logits, _ = model.from_batch(train_batch)
    action_dist = dist_class(logits, model)
    return -tf.reduce_mean(
-        action_dist.logp(train_batch[SampleBatch.ACTIONS]) *
-        tf.cast(train_batch[Postprocessing.ADVANTAGES], dtype=tf.float32))
+        action_dist.logp(train_batch[SampleBatch.ACTIONS]) * tf.cast(
+            train_batch[Postprocessing.ADVANTAGES], dtype=tf.float32))


 PGTFPolicy = build_tf_policy(
@@ -77,13 +77,12 @@ class TestPG(unittest.TestCase):
                    feed_dict=policy._get_loss_inputs_dict(
                        train_batch, shuffle=False))
            else:
-                results = (
-                    pg.pg_tf_loss if fw in ["tf2", "tfe"] else pg.pg_torch_loss
-                )(
-                    policy,
-                    policy.model,
-                    dist_class=dist_cls,
-                    train_batch=train_batch)
+                results = (pg.pg_tf_loss
+                           if fw in ["tf2", "tfe"] else pg.pg_torch_loss)(
+                               policy,
+                               policy.model,
+                               dist_class=dist_cls,
+                               train_batch=train_batch)

            # Calculate expected results.
            if fw != "torch":
@@ -17,7 +17,6 @@ from ray.rllib.utils.numpy import fc
 from ray.rllib.utils.test_utils import check, framework_iterator, \
    check_compute_single_action

-
 # Fake CartPole episode of n time steps.
 FAKE_BATCH = {
    SampleBatch.CUR_OBS: np.array(
@@ -280,14 +280,14 @@ class QMixTorchPolicy(Policy):
            masked_q_values = q_values.clone()
            masked_q_values[avail == 0.0] = -float("inf")
            masked_q_values_folded = torch.reshape(
-                masked_q_values,
-                [-1] + list(masked_q_values.shape)[2:])
+                masked_q_values, [-1] + list(masked_q_values.shape)[2:])
            actions, _ = self.exploration.get_exploration_action(
                action_distribution=TorchCategorical(masked_q_values_folded),
                timestep=timestep,
                explore=explore)
            actions = torch.reshape(
-                actions, list(masked_q_values.shape)[:-1]).cpu().numpy()
+                actions,
+                list(masked_q_values.shape)[:-1]).cpu().numpy()
            hiddens = [s.cpu().numpy() for s in hiddens]

        return tuple(actions.transpose([1, 0])), hiddens, {}
@@ -231,10 +231,8 @@ def sac_actor_critic_loss(policy, model, _, train_batch):
            y_true=q_t_selected_target, y_pred=q_t_selected)
    ]
    if policy.config["twin_q"]:
-        critic_loss.append(
-            0.5 * tf.keras.losses.MSE(
-                y_true=q_t_selected_target,
-                y_pred=twin_q_t_selected))
+        critic_loss.append(0.5 * tf.keras.losses.MSE(
+            y_true=q_t_selected_target, y_pred=twin_q_t_selected))

    # Alpha- and actor losses.
    # Note: In the papers, alpha is used directly, here we take the log.
@@ -281,25 +279,27 @@ def gradients_fn(policy, optimizer, loss):
    if policy.config["framework"] in ["tf2", "tfe"]:
        tape = optimizer.tape
        pol_weights = policy.model.policy_variables()
-        actor_grads_and_vars = list(zip(tape.gradient(
-            policy.actor_loss, pol_weights), pol_weights))
+        actor_grads_and_vars = list(
+            zip(tape.gradient(policy.actor_loss, pol_weights), pol_weights))
        q_weights = policy.model.q_variables()
        if policy.config["twin_q"]:
            half_cutoff = len(q_weights) // 2
-            grads_1 = tape.gradient(
-                policy.critic_loss[0], q_weights[:half_cutoff])
-            grads_2 = tape.gradient(
-                policy.critic_loss[1], q_weights[half_cutoff:])
+            grads_1 = tape.gradient(policy.critic_loss[0],
+                                    q_weights[:half_cutoff])
+            grads_2 = tape.gradient(policy.critic_loss[1],
+                                    q_weights[half_cutoff:])
            critic_grads_and_vars = \
                list(zip(grads_1, q_weights[:half_cutoff])) + \
                list(zip(grads_2, q_weights[half_cutoff:]))
        else:
-            critic_grads_and_vars = list(zip(tape.gradient(
-                policy.critic_loss[0], q_weights), q_weights))
+            critic_grads_and_vars = list(
+                zip(
+                    tape.gradient(policy.critic_loss[0], q_weights),
+                    q_weights))

        alpha_vars = [policy.model.log_alpha]
-        alpha_grads_and_vars = list(zip(tape.gradient(
-            policy.alpha_loss, alpha_vars), alpha_vars))
+        alpha_grads_and_vars = list(
+            zip(tape.gradient(policy.alpha_loss, alpha_vars), alpha_vars))
    # Tf1.x: Use optimizer.compute_gradients()
    else:
        actor_grads_and_vars = policy._actor_optimizer.compute_gradients(
@@ -327,12 +327,15 @@ def gradients_fn(policy, optimizer, loss):
        clip_func = tf.identity

    # Save grads and vars for later use in `build_apply_op`.
-    policy._actor_grads_and_vars = [
-        (clip_func(g), v) for (g, v) in actor_grads_and_vars if g is not None]
-    policy._critic_grads_and_vars = [
-        (clip_func(g), v) for (g, v) in critic_grads_and_vars if g is not None]
-    policy._alpha_grads_and_vars = [
-        (clip_func(g), v) for (g, v) in alpha_grads_and_vars if g is not None]
+    policy._actor_grads_and_vars = [(clip_func(g), v)
+                                    for (g, v) in actor_grads_and_vars
+                                    if g is not None]
+    policy._critic_grads_and_vars = [(clip_func(g), v)
+                                     for (g, v) in critic_grads_and_vars
+                                     if g is not None]
+    policy._alpha_grads_and_vars = [(clip_func(g), v)
+                                    for (g, v) in alpha_grads_and_vars
+                                    if g is not None]

    grads_and_vars = (
        policy._actor_grads_and_vars + policy._critic_grads_and_vars +
@@ -391,15 +394,13 @@ class ActorCriticOptimizerMixin:
            self._actor_optimizer = tf.keras.optimizers.Adam(
                learning_rate=config["optimization"]["actor_learning_rate"])
            self._critic_optimizer = [
-                tf.keras.optimizers.Adam(
-                    learning_rate=config["optimization"][
-                        "critic_learning_rate"])
+                tf.keras.optimizers.Adam(learning_rate=config["optimization"][
+                    "critic_learning_rate"])
            ]
            if config["twin_q"]:
                self._critic_optimizer.append(
-                    tf.keras.optimizers.Adam(
-                        learning_rate=config["optimization"][
-                            "critic_learning_rate"]))
+                    tf.keras.optimizers.Adam(learning_rate=config[
+                        "optimization"]["critic_learning_rate"]))
            self._alpha_optimizer = tf.keras.optimizers.Adam(
                learning_rate=config["optimization"]["entropy_learning_rate"])
        else:
@@ -407,15 +408,13 @@ class ActorCriticOptimizerMixin:
            self._actor_optimizer = tf1.train.AdamOptimizer(
                learning_rate=config["optimization"]["actor_learning_rate"])
            self._critic_optimizer = [
-                tf1.train.AdamOptimizer(
-                    learning_rate=config["optimization"][
-                        "critic_learning_rate"])
+                tf1.train.AdamOptimizer(learning_rate=config["optimization"][
+                    "critic_learning_rate"])
            ]
            if config["twin_q"]:
                self._critic_optimizer.append(
-                    tf1.train.AdamOptimizer(
-                        learning_rate=config["optimization"][
-                            "critic_learning_rate"]))
+                    tf1.train.AdamOptimizer(learning_rate=config[
+                        "optimization"]["critic_learning_rate"]))
            self._alpha_optimizer = tf1.train.AdamOptimizer(
                learning_rate=config["optimization"]["entropy_learning_rate"])

@@ -27,6 +27,7 @@ def to_float_array(v: List[Any]) -> np.ndarray:

 # TODO(sven): Remove the following class once we switch to trajectory view API.

+
@PublicAPI
 class SampleBatchBuilder:
    """Util to build a SampleBatch incrementally.
@@ -76,6 +77,7 @@ class SampleBatchBuilder:

 # TODO(sven): Remove the following class once we switch to trajectory view API.

+
@DeveloperAPI
 class MultiAgentSampleBatchBuilder:
    """Util to build SampleBatches for each policy in a multi-agent env.
@@ -60,12 +60,9 @@ class _SampleCollector(metaclass=ABCMeta):
        raise NotImplementedError

    @abstractmethod
-    def add_action_reward_next_obs(
-            self,
-            episode_id: EpisodeID,
-            agent_id: AgentID,
-            policy_id: PolicyID,
-            values: Dict[str, TensorType]) -> None:
+    def add_action_reward_next_obs(self, episode_id: EpisodeID,
+                                   agent_id: AgentID, policy_id: PolicyID,
+                                   values: Dict[str, TensorType]) -> None:
        """Add the given dictionary (row) of values to this collector.

        The incoming data (`values`) must include action, reward, done, and
@@ -373,26 +373,26 @@ class AsyncSampler(threading.Thread, SamplerInput):
        return extra


-def _env_runner(worker: "RolloutWorker",
-                base_env: BaseEnv,
-                extra_batch_callback: Callable[[SampleBatchType], None],
-                policies: Dict[PolicyID, Policy],
-                policy_mapping_fn: Callable[[AgentID], PolicyID],
-                rollout_fragment_length: int,
-                horizon: int,
-                preprocessors: Dict[PolicyID, Preprocessor],
-                obs_filters: Dict[PolicyID, Filter],
-                clip_rewards: bool,
-                clip_actions: bool,
-                pack_multiple_episodes_in_batch: bool,
-                callbacks: "DefaultCallbacks",
-                tf_sess: Optional["tf.Session"],
-                perf_stats: _PerfStats,
-                soft_horizon: bool,
-                no_done_at_end: bool,
-                observation_fn: "ObservationFunction",
-                _use_trajectory_view_api: bool = False
-                ) -> Iterable[SampleBatchType]:
+def _env_runner(
+        worker: "RolloutWorker",
+        base_env: BaseEnv,
+        extra_batch_callback: Callable[[SampleBatchType], None],
+        policies: Dict[PolicyID, Policy],
+        policy_mapping_fn: Callable[[AgentID], PolicyID],
+        rollout_fragment_length: int,
+        horizon: int,
+        preprocessors: Dict[PolicyID, Preprocessor],
+        obs_filters: Dict[PolicyID, Filter],
+        clip_rewards: bool,
+        clip_actions: bool,
+        pack_multiple_episodes_in_batch: bool,
+        callbacks: "DefaultCallbacks",
+        tf_sess: Optional["tf.Session"],
+        perf_stats: _PerfStats,
+        soft_horizon: bool,
+        no_done_at_end: bool,
+        observation_fn: "ObservationFunction",
+        _use_trajectory_view_api: bool = False) -> Iterable[SampleBatchType]:
    """This implements the common experience collection logic.

    Args:
@@ -571,18 +571,23 @@ def _env_runner(worker: "RolloutWorker",


 def _process_observations(
-        worker: "RolloutWorker", base_env: BaseEnv,
+        worker: "RolloutWorker",
+        base_env: BaseEnv,
        policies: Dict[PolicyID, Policy],
        batch_builder_pool: List[MultiAgentSampleBatchBuilder],
        active_episodes: Dict[str, MultiAgentEpisode],
        unfiltered_obs: Dict[EnvID, Dict[AgentID, EnvObsType]],
        rewards: Dict[EnvID, Dict[AgentID, float]],
        dones: Dict[EnvID, Dict[AgentID, bool]],
-        infos: Dict[EnvID, Dict[AgentID, EnvInfoDict]], horizon: int,
+        infos: Dict[EnvID, Dict[AgentID, EnvInfoDict]],
+        horizon: int,
        preprocessors: Dict[PolicyID, Preprocessor],
-        obs_filters: Dict[PolicyID, Filter], rollout_fragment_length: int,
-        pack_multiple_episodes_in_batch: bool, callbacks: "DefaultCallbacks",
-        soft_horizon: bool, no_done_at_end: bool,
+        obs_filters: Dict[PolicyID, Filter],
+        rollout_fragment_length: int,
+        pack_multiple_episodes_in_batch: bool,
+        callbacks: "DefaultCallbacks",
+        soft_horizon: bool,
+        no_done_at_end: bool,
        observation_fn: "ObservationFunction",
        _use_trajectory_view_api: bool = False
 ) -> Tuple[Set[EnvID], Dict[PolicyID, List[PolicyEvalData]], List[Union[
@@ -931,8 +936,8 @@ def _do_policy_eval(
 def _process_policy_eval_results(
        *,
        to_eval: Dict[PolicyID, List[PolicyEvalData]],
-        eval_results: Dict[PolicyID, Tuple[
-            TensorStructType, StateBatch, dict]],
+        eval_results: Dict[PolicyID, Tuple[TensorStructType, StateBatch,
+                                           dict]],
        active_episodes: Dict[str, MultiAgentEpisode],
        active_envs: Set[int],
        off_policy_actions: MultiEnvDict,
@@ -52,9 +52,9 @@ class RandomEnv(gym.Env):
            done = True
        # Max not reached yet -> Sample done via p_done.
        else:
-            done = bool(np.random.choice(
-                [True, False], p=[self.p_done, 1.0 - self.p_done]
-            ))
+            done = bool(
+                np.random.choice(
+                    [True, False], p=[self.p_done, 1.0 - self.p_done]))

        return self.observation_space.sample(), \
            float(self.reward_space.sample()), done, {}
@@ -18,8 +18,10 @@ class RandomPolicy(Policy):
        if self.config.get("ignore_action_bounds", False) and \
                isinstance(self.action_space, Box):
            self.action_space_for_sampling = Box(
-                -float("inf"), float("inf"),
-                shape=self.action_space.shape, dtype=self.action_space.dtype)
+                -float("inf"),
+                float("inf"),
+                shape=self.action_space.shape,
+                dtype=self.action_space.dtype)
        else:
            self.action_space_for_sampling = self.action_space

@@ -44,8 +44,8 @@ class CustomPolicy(Policy):
                        episodes=None,
                        **kwargs):
        # return random actions
-        return np.array([self.action_space.sample()
-                         for _ in obs_batch]), [], {}
+        return np.array(
+            [self.action_space.sample() for _ in obs_batch]), [], {}

    def learn_on_batch(self, samples):
        # implement your learning code here
@@ -138,10 +138,8 @@ class TrainTFMultiGPU:
        with self.workers.local_worker().tf_sess.graph.as_default():
            with self.workers.local_worker().tf_sess.as_default():
                for policy_id in self.policies:
-                    policy = self.workers.local_worker().get_policy(
-                        policy_id)
-                    with tf1.variable_scope(
-                            policy_id, reuse=tf1.AUTO_REUSE):
+                    policy = self.workers.local_worker().get_policy(policy_id)
+                    with tf1.variable_scope(policy_id, reuse=tf1.AUTO_REUSE):
                        if policy._state_inputs:
                            rnn_inputs = policy._state_inputs + [
                                policy._seq_lens
@@ -150,12 +148,10 @@ class TrainTFMultiGPU:
                            rnn_inputs = []
                        self.optimizers[policy_id] = (
                            LocalSyncParallelOptimizer(
-                                policy._optimizer,
-                                self.devices,
-                                [v for _, v in policy._loss_inputs],
-                                rnn_inputs,
-                                self.per_device_batch_size,
-                                policy.copy))
+                                policy._optimizer, self.devices,
+                                [v
+                                 for _, v in policy._loss_inputs], rnn_inputs,
+                                self.per_device_batch_size, policy.copy))

                self.sess = self.workers.local_worker().tf_sess
                self.sess.run(tf1.global_variables_initializer())
@@ -6,9 +6,6 @@ from ray.rllib.models.tf.layers.skip_connection import SkipConnection
 from ray.rllib.models.tf.layers.multi_head_attention import MultiHeadAttention

 __all__ = [
-    "GRUGate",
-    "MultiHeadAttention",
-    "NoisyLayer",
-    "RelativeMultiHeadAttention",
-    "SkipConnection"
+    "GRUGate", "MultiHeadAttention", "NoisyLayer",
+    "RelativeMultiHeadAttention", "SkipConnection"
 ]
@@ -16,11 +16,7 @@ class NoisyLayer(tf.keras.layers.Layer if tf else object):
    vanish along the training procedure
    """

-    def __init__(self,
-                 prefix,
-                 out_size,
-                 sigma0,
-                 activation="relu"):
+    def __init__(self, prefix, out_size, sigma0, activation="relu"):
        """Initializes a NoisyLayer object.

        Args:
@@ -53,8 +49,7 @@ class NoisyLayer(tf.keras.layers.Layer if tf else object):
            trainable=True,
            tf_name=self.prefix + "_sigma_w",
            shape=[in_size, self.out_size],
-            dtype=tf.float32
-        )
+            dtype=tf.float32)

        self.sigma_b = get_variable(
            value=tf.keras.initializers.Constant(
@@ -81,9 +81,9 @@ class VisionNetwork(TFModelV2):
                        "Given `conv_filters` ({}) do not result in a [B, 1, "
                        "1, {} (`num_outputs`)] shape (but in {})! Please "
                        "adjust your Conv2D stack such that the dims 1 and 2 "
-                        "are both 1.".format(
-                            self.model_config["conv_filters"],
-                            self.num_outputs, list(conv_out.shape)))
+                        "are both 1.".format(self.model_config["conv_filters"],
+                                             self.num_outputs,
+                                             list(conv_out.shape)))

            # num_outputs not known -> Flatten, then set self.num_outputs
            # to the resulting number of nodes.
@@ -67,20 +67,22 @@ class NoisyLayer(nn.Module):
            trainable=True)

    def forward(self, inputs):
-        epsilon_in = self._f_epsilon(torch.normal(
-            mean=torch.zeros([self.in_size]),
-            std=torch.ones([self.in_size])))
-        epsilon_out = self._f_epsilon(torch.normal(
-            mean=torch.zeros([self.out_size]),
-            std=torch.ones([self.out_size])))
+        epsilon_in = self._f_epsilon(
+            torch.normal(
+                mean=torch.zeros([self.in_size]),
+                std=torch.ones([self.in_size])))
+        epsilon_out = self._f_epsilon(
+            torch.normal(
+                mean=torch.zeros([self.out_size]),
+                std=torch.ones([self.out_size])))
        epsilon_w = torch.matmul(
            torch.unsqueeze(epsilon_in, -1),
            other=torch.unsqueeze(epsilon_out, 0))
        epsilon_b = epsilon_out

        action_activation = torch.matmul(
-            inputs, self.w + self.sigma_w * epsilon_w
-        ) + self.b + self.sigma_b * epsilon_b
+            inputs, self.w +
+            self.sigma_w * epsilon_w) + self.b + self.sigma_b * epsilon_b

        if self.activation is not None:
            action_activation = self.activation(action_activation)
@@ -158,9 +158,8 @@ class VisionNetwork(TorchModelV2, nn.Module):
                    "Given `conv_filters` ({}) do not result in a [B, {} "
                    "(`num_outputs`), 1, 1] shape (but in {})! Please adjust "
                    "your Conv2D stack such that the last 2 dims are both "
-                    "1.".format(
-                        self.model_config["conv_filters"], self.num_outputs,
-                        list(conv_out.shape)))
+                    "1.".format(self.model_config["conv_filters"],
+                                self.num_outputs, list(conv_out.shape)))
            logits = conv_out.squeeze(3)
            logits = logits.squeeze(2)

@@ -47,36 +47,36 @@ class DynamicTFPolicy(TFPolicy):
    """

    @DeveloperAPI
-    def __init__(self,
-                 obs_space: gym.spaces.Space,
-                 action_space: gym.spaces.Space,
-                 config: TrainerConfigDict,
-                 loss_fn: Callable[
-                     [Policy, ModelV2, type, SampleBatch], TensorType],
-                 *,
-                 stats_fn: Optional[Callable[[Policy, SampleBatch],
-                                             Dict[str, TensorType]]] = None,
-                 grad_stats_fn: Optional[Callable[
-                     [Policy, SampleBatch, ModelGradients],
-                     Dict[str, TensorType]]] = None,
-                 before_loss_init: Optional[Callable[
-                     [Policy, gym.spaces.Space, gym.spaces.Space,
-                      TrainerConfigDict], None]] = None,
-                 make_model: Optional[Callable[
-                     [Policy, gym.spaces.Space, gym.spaces.Space,
-                      TrainerConfigDict], ModelV2]] = None,
-                 action_sampler_fn: Optional[Callable[
-                     [TensorType, List[TensorType]], Tuple[
-                      TensorType, TensorType]]] = None,
-                 action_distribution_fn: Optional[Callable[
-                     [Policy, ModelV2, TensorType, TensorType, TensorType],
-                     Tuple[TensorType, type, List[TensorType]]]] = None,
-                 existing_inputs: Optional[Dict[
-                     str, "tf1.placeholder"]] = None,
-                 existing_model: Optional[ModelV2] = None,
-                 get_batch_divisibility_req: Optional[Callable[
-                     [Policy], int]] = None,
-                 obs_include_prev_action_reward: bool = True):
+    def __init__(
+            self,
+            obs_space: gym.spaces.Space,
+            action_space: gym.spaces.Space,
+            config: TrainerConfigDict,
+            loss_fn: Callable[[Policy, ModelV2, type, SampleBatch],
+                              TensorType],
+            *,
+            stats_fn: Optional[Callable[[Policy, SampleBatch], Dict[
+                str, TensorType]]] = None,
+            grad_stats_fn: Optional[Callable[[
+                Policy, SampleBatch, ModelGradients
+            ], Dict[str, TensorType]]] = None,
+            before_loss_init: Optional[Callable[[
+                Policy, gym.spaces.Space, gym.spaces.Space, TrainerConfigDict
+            ], None]] = None,
+            make_model: Optional[Callable[[
+                Policy, gym.spaces.Space, gym.spaces.Space, TrainerConfigDict
+            ], ModelV2]] = None,
+            action_sampler_fn: Optional[Callable[[
+                TensorType, List[TensorType]
+            ], Tuple[TensorType, TensorType]]] = None,
+            action_distribution_fn: Optional[Callable[[
+                Policy, ModelV2, TensorType, TensorType, TensorType
+            ], Tuple[TensorType, type, List[TensorType]]]] = None,
+            existing_inputs: Optional[Dict[str, "tf1.placeholder"]] = None,
+            existing_model: Optional[ModelV2] = None,
+            get_batch_divisibility_req: Optional[Callable[[Policy],
+                                                          int]] = None,
+            obs_include_prev_action_reward: bool = True):
        """Initialize a dynamic TF policy.

        Arguments:
@@ -641,8 +641,8 @@ def build_eager_tf_policy(name,
                dummy_batch["seq_lens"] = np.array([1], dtype=np.int32)

            # Convert everything to tensors.
-            dummy_batch = tf.nest.map_structure(
-                tf1.convert_to_tensor, dummy_batch)
+            dummy_batch = tf.nest.map_structure(tf1.convert_to_tensor,
+                                                dummy_batch)

            # for IMPALA which expects a certain sample batch size.
            def tile_to(tensor, n):
@@ -46,11 +46,8 @@ class Policy(metaclass=ABCMeta):
    """

    @DeveloperAPI
-    def __init__(
-            self,
-            observation_space: gym.spaces.Space,
-            action_space: gym.spaces.Space,
-            config: TrainerConfigDict):
+    def __init__(self, observation_space: gym.spaces.Space,
+                 action_space: gym.spaces.Space, config: TrainerConfigDict):
        """Initialize the graph.

        This is the standard constructor for policies. The policy
@@ -181,9 +178,9 @@ class Policy(metaclass=ABCMeta):
            episodes = [episode]
        if state is not None:
            state_batch = [
-                s.unsqueeze(0) if torch and isinstance(s, torch.Tensor) else
-                np.expand_dims(s, 0)
-                for s in state
+                s.unsqueeze(0)
+                if torch and isinstance(s, torch.Tensor) else np.expand_dims(
+                    s, 0) for s in state
            ]

        out = self.compute_actions(
@@ -261,10 +258,10 @@ class Policy(metaclass=ABCMeta):
            actions: Union[List[TensorType], TensorType],
            obs_batch: Union[List[TensorType], TensorType],
            state_batches: Optional[List[TensorType]] = None,
-            prev_action_batch: Optional[
-                Union[List[TensorType], TensorType]] = None,
-            prev_reward_batch: Optional[
-                Union[List[TensorType], TensorType]] = None) -> TensorType:
+            prev_action_batch: Optional[Union[List[TensorType],
+                                              TensorType]] = None,
+            prev_reward_batch: Optional[Union[List[
+                TensorType], TensorType]] = None) -> TensorType:
        """Computes the log-prob/likelihood for a given action and observation.

        Args:
@@ -309,8 +306,8 @@ class Policy(metaclass=ABCMeta):
    def postprocess_trajectory(
            self,
            sample_batch: SampleBatch,
-            other_agent_batches: Optional[
-                Dict[AgentID, Tuple["Policy", SampleBatch]]] = None,
+            other_agent_batches: Optional[Dict[AgentID, Tuple[
+                "Policy", SampleBatch]]] = None,
            episode: Optional["MultiAgentEpisode"] = None) -> SampleBatch:
        """Implements algorithm-specific trajectory postprocessing.

@@ -305,10 +305,9 @@ class SampleBatch:
        self.data[key] = item

    @DeveloperAPI
-    def compress(
-            self,
-            bulk: bool = False,
-            columns: Set[str] = frozenset(["obs", "new_obs"])) -> None:
+    def compress(self,
+                 bulk: bool = False,
+                 columns: Set[str] = frozenset(["obs", "new_obs"])) -> None:
        """Compresses the data buffers (by column) in place.

        Args:
@@ -327,10 +326,9 @@ class SampleBatch:
                        [pack(o) for o in self.data[key]])

    @DeveloperAPI
-    def decompress_if_needed(
-            self,
-            columns: Set[str] = frozenset(
-                ["obs", "new_obs"])) -> "SampleBatch":
+    def decompress_if_needed(self,
+                             columns: Set[str] = frozenset(
+                                 ["obs", "new_obs"])) -> "SampleBatch":
        """Decompresses data buffers (per column if not compressed) in place.

        Args:
@@ -374,8 +372,7 @@ class MultiAgentBatch:
    """

    @PublicAPI
-    def __init__(self,
-                 policy_batches: Dict[PolicyID, SampleBatch],
+    def __init__(self, policy_batches: Dict[PolicyID, SampleBatch],
                 env_steps: int):
        """Initialize a MultiAgentBatch object.

@@ -541,11 +538,9 @@ class MultiAgentBatch:
        return sum(b.size_bytes() for b in self.policy_batches.values())

    @DeveloperAPI
-    def compress(
-            self,
-            bulk: bool = False,
-            columns: Set[str] = frozenset(
-                ["obs", "new_obs"])) -> None:
+    def compress(self,
+                 bulk: bool = False,
+                 columns: Set[str] = frozenset(["obs", "new_obs"])) -> None:
        """Compresses each policy batch (per column) in place.

        Args:
@@ -558,10 +553,9 @@ class MultiAgentBatch:
            batch.compress(bulk=bulk, columns=columns)

    @DeveloperAPI
-    def decompress_if_needed(
-            self,
-            columns: Set[str] = frozenset(
-                ["obs", "new_obs"])) -> "MultiAgentBatch":
+    def decompress_if_needed(self,
+                             columns: Set[str] = frozenset(
+                                 ["obs", "new_obs"])) -> "MultiAgentBatch":
        """Decompresses each policy batch (per column), if already compressed.

        Args:
@@ -25,8 +25,9 @@ class TestTrajectoryViewAPI(unittest.TestCase):
            assert len(view_req_model) == 1
            assert len(view_req_policy) == 6
            for key in [
-                SampleBatch.OBS, SampleBatch.ACTIONS, SampleBatch.REWARDS,
-                SampleBatch.DONES, SampleBatch.NEXT_OBS, SampleBatch.VF_PREDS
+                    SampleBatch.OBS, SampleBatch.ACTIONS, SampleBatch.REWARDS,
+                    SampleBatch.DONES, SampleBatch.NEXT_OBS,
+                    SampleBatch.VF_PREDS
            ]:
                assert key in view_req_policy
                # None of the view cols has a special underlying data_col,
@@ -53,9 +54,10 @@ class TestTrajectoryViewAPI(unittest.TestCase):
            assert len(view_req_model) == 3  # obs, prev_a, prev_r
            assert len(view_req_policy) == 8
            for key in [
-                SampleBatch.OBS, SampleBatch.ACTIONS, SampleBatch.REWARDS,
-                SampleBatch.DONES, SampleBatch.NEXT_OBS, SampleBatch.VF_PREDS,
-                SampleBatch.PREV_ACTIONS, SampleBatch.PREV_REWARDS
+                    SampleBatch.OBS, SampleBatch.ACTIONS, SampleBatch.REWARDS,
+                    SampleBatch.DONES, SampleBatch.NEXT_OBS,
+                    SampleBatch.VF_PREDS, SampleBatch.PREV_ACTIONS,
+                    SampleBatch.PREV_REWARDS
            ]:
                assert key in view_req_policy

@@ -65,9 +67,10 @@ class TestTrajectoryViewAPI(unittest.TestCase):
                elif key == SampleBatch.PREV_REWARDS:
                    assert view_req_policy[key].data_col == SampleBatch.REWARDS
                    assert view_req_policy[key].shift == -1
-                elif key not in [SampleBatch.NEXT_OBS,
-                                 SampleBatch.PREV_ACTIONS,
-                                 SampleBatch.PREV_REWARDS]:
+                elif key not in [
+                        SampleBatch.NEXT_OBS, SampleBatch.PREV_ACTIONS,
+                        SampleBatch.PREV_REWARDS
+                ]:
                    assert view_req_policy[key].data_col is None
                else:
                    assert view_req_policy[key].data_col == SampleBatch.OBS
@@ -239,8 +239,7 @@ class TFPolicy(Policy):
        """Returns whether the loss function has been initialized."""
        return self._loss is not None

-    def _initialize_loss(self,
-                         loss: TensorType,
+    def _initialize_loss(self, loss: TensorType,
                         loss_inputs: List[Tuple[str, TensorType]]) -> None:
        """Initializes the loss op from given loss tensor and placeholders.

@@ -264,8 +263,10 @@ class TFPolicy(Policy):
            self._loss = loss

        self._optimizer = self.optimizer()
-        self._grads_and_vars = [(g, v) for (g, v) in self.gradients(
-            self._optimizer, self._loss) if g is not None]
+        self._grads_and_vars = [
+            (g, v) for (g, v) in self.gradients(self._optimizer, self._loss)
+            if g is not None
+        ]
        self._grads = [g for (g, v) in self._grads_and_vars]

        # TODO(sven/ekl): Deprecate support for v1 models.
@@ -336,10 +337,10 @@ class TFPolicy(Policy):
            actions: Union[List[TensorType], TensorType],
            obs_batch: Union[List[TensorType], TensorType],
            state_batches: Optional[List[TensorType]] = None,
-            prev_action_batch: Optional[
-                Union[List[TensorType], TensorType]] = None,
-            prev_reward_batch: Optional[
-                Union[List[TensorType], TensorType]] = None) -> TensorType:
+            prev_action_batch: Optional[Union[List[TensorType],
+                                              TensorType]] = None,
+            prev_reward_batch: Optional[Union[List[
+                TensorType], TensorType]] = None) -> TensorType:

        if self._log_likelihood is None:
            raise ValueError("Cannot compute log-prob/likelihood w/o a "
@@ -378,8 +379,8 @@ class TFPolicy(Policy):

    @override(Policy)
    @DeveloperAPI
-    def learn_on_batch(self, postprocessed_batch: SampleBatch) -> Dict[
-            str, TensorType]:
+    def learn_on_batch(
+            self, postprocessed_batch: SampleBatch) -> Dict[str, TensorType]:
        assert self.loss_initialized()
        builder = TFRunBuilder(self._sess, "learn_on_batch")
        fetches = self._build_learn_on_batch(builder, postprocessed_batch)
@@ -457,7 +458,8 @@ class TFPolicy(Policy):

    @override(Policy)
    @DeveloperAPI
-    def export_checkpoint(self, export_dir: str,
+    def export_checkpoint(self,
+                          export_dir: str,
                          filename_prefix: str = "model") -> None:
        """Export tensorflow checkpoint to export_dir."""
        try:
@@ -573,8 +575,7 @@ class TFPolicy(Policy):
            return tf1.train.AdamOptimizer()

    @DeveloperAPI
-    def gradients(self,
-                  optimizer: "tf.keras.optimizers.Optimizer",
+    def gradients(self, optimizer: "tf.keras.optimizers.Optimizer",
                  loss: TensorType) -> List[Tuple[TensorType, TensorType]]:
        """Override this for a custom gradient computation behavior.

@@ -816,8 +817,7 @@ class LearningRateSchedule:

    @DeveloperAPI
    def __init__(self, lr, lr_schedule):
-        self.cur_lr = tf1.get_variable(
-            "lr", initializer=lr, trainable=False)
+        self.cur_lr = tf1.get_variable("lr", initializer=lr, trainable=False)
        if lr_schedule is None:
            self.lr_schedule = ConstantSchedule(lr, framework=None)
        else:
@@ -843,7 +843,9 @@ class EntropyCoeffSchedule:
    @DeveloperAPI
    def __init__(self, entropy_coeff, entropy_coeff_schedule):
        self.entropy_coeff = get_variable(
-            entropy_coeff, framework="tf", tf_name="entropy_coeff",
+            entropy_coeff,
+            framework="tf",
+            tf_name="entropy_coeff",
            trainable=False)

        if entropy_coeff_schedule is None:
@@ -13,58 +13,52 @@ from ray.rllib.utils.types import ModelGradients, TensorType, TrainerConfigDict


@DeveloperAPI
-def build_tf_policy(name: str,
-                    *,
-                    loss_fn: Callable[
-                        [Policy, ModelV2, type, SampleBatch], TensorType],
-                    get_default_config: Optional[
-                        Callable[[None], TrainerConfigDict]] = None,
-                    postprocess_fn: Optional[Callable[
-                        [Policy, SampleBatch, List[SampleBatch],
-                         "MultiAgentEpisode"], None]] = None,
-                    stats_fn: Optional[Callable[
-                        [Policy, SampleBatch], Dict[str, TensorType]]] = None,
-                    optimizer_fn: Optional[Callable[
-                        [Policy, TrainerConfigDict],
-                        "tf.keras.optimizers.Optimizer"]] = None,
-                    gradients_fn: Optional[Callable[
-                        [Policy, "tf.keras.optimizers.Optimizer",
-                         TensorType], ModelGradients]] = None,
-                    apply_gradients_fn: Optional[Callable[
-                        [Policy, "tf.keras.optimizers.Optimizer",
-                         ModelGradients], "tf.Operation"]] = None,
-                    grad_stats_fn: Optional[Callable[
-                        [Policy, SampleBatch, ModelGradients],
-                        Dict[str, TensorType]]] = None,
-                    extra_action_fetches_fn: Optional[Callable[
-                        [Policy], Dict[str, TensorType]]] = None,
-                    extra_learn_fetches_fn: Optional[Callable[
-                        [Policy], Dict[str, TensorType]]] = None,
-                    validate_spaces: Optional[Callable[
-                        [Policy, gym.Space, gym.Space, TrainerConfigDict],
-                        None]] = None,
-                    before_init: Optional[Callable[
-                        [Policy, gym.Space, gym.Space, TrainerConfigDict],
-                        None]] = None,
-                    before_loss_init: Optional[Callable[
-                        [Policy, gym.spaces.Space, gym.spaces.Space,
-                         TrainerConfigDict], None]] = None,
-                    after_init: Optional[Callable[
-                        [Policy, gym.Space, gym.Space, TrainerConfigDict],
-                        None]] = None,
-                    make_model: Optional[Callable[
-                        [Policy, gym.spaces.Space, gym.spaces.Space,
-                         TrainerConfigDict], ModelV2]] = None,
-                    action_sampler_fn: Optional[Callable[
-                        [TensorType, List[TensorType]], Tuple[
-                            TensorType, TensorType]]] = None,
-                    action_distribution_fn: Optional[Callable[
-                        [Policy, ModelV2, TensorType, TensorType, TensorType],
-                        Tuple[TensorType, type, List[TensorType]]]] = None,
-                    mixins: Optional[List[type]] = None,
-                    get_batch_divisibility_req: Optional[Callable[
-                        [Policy], int]] = None,
-                    obs_include_prev_action_reward: bool = True):
+def build_tf_policy(
+        name: str,
+        *,
+        loss_fn: Callable[[Policy, ModelV2, type, SampleBatch], TensorType],
+        get_default_config: Optional[Callable[[None],
+                                              TrainerConfigDict]] = None,
+        postprocess_fn: Optional[Callable[[
+            Policy, SampleBatch, List[SampleBatch], "MultiAgentEpisode"
+        ], None]] = None,
+        stats_fn: Optional[Callable[[Policy, SampleBatch], Dict[
+            str, TensorType]]] = None,
+        optimizer_fn: Optional[Callable[[
+            Policy, TrainerConfigDict
+        ], "tf.keras.optimizers.Optimizer"]] = None,
+        gradients_fn: Optional[Callable[[
+            Policy, "tf.keras.optimizers.Optimizer", TensorType
+        ], ModelGradients]] = None,
+        apply_gradients_fn: Optional[Callable[[
+            Policy, "tf.keras.optimizers.Optimizer", ModelGradients
+        ], "tf.Operation"]] = None,
+        grad_stats_fn: Optional[Callable[[Policy, SampleBatch, ModelGradients],
+                                         Dict[str, TensorType]]] = None,
+        extra_action_fetches_fn: Optional[Callable[[Policy], Dict[
+            str, TensorType]]] = None,
+        extra_learn_fetches_fn: Optional[Callable[[Policy], Dict[
+            str, TensorType]]] = None,
+        validate_spaces: Optional[Callable[
+            [Policy, gym.Space, gym.Space, TrainerConfigDict], None]] = None,
+        before_init: Optional[Callable[
+            [Policy, gym.Space, gym.Space, TrainerConfigDict], None]] = None,
+        before_loss_init: Optional[Callable[[
+            Policy, gym.spaces.Space, gym.spaces.Space, TrainerConfigDict
+        ], None]] = None,
+        after_init: Optional[Callable[
+            [Policy, gym.Space, gym.Space, TrainerConfigDict], None]] = None,
+        make_model: Optional[Callable[[
+            Policy, gym.spaces.Space, gym.spaces.Space, TrainerConfigDict
+        ], ModelV2]] = None,
+        action_sampler_fn: Optional[Callable[[TensorType, List[
+            TensorType]], Tuple[TensorType, TensorType]]] = None,
+        action_distribution_fn: Optional[Callable[[
+            Policy, ModelV2, TensorType, TensorType, TensorType
+        ], Tuple[TensorType, type, List[TensorType]]]] = None,
+        mixins: Optional[List[type]] = None,
+        get_batch_divisibility_req: Optional[Callable[[Policy], int]] = None,
+        obs_include_prev_action_reward: bool = True):
    """Helper function for creating a dynamic tf policy at runtime.

    Functions will be run in this order to initialize the policy:
@@ -19,62 +19,51 @@ torch, _ = try_import_torch()


@DeveloperAPI
-def build_torch_policy(name: str,
-                       *,
-                       loss_fn: Callable[
-                           [Policy, ModelV2, type, SampleBatch], TensorType],
-                       get_default_config: Optional[Callable[
-                           [], TrainerConfigDict]] = None,
-                       stats_fn: Optional[Callable[
-                           [Policy, SampleBatch],
-                           Dict[str, TensorType]]] = None,
-                       postprocess_fn: Optional[Callable[
-                            [Policy, SampleBatch, List[SampleBatch],
-                             "MultiAgentEpisode"], None]] = None,
-                       extra_action_out_fn: Optional[Callable[
-                           [Policy, Dict[str, TensorType], List[TensorType],
-                            ModelV2, TorchDistributionWrapper],
-                           Dict[str, TensorType]]] = None,
-                       extra_grad_process_fn: Optional[Callable[
-                           [Policy, "torch.optim.Optimizer", TensorType],
-                           Dict[str, TensorType]]] = None,
-                       # TODO: (sven) Replace "fetches" with "process".
-                       extra_learn_fetches_fn: Optional[Callable[
-                           [Policy], Dict[str, TensorType]]] = None,
-                       optimizer_fn: Optional[Callable[
-                           [Policy, TrainerConfigDict],
-                           "torch.optim.Optimizer"]] = None,
-                       validate_spaces: Optional[Callable[
-                           [Policy, gym.Space, gym.Space, TrainerConfigDict],
-                           None]] = None,
-                       before_init: Optional[Callable[
-                           [Policy, gym.Space, gym.Space, TrainerConfigDict],
-                           None]] = None,
-                       after_init: Optional[Callable[
-                           [Policy, gym.Space, gym.Space, TrainerConfigDict],
-                           None]] = None,
-                       action_sampler_fn: Optional[Callable[
-                           [TensorType, List[TensorType]], Tuple[
-                               TensorType, TensorType]]] = None,
-                       action_distribution_fn: Optional[Callable[
-                           [Policy, ModelV2, TensorType, TensorType,
-                            TensorType],
-                           Tuple[TensorType, type, List[TensorType]]]] = None,
-                       make_model: Optional[Callable[
-                           [Policy, gym.spaces.Space, gym.spaces.Space,
-                            TrainerConfigDict], ModelV2]] = None,
-                       make_model_and_action_dist: Optional[Callable[
-                           [Policy, gym.spaces.Space, gym.spaces.Space,
-                            TrainerConfigDict],
-                           Tuple[ModelV2, TorchDistributionWrapper]]] = None,
-                       apply_gradients_fn: Optional[Callable[
-                           [Policy, "torch.optim.Optimizer"], None]] = None,
-                       mixins: Optional[List[type]] = None,
-                       training_view_requirements_fn: Optional[Callable[
-                           [], Dict[str, ViewRequirement]]] = None,
-                       get_batch_divisibility_req: Optional[Callable[
-                           [Policy], int]] = None
-                       ):
+def build_torch_policy(
+        name: str,
+        *,
+        loss_fn: Callable[[Policy, ModelV2, type, SampleBatch], TensorType],
+        get_default_config: Optional[Callable[[], TrainerConfigDict]] = None,
+        stats_fn: Optional[Callable[[Policy, SampleBatch], Dict[
+            str, TensorType]]] = None,
+        postprocess_fn: Optional[Callable[[
+            Policy, SampleBatch, List[SampleBatch], "MultiAgentEpisode"
+        ], None]] = None,
+        extra_action_out_fn: Optional[Callable[[
+            Policy, Dict[str, TensorType], List[TensorType], ModelV2,
+            TorchDistributionWrapper
+        ], Dict[str, TensorType]]] = None,
+        extra_grad_process_fn: Optional[Callable[[
+            Policy, "torch.optim.Optimizer", TensorType
+        ], Dict[str, TensorType]]] = None,
+        # TODO: (sven) Replace "fetches" with "process".
+        extra_learn_fetches_fn: Optional[Callable[[Policy], Dict[
+            str, TensorType]]] = None,
+        optimizer_fn: Optional[Callable[[Policy, TrainerConfigDict],
+                                        "torch.optim.Optimizer"]] = None,
+        validate_spaces: Optional[Callable[
+            [Policy, gym.Space, gym.Space, TrainerConfigDict], None]] = None,
+        before_init: Optional[Callable[
+            [Policy, gym.Space, gym.Space, TrainerConfigDict], None]] = None,
+        after_init: Optional[Callable[
+            [Policy, gym.Space, gym.Space, TrainerConfigDict], None]] = None,
+        action_sampler_fn: Optional[Callable[[TensorType, List[
+            TensorType]], Tuple[TensorType, TensorType]]] = None,
+        action_distribution_fn: Optional[Callable[[
+            Policy, ModelV2, TensorType, TensorType, TensorType
+        ], Tuple[TensorType, type, List[TensorType]]]] = None,
+        make_model: Optional[Callable[[
+            Policy, gym.spaces.Space, gym.spaces.Space, TrainerConfigDict
+        ], ModelV2]] = None,
+        make_model_and_action_dist: Optional[Callable[[
+            Policy, gym.spaces.Space, gym.spaces.Space, TrainerConfigDict
+        ], Tuple[ModelV2, TorchDistributionWrapper]]] = None,
+        apply_gradients_fn: Optional[Callable[
+            [Policy, "torch.optim.Optimizer"], None]] = None,
+        mixins: Optional[List[type]] = None,
+        training_view_requirements_fn: Optional[Callable[[], Dict[
+            str, ViewRequirement]]] = None,
+        get_batch_divisibility_req: Optional[Callable[[Policy], int]] = None):
    """Helper function for creating a torch policy class at runtime.

    Args:
@@ -147,7 +147,7 @@ class ModelCatalogTest(unittest.TestCase):
        self.assertEqual(param_shape, action_space.shape)

        # test the class works as a distribution
-        dist_input = tf1.placeholder(tf.float32, (None,) + param_shape)
+        dist_input = tf1.placeholder(tf.float32, (None, ) + param_shape)
        model = Model()
        model.model_config = model_config
        dist = dist_cls(dist_input, model=model)
@@ -161,7 +161,7 @@ class ModelCatalogTest(unittest.TestCase):
        dist_cls, param_shape = ModelCatalog.get_action_dist(
            action_space, model_config)
        self.assertEqual(param_shape, (3, ))
-        dist_input = tf1.placeholder(tf.float32, (None,) + param_shape)
+        dist_input = tf1.placeholder(tf.float32, (None, ) + param_shape)
        model.model_config = model_config
        dist = dist_cls(dist_input, model=model)
        self.assertEqual(dist.sample().shape[1:], dist_input.shape[1:])
@@ -76,7 +76,10 @@ class TestEagerSupportPG(unittest.TestCase):

    def test_impala(self):
        check_support(
-            "IMPALA", {"num_workers": 1, "num_gpus": 0}, test_eager=True)
+            "IMPALA", {
+                "num_workers": 1,
+                "num_gpus": 0
+            }, test_eager=True)


 class TestEagerSupportOffPolicy(unittest.TestCase):
@@ -130,5 +133,6 @@ if __name__ == "__main__":
    # None for all unittest.TestCase classes in this file.
    import pytest
    class_ = sys.argv[1] if len(sys.argv) > 1 else None
-    sys.exit(pytest.main(
-        ["-v", __file__ + ("" if class_ is None else "::" + class_)]))
+    sys.exit(
+        pytest.main(
+            ["-v", __file__ + ("" if class_ is None else "::" + class_)]))
@@ -21,32 +21,34 @@ class TestMultiAgentPendulum(unittest.TestCase):

        # Test for both torch and tf.
        for fw in framework_iterator(frameworks=["torch", "tf"]):
-            trials = run_experiments({
-                "test": {
-                    "run": "PPO",
-                    "env": "multi_agent_pendulum",
-                    "stop": {
-                        "timesteps_total": 500000,
-                        "episode_reward_mean": -300.0,
-                    },
-                    "config": {
-                        "train_batch_size": 2048,
-                        "vf_clip_param": 10.0,
-                        "num_workers": 0,
-                        "num_envs_per_worker": 10,
-                        "lambda": 0.1,
-                        "gamma": 0.95,
-                        "lr": 0.0003,
-                        "sgd_minibatch_size": 64,
-                        "num_sgd_iter": 10,
-                        "model": {
-                            "fcnet_hiddens": [128, 128],
+            trials = run_experiments(
+                {
+                    "test": {
+                        "run": "PPO",
+                        "env": "multi_agent_pendulum",
+                        "stop": {
+                            "timesteps_total": 500000,
+                            "episode_reward_mean": -300.0,
                        },
-                        "batch_mode": "complete_episodes",
-                        "framework": fw,
-                    },
-                }
-            }, verbose=1)
+                        "config": {
+                            "train_batch_size": 2048,
+                            "vf_clip_param": 10.0,
+                            "num_workers": 0,
+                            "num_envs_per_worker": 10,
+                            "lambda": 0.1,
+                            "gamma": 0.95,
+                            "lr": 0.0003,
+                            "sgd_minibatch_size": 64,
+                            "num_sgd_iter": 10,
+                            "model": {
+                                "fcnet_hiddens": [128, 128],
+                            },
+                            "batch_mode": "complete_episodes",
+                            "framework": fw,
+                        },
+                    }
+                },
+                verbose=1)
            if trials[0].last_result["episode_reward_mean"] < -300.0:
                raise ValueError("Did not get to -200 reward",
                                 trials[0].last_result)
@@ -278,7 +278,7 @@ class TestRolloutWorker(unittest.TestCase):

    def test_action_clipping(self):
        from ray.rllib.examples.env.random_env import RandomEnv
-        action_space = gym.spaces.Box(-2.0, 1.0, (3,))
+        action_space = gym.spaces.Box(-2.0, 1.0, (3, ))

        # Clipping: True (clip between Policy's action_space.low/high),
        ev = RolloutWorker(
@@ -125,5 +125,6 @@ if __name__ == "__main__":
    # One can specify the specific TestCase class to run.
    # None for all unittest.TestCase classes in this file.
    class_ = sys.argv[1] if len(sys.argv) > 1 else None
-    sys.exit(pytest.main(
-        ["-v", __file__ + ("" if class_ is None else "::" + class_)]))
+    sys.exit(
+        pytest.main(
+            ["-v", __file__ + ("" if class_ is None else "::" + class_)]))
@@ -203,5 +203,6 @@ if __name__ == "__main__":
    # One can specify the specific TestCase class to run.
    # None for all unittest.TestCase classes in this file.
    class_ = sys.argv[1] if len(sys.argv) > 1 else None
-    sys.exit(pytest.main(
-        ["-v", __file__ + ("" if class_ is None else "::" + class_)]))
+    sys.exit(
+        pytest.main(
+            ["-v", __file__ + ("" if class_ is None else "::" + class_)]))
@@ -99,8 +99,8 @@ class EpsilonGreedy(Exploration):
            tf.random.categorical(random_valid_action_logits, 1), axis=1)

        chose_random = tf.random.uniform(
-            tf.stack([batch_size]),
-            minval=0, maxval=1, dtype=tf.float32) < epsilon
+            tf.stack([batch_size]), minval=0, maxval=1,
+            dtype=tf.float32) < epsilon

        action = tf.cond(
            pred=tf.constant(explore, dtype=tf.bool)
@@ -154,7 +154,7 @@ class GaussianNoise(Exploration):
                scale = self.scale_schedule(self.last_timestep)
                gaussian_sample = scale * torch.normal(
                    mean=torch.zeros(det_actions.size()), std=self.stddev).to(
-                    self.device)
+                        self.device)
                action = torch.min(
                    torch.max(
                        det_actions + gaussian_sample,
@@ -9,23 +9,18 @@ from ray.rllib.utils.test_utils import check, framework_iterator
 class TestParameterNoise(unittest.TestCase):
    def test_ddpg_parameter_noise(self):
        self.do_test_parameter_noise_exploration(
-            ddpg.DDPGTrainer,
-            ddpg.DEFAULT_CONFIG,
-            "Pendulum-v0", {},
+            ddpg.DDPGTrainer, ddpg.DEFAULT_CONFIG, "Pendulum-v0", {},
            np.array([1.0, 0.0, -1.0]))

    def test_dqn_parameter_noise(self):
        self.do_test_parameter_noise_exploration(
-            dqn.DQNTrainer,
-            dqn.DEFAULT_CONFIG,
-            "FrozenLake-v0", {
+            dqn.DQNTrainer, dqn.DEFAULT_CONFIG, "FrozenLake-v0", {
                "is_slippery": False,
                "map_name": "4x4"
-            },
-            np.array(0))
+            }, np.array(0))

-    def do_test_parameter_noise_exploration(
-            self, trainer_cls, config, env, env_config, obs):
+    def do_test_parameter_noise_exploration(self, trainer_cls, config, env,
+                                            env_config, obs):
        """Tests, whether an Agent works with ParameterNoise."""
        core_config = config.copy()
        core_config["num_workers"] = 0  # Run locally.
@@ -200,9 +200,13 @@ def get_variable(value,
            if isinstance(value, float) else tf.int32
            if isinstance(value, int) else None)
        return tf.compat.v1.get_variable(
-            tf_name, initializer=value, dtype=dtype, trainable=trainable,
-            **({} if shape is None else {"shape": shape})
-        )
+            tf_name,
+            initializer=value,
+            dtype=dtype,
+            trainable=trainable,
+            **({} if shape is None else {
+                "shape": shape
+            }))
    elif framework == "torch" and torch_tensor is True:
        torch, _ = try_import_torch()
        var_ = torch.from_numpy(value)
@@ -53,7 +53,8 @@ class TestSchedules(unittest.TestCase):
    def test_polynomial_schedule(self):
        ts = [0, 5, 10, 100, 90, 2, 1, 99, 23, 1000]
        expected = [
-            0.5 + (2.0 - 0.5) * (1.0 - min(t, 100) / 100)**2 for t in ts]
+            0.5 + (2.0 - 0.5) * (1.0 - min(t, 100) / 100)**2 for t in ts
+        ]
        config = dict(
            type="ray.rllib.utils.schedules.polynomial_schedule."
            "PolynomialSchedule",
@@ -12,6 +12,7 @@ class FlexDict(gym.spaces.Dict):
       space['key'] = spaces.Box(4,)
    See also: documentation for gym.spaces.Dict
    """
+
    def __init__(self, spaces=None, **spaces_kwargs):
        err = "Use either Dict(spaces=dict(...)) or Dict(foo=x, bar=z)"
        assert (spaces is None) or (not spaces_kwargs), err
@@ -288,8 +288,8 @@ def check_compute_single_action(trainer,
            method_to_test = trainer.compute_action
            # Get the obs-space from Workers.env (not Policy) due to possible
            # pre-processor up front.
-            worker_set = getattr(
-                trainer, "workers", getattr(trainer, "_workers", None))
+            worker_set = getattr(trainer, "workers",
+                                 getattr(trainer, "_workers", None))
            assert worker_set
            if isinstance(worker_set, list):
                obs_space = trainer.get_policy().observation_space
@@ -34,8 +34,8 @@ def minimize_and_clip(optimizer, objective, var_list, clip_val=10.0):

    if tf.executing_eagerly():
        tape = optimizer.tape
-        grads_and_vars = list(zip(list(
-            tape.gradient(objective, var_list)), var_list))
+        grads_and_vars = list(
+            zip(list(tape.gradient(objective, var_list)), var_list))
    else:
        grads_and_vars = optimizer.compute_gradients(
            objective, var_list=var_list)
@@ -15,6 +15,7 @@
 #pragma once

 #include <flatbuffers/flatbuffers.h>
+
 #include <unordered_set>

 #include "ray/common/id.h"
@@ -13,7 +13,6 @@
 // limitations under the License.

 #include "gtest/gtest.h"
-
 #include "ray/common/common_protocol.h"
 #include "ray/common/task/task_spec.h"

@@ -4,7 +4,6 @@
 #include <sstream>

 #include "absl/container/flat_hash_map.h"
-
 #include "ray/common/bundle_spec.h"
 #include "ray/util/logging.h"

@@ -1,7 +1,7 @@
-#include <sstream>
-
 #include "ray/common/task/task_execution_spec.h"

+#include <sstream>
+
 namespace ray {

 size_t TaskExecutionSpecification::NumForwards() const {
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#include "ray/common/client_connection.h"
+
 #include <boost/asio.hpp>
 #include <boost/asio/error.hpp>
 #include <list>
@@ -20,8 +22,6 @@
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"

-#include "ray/common/client_connection.h"
-
 namespace ray {
 namespace raylet {

@@ -16,11 +16,11 @@

 #include <jni.h>

+#include "jni_utils.h"
 #include "ray/common/id.h"
 #include "ray/core_worker/actor_handle.h"
 #include "ray/core_worker/common.h"
 #include "ray/core_worker/core_worker.h"
-#include "jni_utils.h"

 #ifdef __cplusplus
 extern "C" {
@@ -13,11 +13,13 @@
 // limitations under the License.

 #include "io_ray_runtime_context_NativeWorkerContext.h"
+
 #include <jni.h>
+
+#include "jni_utils.h"
 #include "ray/common/id.h"
 #include "ray/core_worker/context.h"
 #include "ray/core_worker/core_worker.h"
-#include "jni_utils.h"

 #ifdef __cplusplus
 extern "C" {
@@ -16,8 +16,8 @@

 #include <jni.h>

-#include "ray/core_worker/common.h"
 #include "jni_utils.h"
+#include "ray/core_worker/common.h"
 #include "ray/gcs/gcs_client/global_state_accessor.h"

 #ifdef __cplusplus
@@ -13,13 +13,14 @@
 // limitations under the License.

 #include "io_ray_runtime_metric_NativeMetric.h"
-#include "jni_utils.h"
-#include "ray/stats/metric.h"

 #include <jni.h>

 #include <algorithm>
+
+#include "jni_utils.h"
 #include "opencensus/tags/tag_key.h"
+#include "ray/stats/metric.h"

 using TagKeyType = opencensus::tags::TagKey;
 using TagsType = std::vector<std::pair<opencensus::tags::TagKey, std::string>>;
@@ -13,7 +13,9 @@
 // limitations under the License.

 #include "io_ray_runtime_object_NativeObjectStore.h"
+
 #include <jni.h>
+
 #include "jni_utils.h"
 #include "ray/common/id.h"
 #include "ray/core_worker/common.h"
@@ -13,11 +13,13 @@
 // limitations under the License.

 #include "io_ray_runtime_task_NativeTaskExecutor.h"
+
 #include <jni.h>
+
+#include "jni_utils.h"
 #include "ray/common/id.h"
 #include "ray/core_worker/common.h"
 #include "ray/core_worker/core_worker.h"
-#include "jni_utils.h"
 #include "ray/raylet_client/raylet_client.h"

 #ifdef __cplusplus
@@ -15,6 +15,7 @@
 #pragma once

 #include <jni.h>
+
 #include <algorithm>

 #include "ray/common/buffer.h"
@@ -346,7 +347,7 @@ inline jobject NativeVectorToJavaList(
      env->NewObject(java_array_list_class, java_array_list_init_with_capacity,
                     (jint)native_vector.size());
  RAY_CHECK_JAVA_EXCEPTION(env);
-  for (auto it = native_vector.begin(); it != native_vector.end(); ++it){
+  for (auto it = native_vector.begin(); it != native_vector.end(); ++it) {
    auto element = element_converter(env, *it);
    env->CallVoidMethod(java_list, java_list_add, element);
    RAY_CHECK_JAVA_EXCEPTION(env);
@@ -13,8 +13,8 @@
 // limitations under the License.

 #include <thread>
-#include "gtest/gtest.h"

+#include "gtest/gtest.h"
 #include "ray/common/test_util.h"
 #include "ray/core_worker/transport/direct_actor_transport.h"

--- a/Show More
+++ b/Show More