diff --git a/ci/stress_tests/.gitignore b/ci/stress_tests/.gitignore
new file mode 100644
index 000000000..3f2531080
--- /dev/null
+++ b/ci/stress_tests/.gitignore
@@ -0,0 +1,4 @@
+*.log
+*temporary.yaml
+rllib_impala_p36.yaml
+sgd_p36.yaml
diff --git a/ci/stress_tests/application_cluster_template.yaml b/ci/stress_tests/application_cluster_template.yaml
index 9218c2cf7..e96ce4140 100644
--- a/ci/stress_tests/application_cluster_template.yaml
+++ b/ci/stress_tests/application_cluster_template.yaml
@@ -33,11 +33,8 @@ idle_timeout_minutes: 5
 # Cloud-provider specific configuration.
 provider:
     type: aws
-    region: us-west-2
-    # Availability zone(s), comma-separated, that nodes may be launched in.
-    # Nodes are currently spread between zones by a round-robin approach,
-    # however this implementation detail should not be relied upon.
-    availability_zone: us-west-2b
+    region: us-east-1
+    availability_zone: us-east-1a
 
 # How Ray will authenticate with newly launched nodes.
 auth:
@@ -53,7 +50,7 @@ auth:
 # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
 head_node:
     InstanceType: <<<HEAD_TYPE>>>
-    ImageId: ami-0027dfad6168539c7  # Amazon Deep Learning AMI (Ubuntu), Version 21.2
+    ImageId: ami-0757fc5a639fe7666
 
     # You can provision additional disk space with a conf as follows
     BlockDeviceMappings:
@@ -69,11 +66,11 @@ head_node:
 # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
 worker_nodes:
     InstanceType: <<<WORKER_TYPE>>>
-    ImageId: ami-0027dfad6168539c7  # Amazon Deep Learning AMI (Ubuntu), Version 21.2
+    ImageId: ami-0757fc5a639fe7666
 
     # Run workers on spot by default. Comment this out to use on-demand.
-    InstanceMarketOptions:
-        MarketType: spot
+#    InstanceMarketOptions:
+#        MarketType: spot
         # Additional options can be found in the boto docs, e.g.
         #   SpotOptions:
         #       MaxPrice: MAX_HOURLY_PRICE
@@ -89,11 +86,9 @@ file_mounts: {
 
 # List of shell commands to run to set up nodes.
 setup_commands:
-    - echo 'export PATH="$HOME/anaconda3/envs/tensorflow_<<<PYTHON_VERSION>>>/bin:$PATH"' >> ~/.bashrc
-    - ray || wget https://s3-us-west-2.amazonaws.com/ray-wheels/releases/<<<RAY_VERSION>>>/<<<RAY_COMMIT>>>/ray-<<<RAY_VERSION>>>-<<<WHEEL_STR>>>-manylinux1_x86_64.whl
-    - rllib || pip install -U ray-<<<RAY_VERSION>>>-<<<WHEEL_STR>>>-manylinux1_x86_64.whl[rllib]
-    - pip install tensorflow-gpu==1.12.0
-    - echo "sudo halt" | at now + 60 minutes
+    - wget --quiet https://s3-us-west-2.amazonaws.com/ray-wheels/releases/<<<RAY_VERSION>>>/<<<RAY_COMMIT>>>/ray-<<<RAY_VERSION>>>-<<<WHEEL_STR>>>-manylinux1_x86_64.whl
+    - source activate tensorflow_p36 && pip install -U ray-<<<RAY_VERSION>>>-<<<WHEEL_STR>>>-manylinux1_x86_64.whl[rllib]
+    - source activate tensorflow_p36 && pip install ray[rllib] ray[debug]
     # Consider uncommenting these if you also want to run apt-get commands during setup
     # - sudo pkill -9 apt-get || true
     # - sudo pkill -9 dpkg || true
@@ -109,9 +104,9 @@ worker_setup_commands: []
 # Command to start ray on the head node. You don't need to change this.
 head_start_ray_commands:
     - ray stop
-    - ulimit -n 65536; ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
+    - ulimit -n 65536; source activate tensorflow_p36 && OMP_NUM_THREADS=1 ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
 
 # Command to start ray on worker nodes. You don't need to change this.
 worker_start_ray_commands:
     - ray stop
-    - ulimit -n 65536; ray start --redis-address=$RAY_HEAD_IP:6379 --object-manager-port=8076
+    - ulimit -n 65536; source activate tensorflow_p36 && OMP_NUM_THREADS=1 ray start --redis-address=$RAY_HEAD_IP:6379 --object-manager-port=8076
diff --git a/ci/stress_tests/run_application_stress_tests.sh b/ci/stress_tests/run_application_stress_tests.sh
index 3e2530b06..a1ae63bbb 100755
--- a/ci/stress_tests/run_application_stress_tests.sh
+++ b/ci/stress_tests/run_application_stress_tests.sh
@@ -74,9 +74,9 @@ test_impala(){
                 s/<<<RAY_COMMIT>>>/$RAY_COMMIT/;
                 s/<<<CLUSTER_NAME>>>/$TEST_NAME/;
                 s/<<<HEAD_TYPE>>>/p3.16xlarge/;
-                s/<<<WORKER_TYPE>>>/m5.24xlarge/;
-                s/<<<MIN_WORKERS>>>/5/;
-                s/<<<MAX_WORKERS>>>/5/;
+                s/<<<WORKER_TYPE>>>/m4.16xlarge/;
+                s/<<<MIN_WORKERS>>>/9/;
+                s/<<<MAX_WORKERS>>>/9/;
                 s/<<<PYTHON_VERSION>>>/$PYTHON_VERSION/;
                 s/<<<WHEEL_STR>>>/$WHEEL_STR/;" > "$CLUSTER"
 
@@ -85,10 +85,11 @@ test_impala(){
             RLLIB_DIR=../../python/ray/rllib/
             ray --logging-level=DEBUG up -y "$CLUSTER" &&
             ray rsync_up "$CLUSTER" $RLLIB_DIR/tuned_examples/ tuned_examples/ &&
-            sleep 1 &&
-            ray --logging-level=DEBUG exec "$CLUSTER" "rllib || true" &&
-            ray --logging-level=DEBUG exec "$CLUSTER" "
-                rllib train -f tuned_examples/atari-impala-large.yaml --ray-address='localhost:6379' --queue-trials" &&
+            # HACK: the test will deadlock if it scales up slowly, so we have to wait
+            # for the cluster to be fully launched first. This is because the first
+            # trial will occupy all the CPU slots if it can, preventing GPU access.
+            sleep 200 &&
+            ray --logging-level=DEBUG exec "$CLUSTER" "source activate tensorflow_p36 && rllib train -f tuned_examples/atari-impala-large.yaml --ray-address='localhost:6379' --queue-trials" &&
             echo "PASS: IMPALA Test for" "$PYTHON_VERSION" >> "$RESULT_FILE"
         } || echo "FAIL: IMPALA Test for" "$PYTHON_VERSION" >> "$RESULT_FILE"
 
diff --git a/ci/stress_tests/stress_testing_config.yaml b/ci/stress_tests/stress_testing_config.yaml
index ae8789630..0e25799b7 100644
--- a/ci/stress_tests/stress_testing_config.yaml
+++ b/ci/stress_tests/stress_testing_config.yaml
@@ -26,8 +26,8 @@ idle_timeout_minutes: 5
 # Cloud-provider specific configuration.
 provider:
     type: aws
-    region: us-west-2
-    availability_zone: us-west-2a
+    region: us-east-1
+    availability_zone: us-east-1a
 
 # How Ray will authenticate with newly launched nodes.
 auth:
@@ -42,14 +42,14 @@ auth:
 # For more documentation on available fields, see:
 # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
 head_node:
-    InstanceType: m5.12xlarge
-    ImageId: ami-0def3275  # Default Ubuntu 16.04 AMI.
+    InstanceType: m4.16xlarge
+    ImageId: ami-0757fc5a639fe7666
 
     # Set primary volume to 25 GiB
     BlockDeviceMappings:
         - DeviceName: /dev/sda1
           Ebs:
-              VolumeSize: 50
+              VolumeSize: 100
 
     # Additional options in the boto docs.
 
@@ -58,14 +58,14 @@ head_node:
 # For more documentation on available fields, see:
 # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
 worker_nodes:
-    InstanceType: m5.large
-    ImageId: ami-0def3275  # Default Ubuntu 16.04 AMI.
+    InstanceType: m4.large
+    ImageId: ami-0757fc5a639fe7666
 
     # Set primary volume to 25 GiB
     BlockDeviceMappings:
         - DeviceName: /dev/sda1
           Ebs:
-              VolumeSize: 50
+              VolumeSize: 100
 
     # Run workers on spot by default. Comment this out to use on-demand.
     InstanceMarketOptions:
@@ -86,14 +86,14 @@ file_mounts: {
 # List of shell commands to run to set up nodes.
 setup_commands:
     # Consider uncommenting these if you run into dpkg locking issues
-    # - sudo pkill -9 apt-get || true
-    # - sudo pkill -9 dpkg || true
-    # - sudo dpkg --configure -a
+    - sudo pkill -9 apt-get || true
+    - sudo pkill -9 dpkg || true
+    - sudo dpkg --configure -a
     # Install basics.
-    - sudo apt-get update
+    - sudo apt-get -qq update
     - sudo apt-get install -y build-essential curl unzip
     # Install Anaconda.
-    - wget https://repo.continuum.io/archive/Anaconda3-5.0.1-Linux-x86_64.sh || true
+    - wget --quiet https://repo.continuum.io/archive/Anaconda3-5.0.1-Linux-x86_64.sh || true
     - bash Anaconda3-5.0.1-Linux-x86_64.sh -b -p $HOME/anaconda3 || true
     - echo 'export PATH="$HOME/anaconda3/bin:$PATH"' >> ~/.bashrc
     # # Build Ray.
@@ -102,7 +102,6 @@ setup_commands:
     - pip install boto3==1.4.8 cython==0.29.0
     # - cd ray/python; git checkout master; git pull; pip install -e . --verbose
     - pip install https://s3-us-west-2.amazonaws.com/ray-wheels/releases/<<<RAY_VERSION>>>/<<<RAY_COMMIT>>>/ray-<<<RAY_VERSION>>>-cp36-cp36m-manylinux1_x86_64.whl
-    - echo "sudo halt" | at now + 60 minutes
 
 # Custom commands that will be run on the head node after common setup.
 head_setup_commands: []
diff --git a/ci/stress_tests/test_many_tasks.py b/ci/stress_tests/test_many_tasks.py
index 99c1de02e..70b36bc19 100644
--- a/ci/stress_tests/test_many_tasks.py
+++ b/ci/stress_tests/test_many_tasks.py
@@ -10,7 +10,7 @@ import time
 
 import ray
 
-logging.basicConfig(level=logging.DEBUG)
+logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 ray.init(address="localhost:6379")
@@ -24,8 +24,12 @@ num_remote_cpus = num_remote_nodes * head_node_cpus
 
 # Wait until the expected number of nodes have joined the cluster.
 while True:
-    if len(ray.nodes()) >= num_remote_nodes + 1:
+    num_nodes = len(ray.nodes())
+    logger.info("Waiting for nodes {}/{}".format(num_nodes,
+                                                 num_remote_nodes + 1))
+    if num_nodes >= num_remote_nodes + 1:
         break
+    time.sleep(5)
 logger.info("Nodes have all joined. There are %s resources.",
             ray.cluster_resources())
 
@@ -74,10 +78,13 @@ logger.info("Finished after %s seconds.", time.time() - start_time)
 # Submit a bunch of small tasks to each actor. (approximately 1070 seconds)
 start_time = time.time()
 logger.info("Submitting many small actor tasks.")
-x_ids = []
-for _ in range(100000):
-    x_ids = [a.method.remote(0) for a in actors]
-ray.get(x_ids)
+for N in [1000, 100000]:
+    x_ids = []
+    for i in range(N):
+        x_ids = [a.method.remote(0) for a in actors]
+        if i % 100 == 0:
+            logger.info("Submitted {}".format(i * len(actors)))
+    ray.get(x_ids)
 logger.info("Finished after %s seconds.", time.time() - start_time)
 
 # TODO(rkn): The test below is commented out because it currently does not
diff --git a/doc/dev/RELEASE_PROCESS.rst b/doc/dev/RELEASE_PROCESS.rst
index 087d4d92f..3c208adb5 100644
--- a/doc/dev/RELEASE_PROCESS.rst
+++ b/doc/dev/RELEASE_PROCESS.rst
@@ -3,12 +3,19 @@ Release Process
 
 This document describes the process for creating new releases.
 
-1. **Increment the Python version:** Create a PR that increments the Python
-   package version. See `this example`_.
+1. **Create a release branch:** Create the branch from the desired commit on master
+   In order to create the branch, locally checkout the commit ID i.e.,
+   ``git checkout <hash>``. Then checkout a new branch of the format
+   ``releases/<release-version>``. Then push that branch to the ray repo:
+   ``git push upstream releases/<release-version>``.
 
-2. **Bump version on Ray master branch again:** Create a pull request to
-   increment the version of the master branch, see `this PR`_. The format of
-   the new version is as follows:
+2. **Update the release branch version:** Push a commit that increments the Python
+   package version in python/ray/__init__.py. You can push this directly to the
+   release branch.
+
+3. **Update the master branch version:** Create a pull request to
+   increment the version of the master branch, see `this PR`_.
+   The format of the new version is as follows:
 
    New minor release (e.g., 0.7.0): Increment the minor version and append
    ``.dev0`` to the version. For example, if the version of the new release is
@@ -26,12 +33,6 @@ This document describes the process for creating new releases.
    in the documentation keep working and the master stays on the development
    version.
 
-3. **Create a release branch:** Create the branch from the version bump PR (the
-   one from step 1, not step 2). In order to create the branch, locally checkout
-   the commit ID i.e., ``git checkout <hash>``. Then checkout a new branch of
-   the format ``releases/<release-version>``. Then push that branch to the ray
-   repo: ``git push upstream releases/<release-version>``.
-
 4. **Testing:** Before a release is created, significant testing should be done.
    Run the following scripts
 
@@ -44,6 +45,17 @@ This document describes the process for creating new releases.
    This will use the autoscaler to start a bunch of machines and run some tests.
    **Caution!**: By default, the stress tests will require expensive GPU instances.
 
+   You'll also want to kick off the long-running tests:
+
+   .. code-block:: bash
+
+       ray/ci/long_running_tests/start_workloads.sh
+
+   You can use the `check_workloads.sh` script to verify the workloads are running.
+   Let them run for at least 24 hours, and check them again. They should all still
+   be running (printing new iterations), and their CPU load should be stable when
+   you view them in the AWS monitoring console (not increasing over time).
+
 5. **Resolve release-blockers:** If a release blocking issue arises, there are
    two ways the issue can be resolved: 1) Fix the issue on the master branch and
    cherry-pick the relevant commit  (using ``git cherry-pick``) onto the release
@@ -129,7 +141,7 @@ This document describes the process for creating new releases.
 
 
     At the end of the release note, you can add a list of contributors that help
-    creating this release. Use the ``dev/get_contributors.py`` to generate this
+    creating this release. Use the ``doc/dev/get_contributors.py`` to generate this
     list. You will need to create a GitHub token for this task. Example usage:
 
     .. code-block:: bash
@@ -141,6 +153,13 @@ This document describes the process for creating new releases.
         --prev-branch="ray-0.7.1" \
         --curr-branch="ray-0.7.2"
 
+    Run `ray microbenchmark` to get the latest microbenchmark numbers, and
+    update their numbers in `profiling.rst`.
+
+    .. code-block:: bash
+
+      ray microbenchmark
+
 10. **Update version numbers throughout codebase:** Suppose we just released
     0.7.1. The previous release version number (in this case 0.7.0) and the
     previous dev version number (in this case 0.8.0.dev0) appear in many places
diff --git a/doc/source/profiling.rst b/doc/source/profiling.rst
index 55ed8de6f..3d13860ee 100644
--- a/doc/source/profiling.rst
+++ b/doc/source/profiling.rst
@@ -52,6 +52,31 @@ documentation:
 
 .. image:: http://goog-perftools.sourceforge.net/doc/pprof-test-big.gif
 
+Running Microbenchmarks
+-----------------------
+
+To run a set of single-node Ray microbenchmarks, use:
+
+.. code-block:: bash
+
+  ray microbenchmark
+
+The following are the results for the 0.7.5 release on a Python 3 / a m4.16xl instance:
+
+.. code-block:: text
+
+  single core get calls per second 12169.8 +- 386.41
+  single core put calls per second 3117.45 +- 94.17
+  single core put gigabytes per second 11.32 +- 3.4
+  multi core put calls per second 16221.06 +- 895.13
+  multi core put gigabytes per second 24.14 +- 0.29
+  single core tasks sync per second 887.77 +- 3.69
+  single core tasks async per second 4524.45 +- 196.39
+  multi core tasks async per second 6963.49 +- 161.31
+  single core actor calls sync per second 762.4 +- 56.47
+  single core actor calls async per second 1030.44 +- 45.42
+  multi core actor calls async per second 6065.92 +- 175.05
+
 References
 ----------
 
diff --git a/doc/source/rllib.rst b/doc/source/rllib.rst
index 7a5d47a4a..8250117f2 100644
--- a/doc/source/rllib.rst
+++ b/doc/source/rllib.rst
@@ -5,7 +5,7 @@ RLlib is an open-source library for reinforcement learning that offers both high
 
 .. image:: rllib-stack.svg
 
-To get started, take a look over the `custom env example <https://github.com/ray-project/ray/blob/master/rllib/examples/custom_env.py>`__ and the `API documentation <rllib-training.html>`__. If you're looking to develop custom algorithms with RLlib, also check out `concepts and custom algorithms <rllib-concepts.html>`__.
+To get started, take a look over the `custom env example <https://github.com/ray-project/ray/blob/master/rllib/examples/custom_env.py>`__ and the `API documentation <rllib-toc.html>`__. If you're looking to develop custom algorithms with RLlib, also check out `concepts and custom algorithms <rllib-concepts.html>`__.
 
 RLlib in 60 seconds
 -------------------
diff --git a/python/ray/__init__.py b/python/ray/__init__.py
index da985b926..db0d3010c 100644
--- a/python/ray/__init__.py
+++ b/python/ray/__init__.py
@@ -110,7 +110,7 @@ from ray.actor import method  # noqa: E402
 from ray.runtime_context import _get_runtime_context  # noqa: E402
 
 # Ray version string.
-__version__ = "0.8.0.dev4"
+__version__ = "0.8.0.dev5"
 
 __all__ = [
     "global_state",
diff --git a/python/ray/ray_perf.py b/python/ray/ray_perf.py
new file mode 100644
index 000000000..8926dbb69
--- /dev/null
+++ b/python/ray/ray_perf.py
@@ -0,0 +1,137 @@
+"""This is the script for `ray microbenchmark`."""
+
+import time
+import numpy as np
+import multiprocessing
+import ray
+
+
+@ray.remote
+class Actor(object):
+    def small_value(self):
+        return 0
+
+    def small_value_batch(self, n):
+        ray.get([small_value.remote() for _ in range(n)])
+
+
+@ray.remote
+def small_value():
+    return 0
+
+
+@ray.remote
+def small_value_batch(n):
+    submitted = [small_value.remote() for _ in range(n)]
+    ray.get(submitted)
+    return 0
+
+
+def timeit(name, fn, multiplier=1):
+    # warmup
+    start = time.time()
+    while time.time() - start < 1:
+        fn()
+    # real run
+    stats = []
+    for _ in range(4):
+        start = time.time()
+        count = 0
+        while time.time() - start < 2:
+            fn()
+            count += 1
+        end = time.time()
+        stats.append(multiplier * count / (end - start))
+    print(name, "per second", round(np.mean(stats), 2), "+-",
+          round(np.std(stats), 2))
+
+
+def main():
+    ray.init()
+    value = ray.put(0)
+    arr = np.zeros(100 * 1024 * 1024, dtype=np.int64)
+
+    def get_small():
+        ray.get(value)
+
+    timeit("single core get calls", get_small)
+
+    def put_small():
+        ray.put(0)
+
+    timeit("single core put calls", put_small)
+
+    def put_large():
+        ray.put(arr)
+
+    timeit("single core put gigabytes", put_large, 8 * 0.1)
+
+    @ray.remote
+    def do_put_small():
+        for _ in range(100):
+            ray.put(0)
+
+    def put_multi_small():
+        ray.get([do_put_small.remote() for _ in range(10)])
+
+    timeit("multi core put calls", put_multi_small, 1000)
+
+    @ray.remote
+    def do_put():
+        for _ in range(10):
+            ray.put(np.zeros(10 * 1024 * 1024, dtype=np.int64))
+
+    def put_multi():
+        ray.get([do_put.remote() for _ in range(10)])
+
+    timeit("multi core put gigabytes", put_multi, 10 * 8 * 0.1)
+
+    def small_task():
+        ray.get(small_value.remote())
+
+    timeit("single core tasks sync", small_task)
+
+    def small_task_async():
+        ray.get([small_value.remote() for _ in range(1000)])
+
+    timeit("single core tasks async", small_task_async, 1000)
+
+    n = 10000
+    m = 4
+    actors = [Actor.remote() for _ in range(m)]
+
+    def multi_task():
+        submitted = [a.small_value_batch.remote(n) for a in actors]
+        ray.get(submitted)
+
+    timeit("multi core tasks async", multi_task, n * m)
+
+    a = Actor.remote()
+
+    def actor_sync():
+        ray.get(a.small_value.remote())
+
+    timeit("single core actor calls sync", actor_sync)
+
+    a = Actor.remote()
+
+    def actor_async():
+        ray.get([a.small_value.remote() for _ in range(1000)])
+
+    timeit("single core actor calls async", actor_async, 1000)
+
+    n_cpu = multiprocessing.cpu_count() // 2
+    a = [Actor.remote() for _ in range(n_cpu)]
+
+    @ray.remote
+    def work(actors):
+        ray.get([actors[i % n_cpu].small_value.remote() for i in range(n)])
+
+    def actor_multi2():
+        ray.get([work.remote(a) for _ in range(m)])
+
+    timeit("multi core actor calls async", actor_multi2, m * n)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/python/ray/resource_spec.py b/python/ray/resource_spec.py
index f5d1a4c39..cc572b784 100644
--- a/python/ray/resource_spec.py
+++ b/python/ray/resource_spec.py
@@ -156,7 +156,7 @@ class ResourceSpec(
             # Cap memory to avoid memory waste and perf issues on large nodes
             if (object_store_memory >
                     ray_constants.DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES):
-                logger.warning(
+                logger.debug(
                     "Warning: Capping object memory store to {}GB. ".format(
                         ray_constants.DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES //
                         1e9) +
diff --git a/python/ray/scripts/scripts.py b/python/ray/scripts/scripts.py
index f6365df51..0d8754fcd 100644
--- a/python/ray/scripts/scripts.py
+++ b/python/ray/scripts/scripts.py
@@ -758,6 +758,12 @@ done
     subprocess.call(COMMAND, shell=True)
 
 
+@cli.command()
+def microbenchmark():
+    from ray.ray_perf import main
+    main()
+
+
 @cli.command()
 @click.option(
     "--redis-address",
@@ -791,6 +797,7 @@ cli.add_command(teardown, name="down")
 cli.add_command(kill_random_node)
 cli.add_command(get_head_ip, name="get_head_ip")
 cli.add_command(get_worker_ips)
+cli.add_command(microbenchmark)
 cli.add_command(stack)
 cli.add_command(timeline)
 cli.add_command(project_cli)