diff --git a/ci/stress_tests/.gitignore b/ci/stress_tests/.gitignore new file mode 100644 index 000000000..3f2531080 --- /dev/null +++ b/ci/stress_tests/.gitignore @@ -0,0 +1,4 @@ +*.log +*temporary.yaml +rllib_impala_p36.yaml +sgd_p36.yaml diff --git a/ci/stress_tests/application_cluster_template.yaml b/ci/stress_tests/application_cluster_template.yaml index 9218c2cf7..e96ce4140 100644 --- a/ci/stress_tests/application_cluster_template.yaml +++ b/ci/stress_tests/application_cluster_template.yaml @@ -33,11 +33,8 @@ idle_timeout_minutes: 5 # Cloud-provider specific configuration. provider: type: aws - region: us-west-2 - # Availability zone(s), comma-separated, that nodes may be launched in. - # Nodes are currently spread between zones by a round-robin approach, - # however this implementation detail should not be relied upon. - availability_zone: us-west-2b + region: us-east-1 + availability_zone: us-east-1a # How Ray will authenticate with newly launched nodes. auth: @@ -53,7 +50,7 @@ auth: # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances head_node: InstanceType: <<>> - ImageId: ami-0027dfad6168539c7 # Amazon Deep Learning AMI (Ubuntu), Version 21.2 + ImageId: ami-0757fc5a639fe7666 # You can provision additional disk space with a conf as follows BlockDeviceMappings: @@ -69,11 +66,11 @@ head_node: # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances worker_nodes: InstanceType: <<>> - ImageId: ami-0027dfad6168539c7 # Amazon Deep Learning AMI (Ubuntu), Version 21.2 + ImageId: ami-0757fc5a639fe7666 # Run workers on spot by default. Comment this out to use on-demand. - InstanceMarketOptions: - MarketType: spot +# InstanceMarketOptions: +# MarketType: spot # Additional options can be found in the boto docs, e.g. # SpotOptions: # MaxPrice: MAX_HOURLY_PRICE @@ -89,11 +86,9 @@ file_mounts: { # List of shell commands to run to set up nodes. setup_commands: - - echo 'export PATH="$HOME/anaconda3/envs/tensorflow_<<>>/bin:$PATH"' >> ~/.bashrc - - ray || wget https://s3-us-west-2.amazonaws.com/ray-wheels/releases/<<>>/<<>>/ray-<<>>-<<>>-manylinux1_x86_64.whl - - rllib || pip install -U ray-<<>>-<<>>-manylinux1_x86_64.whl[rllib] - - pip install tensorflow-gpu==1.12.0 - - echo "sudo halt" | at now + 60 minutes + - wget --quiet https://s3-us-west-2.amazonaws.com/ray-wheels/releases/<<>>/<<>>/ray-<<>>-<<>>-manylinux1_x86_64.whl + - source activate tensorflow_p36 && pip install -U ray-<<>>-<<>>-manylinux1_x86_64.whl[rllib] + - source activate tensorflow_p36 && pip install ray[rllib] ray[debug] # Consider uncommenting these if you also want to run apt-get commands during setup # - sudo pkill -9 apt-get || true # - sudo pkill -9 dpkg || true @@ -109,9 +104,9 @@ worker_setup_commands: [] # Command to start ray on the head node. You don't need to change this. head_start_ray_commands: - ray stop - - ulimit -n 65536; ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml + - ulimit -n 65536; source activate tensorflow_p36 && OMP_NUM_THREADS=1 ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml # Command to start ray on worker nodes. You don't need to change this. worker_start_ray_commands: - ray stop - - ulimit -n 65536; ray start --redis-address=$RAY_HEAD_IP:6379 --object-manager-port=8076 + - ulimit -n 65536; source activate tensorflow_p36 && OMP_NUM_THREADS=1 ray start --redis-address=$RAY_HEAD_IP:6379 --object-manager-port=8076 diff --git a/ci/stress_tests/run_application_stress_tests.sh b/ci/stress_tests/run_application_stress_tests.sh index 3e2530b06..a1ae63bbb 100755 --- a/ci/stress_tests/run_application_stress_tests.sh +++ b/ci/stress_tests/run_application_stress_tests.sh @@ -74,9 +74,9 @@ test_impala(){ s/<<>>/$RAY_COMMIT/; s/<<>>/$TEST_NAME/; s/<<>>/p3.16xlarge/; - s/<<>>/m5.24xlarge/; - s/<<>>/5/; - s/<<>>/5/; + s/<<>>/m4.16xlarge/; + s/<<>>/9/; + s/<<>>/9/; s/<<>>/$PYTHON_VERSION/; s/<<>>/$WHEEL_STR/;" > "$CLUSTER" @@ -85,10 +85,11 @@ test_impala(){ RLLIB_DIR=../../python/ray/rllib/ ray --logging-level=DEBUG up -y "$CLUSTER" && ray rsync_up "$CLUSTER" $RLLIB_DIR/tuned_examples/ tuned_examples/ && - sleep 1 && - ray --logging-level=DEBUG exec "$CLUSTER" "rllib || true" && - ray --logging-level=DEBUG exec "$CLUSTER" " - rllib train -f tuned_examples/atari-impala-large.yaml --ray-address='localhost:6379' --queue-trials" && + # HACK: the test will deadlock if it scales up slowly, so we have to wait + # for the cluster to be fully launched first. This is because the first + # trial will occupy all the CPU slots if it can, preventing GPU access. + sleep 200 && + ray --logging-level=DEBUG exec "$CLUSTER" "source activate tensorflow_p36 && rllib train -f tuned_examples/atari-impala-large.yaml --ray-address='localhost:6379' --queue-trials" && echo "PASS: IMPALA Test for" "$PYTHON_VERSION" >> "$RESULT_FILE" } || echo "FAIL: IMPALA Test for" "$PYTHON_VERSION" >> "$RESULT_FILE" diff --git a/ci/stress_tests/stress_testing_config.yaml b/ci/stress_tests/stress_testing_config.yaml index ae8789630..0e25799b7 100644 --- a/ci/stress_tests/stress_testing_config.yaml +++ b/ci/stress_tests/stress_testing_config.yaml @@ -26,8 +26,8 @@ idle_timeout_minutes: 5 # Cloud-provider specific configuration. provider: type: aws - region: us-west-2 - availability_zone: us-west-2a + region: us-east-1 + availability_zone: us-east-1a # How Ray will authenticate with newly launched nodes. auth: @@ -42,14 +42,14 @@ auth: # For more documentation on available fields, see: # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances head_node: - InstanceType: m5.12xlarge - ImageId: ami-0def3275 # Default Ubuntu 16.04 AMI. + InstanceType: m4.16xlarge + ImageId: ami-0757fc5a639fe7666 # Set primary volume to 25 GiB BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: - VolumeSize: 50 + VolumeSize: 100 # Additional options in the boto docs. @@ -58,14 +58,14 @@ head_node: # For more documentation on available fields, see: # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances worker_nodes: - InstanceType: m5.large - ImageId: ami-0def3275 # Default Ubuntu 16.04 AMI. + InstanceType: m4.large + ImageId: ami-0757fc5a639fe7666 # Set primary volume to 25 GiB BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: - VolumeSize: 50 + VolumeSize: 100 # Run workers on spot by default. Comment this out to use on-demand. InstanceMarketOptions: @@ -86,14 +86,14 @@ file_mounts: { # List of shell commands to run to set up nodes. setup_commands: # Consider uncommenting these if you run into dpkg locking issues - # - sudo pkill -9 apt-get || true - # - sudo pkill -9 dpkg || true - # - sudo dpkg --configure -a + - sudo pkill -9 apt-get || true + - sudo pkill -9 dpkg || true + - sudo dpkg --configure -a # Install basics. - - sudo apt-get update + - sudo apt-get -qq update - sudo apt-get install -y build-essential curl unzip # Install Anaconda. - - wget https://repo.continuum.io/archive/Anaconda3-5.0.1-Linux-x86_64.sh || true + - wget --quiet https://repo.continuum.io/archive/Anaconda3-5.0.1-Linux-x86_64.sh || true - bash Anaconda3-5.0.1-Linux-x86_64.sh -b -p $HOME/anaconda3 || true - echo 'export PATH="$HOME/anaconda3/bin:$PATH"' >> ~/.bashrc # # Build Ray. @@ -102,7 +102,6 @@ setup_commands: - pip install boto3==1.4.8 cython==0.29.0 # - cd ray/python; git checkout master; git pull; pip install -e . --verbose - pip install https://s3-us-west-2.amazonaws.com/ray-wheels/releases/<<>>/<<>>/ray-<<>>-cp36-cp36m-manylinux1_x86_64.whl - - echo "sudo halt" | at now + 60 minutes # Custom commands that will be run on the head node after common setup. head_setup_commands: [] diff --git a/ci/stress_tests/test_many_tasks.py b/ci/stress_tests/test_many_tasks.py index 99c1de02e..70b36bc19 100644 --- a/ci/stress_tests/test_many_tasks.py +++ b/ci/stress_tests/test_many_tasks.py @@ -10,7 +10,7 @@ import time import ray -logging.basicConfig(level=logging.DEBUG) +logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) ray.init(address="localhost:6379") @@ -24,8 +24,12 @@ num_remote_cpus = num_remote_nodes * head_node_cpus # Wait until the expected number of nodes have joined the cluster. while True: - if len(ray.nodes()) >= num_remote_nodes + 1: + num_nodes = len(ray.nodes()) + logger.info("Waiting for nodes {}/{}".format(num_nodes, + num_remote_nodes + 1)) + if num_nodes >= num_remote_nodes + 1: break + time.sleep(5) logger.info("Nodes have all joined. There are %s resources.", ray.cluster_resources()) @@ -74,10 +78,13 @@ logger.info("Finished after %s seconds.", time.time() - start_time) # Submit a bunch of small tasks to each actor. (approximately 1070 seconds) start_time = time.time() logger.info("Submitting many small actor tasks.") -x_ids = [] -for _ in range(100000): - x_ids = [a.method.remote(0) for a in actors] -ray.get(x_ids) +for N in [1000, 100000]: + x_ids = [] + for i in range(N): + x_ids = [a.method.remote(0) for a in actors] + if i % 100 == 0: + logger.info("Submitted {}".format(i * len(actors))) + ray.get(x_ids) logger.info("Finished after %s seconds.", time.time() - start_time) # TODO(rkn): The test below is commented out because it currently does not diff --git a/doc/dev/RELEASE_PROCESS.rst b/doc/dev/RELEASE_PROCESS.rst index 087d4d92f..3c208adb5 100644 --- a/doc/dev/RELEASE_PROCESS.rst +++ b/doc/dev/RELEASE_PROCESS.rst @@ -3,12 +3,19 @@ Release Process This document describes the process for creating new releases. -1. **Increment the Python version:** Create a PR that increments the Python - package version. See `this example`_. +1. **Create a release branch:** Create the branch from the desired commit on master + In order to create the branch, locally checkout the commit ID i.e., + ``git checkout ``. Then checkout a new branch of the format + ``releases/``. Then push that branch to the ray repo: + ``git push upstream releases/``. -2. **Bump version on Ray master branch again:** Create a pull request to - increment the version of the master branch, see `this PR`_. The format of - the new version is as follows: +2. **Update the release branch version:** Push a commit that increments the Python + package version in python/ray/__init__.py. You can push this directly to the + release branch. + +3. **Update the master branch version:** Create a pull request to + increment the version of the master branch, see `this PR`_. + The format of the new version is as follows: New minor release (e.g., 0.7.0): Increment the minor version and append ``.dev0`` to the version. For example, if the version of the new release is @@ -26,12 +33,6 @@ This document describes the process for creating new releases. in the documentation keep working and the master stays on the development version. -3. **Create a release branch:** Create the branch from the version bump PR (the - one from step 1, not step 2). In order to create the branch, locally checkout - the commit ID i.e., ``git checkout ``. Then checkout a new branch of - the format ``releases/``. Then push that branch to the ray - repo: ``git push upstream releases/``. - 4. **Testing:** Before a release is created, significant testing should be done. Run the following scripts @@ -44,6 +45,17 @@ This document describes the process for creating new releases. This will use the autoscaler to start a bunch of machines and run some tests. **Caution!**: By default, the stress tests will require expensive GPU instances. + You'll also want to kick off the long-running tests: + + .. code-block:: bash + + ray/ci/long_running_tests/start_workloads.sh + + You can use the `check_workloads.sh` script to verify the workloads are running. + Let them run for at least 24 hours, and check them again. They should all still + be running (printing new iterations), and their CPU load should be stable when + you view them in the AWS monitoring console (not increasing over time). + 5. **Resolve release-blockers:** If a release blocking issue arises, there are two ways the issue can be resolved: 1) Fix the issue on the master branch and cherry-pick the relevant commit (using ``git cherry-pick``) onto the release @@ -129,7 +141,7 @@ This document describes the process for creating new releases. At the end of the release note, you can add a list of contributors that help - creating this release. Use the ``dev/get_contributors.py`` to generate this + creating this release. Use the ``doc/dev/get_contributors.py`` to generate this list. You will need to create a GitHub token for this task. Example usage: .. code-block:: bash @@ -141,6 +153,13 @@ This document describes the process for creating new releases. --prev-branch="ray-0.7.1" \ --curr-branch="ray-0.7.2" + Run `ray microbenchmark` to get the latest microbenchmark numbers, and + update their numbers in `profiling.rst`. + + .. code-block:: bash + + ray microbenchmark + 10. **Update version numbers throughout codebase:** Suppose we just released 0.7.1. The previous release version number (in this case 0.7.0) and the previous dev version number (in this case 0.8.0.dev0) appear in many places diff --git a/doc/source/profiling.rst b/doc/source/profiling.rst index 55ed8de6f..3d13860ee 100644 --- a/doc/source/profiling.rst +++ b/doc/source/profiling.rst @@ -52,6 +52,31 @@ documentation: .. image:: http://goog-perftools.sourceforge.net/doc/pprof-test-big.gif +Running Microbenchmarks +----------------------- + +To run a set of single-node Ray microbenchmarks, use: + +.. code-block:: bash + + ray microbenchmark + +The following are the results for the 0.7.5 release on a Python 3 / a m4.16xl instance: + +.. code-block:: text + + single core get calls per second 12169.8 +- 386.41 + single core put calls per second 3117.45 +- 94.17 + single core put gigabytes per second 11.32 +- 3.4 + multi core put calls per second 16221.06 +- 895.13 + multi core put gigabytes per second 24.14 +- 0.29 + single core tasks sync per second 887.77 +- 3.69 + single core tasks async per second 4524.45 +- 196.39 + multi core tasks async per second 6963.49 +- 161.31 + single core actor calls sync per second 762.4 +- 56.47 + single core actor calls async per second 1030.44 +- 45.42 + multi core actor calls async per second 6065.92 +- 175.05 + References ---------- diff --git a/doc/source/rllib.rst b/doc/source/rllib.rst index 7a5d47a4a..8250117f2 100644 --- a/doc/source/rllib.rst +++ b/doc/source/rllib.rst @@ -5,7 +5,7 @@ RLlib is an open-source library for reinforcement learning that offers both high .. image:: rllib-stack.svg -To get started, take a look over the `custom env example `__ and the `API documentation `__. If you're looking to develop custom algorithms with RLlib, also check out `concepts and custom algorithms `__. +To get started, take a look over the `custom env example `__ and the `API documentation `__. If you're looking to develop custom algorithms with RLlib, also check out `concepts and custom algorithms `__. RLlib in 60 seconds ------------------- diff --git a/python/ray/__init__.py b/python/ray/__init__.py index da985b926..db0d3010c 100644 --- a/python/ray/__init__.py +++ b/python/ray/__init__.py @@ -110,7 +110,7 @@ from ray.actor import method # noqa: E402 from ray.runtime_context import _get_runtime_context # noqa: E402 # Ray version string. -__version__ = "0.8.0.dev4" +__version__ = "0.8.0.dev5" __all__ = [ "global_state", diff --git a/python/ray/ray_perf.py b/python/ray/ray_perf.py new file mode 100644 index 000000000..8926dbb69 --- /dev/null +++ b/python/ray/ray_perf.py @@ -0,0 +1,137 @@ +"""This is the script for `ray microbenchmark`.""" + +import time +import numpy as np +import multiprocessing +import ray + + +@ray.remote +class Actor(object): + def small_value(self): + return 0 + + def small_value_batch(self, n): + ray.get([small_value.remote() for _ in range(n)]) + + +@ray.remote +def small_value(): + return 0 + + +@ray.remote +def small_value_batch(n): + submitted = [small_value.remote() for _ in range(n)] + ray.get(submitted) + return 0 + + +def timeit(name, fn, multiplier=1): + # warmup + start = time.time() + while time.time() - start < 1: + fn() + # real run + stats = [] + for _ in range(4): + start = time.time() + count = 0 + while time.time() - start < 2: + fn() + count += 1 + end = time.time() + stats.append(multiplier * count / (end - start)) + print(name, "per second", round(np.mean(stats), 2), "+-", + round(np.std(stats), 2)) + + +def main(): + ray.init() + value = ray.put(0) + arr = np.zeros(100 * 1024 * 1024, dtype=np.int64) + + def get_small(): + ray.get(value) + + timeit("single core get calls", get_small) + + def put_small(): + ray.put(0) + + timeit("single core put calls", put_small) + + def put_large(): + ray.put(arr) + + timeit("single core put gigabytes", put_large, 8 * 0.1) + + @ray.remote + def do_put_small(): + for _ in range(100): + ray.put(0) + + def put_multi_small(): + ray.get([do_put_small.remote() for _ in range(10)]) + + timeit("multi core put calls", put_multi_small, 1000) + + @ray.remote + def do_put(): + for _ in range(10): + ray.put(np.zeros(10 * 1024 * 1024, dtype=np.int64)) + + def put_multi(): + ray.get([do_put.remote() for _ in range(10)]) + + timeit("multi core put gigabytes", put_multi, 10 * 8 * 0.1) + + def small_task(): + ray.get(small_value.remote()) + + timeit("single core tasks sync", small_task) + + def small_task_async(): + ray.get([small_value.remote() for _ in range(1000)]) + + timeit("single core tasks async", small_task_async, 1000) + + n = 10000 + m = 4 + actors = [Actor.remote() for _ in range(m)] + + def multi_task(): + submitted = [a.small_value_batch.remote(n) for a in actors] + ray.get(submitted) + + timeit("multi core tasks async", multi_task, n * m) + + a = Actor.remote() + + def actor_sync(): + ray.get(a.small_value.remote()) + + timeit("single core actor calls sync", actor_sync) + + a = Actor.remote() + + def actor_async(): + ray.get([a.small_value.remote() for _ in range(1000)]) + + timeit("single core actor calls async", actor_async, 1000) + + n_cpu = multiprocessing.cpu_count() // 2 + a = [Actor.remote() for _ in range(n_cpu)] + + @ray.remote + def work(actors): + ray.get([actors[i % n_cpu].small_value.remote() for i in range(n)]) + + def actor_multi2(): + ray.get([work.remote(a) for _ in range(m)]) + + timeit("multi core actor calls async", actor_multi2, m * n) + + +if __name__ == "__main__": + main() diff --git a/python/ray/resource_spec.py b/python/ray/resource_spec.py index f5d1a4c39..cc572b784 100644 --- a/python/ray/resource_spec.py +++ b/python/ray/resource_spec.py @@ -156,7 +156,7 @@ class ResourceSpec( # Cap memory to avoid memory waste and perf issues on large nodes if (object_store_memory > ray_constants.DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES): - logger.warning( + logger.debug( "Warning: Capping object memory store to {}GB. ".format( ray_constants.DEFAULT_OBJECT_STORE_MAX_MEMORY_BYTES // 1e9) + diff --git a/python/ray/scripts/scripts.py b/python/ray/scripts/scripts.py index f6365df51..0d8754fcd 100644 --- a/python/ray/scripts/scripts.py +++ b/python/ray/scripts/scripts.py @@ -758,6 +758,12 @@ done subprocess.call(COMMAND, shell=True) +@cli.command() +def microbenchmark(): + from ray.ray_perf import main + main() + + @cli.command() @click.option( "--redis-address", @@ -791,6 +797,7 @@ cli.add_command(teardown, name="down") cli.add_command(kill_random_node) cli.add_command(get_head_ip, name="get_head_ip") cli.add_command(get_worker_ips) +cli.add_command(microbenchmark) cli.add_command(stack) cli.add_command(timeline) cli.add_command(project_cli)