diff --git a/ci/long_running_tests/README.rst b/ci/long_running_tests/README.rst index 1c1389447..171498eef 100644 --- a/ci/long_running_tests/README.rst +++ b/ci/long_running_tests/README.rst @@ -6,8 +6,8 @@ forever until they fail. To set up the project you need to run .. code-block:: bash - pip install any - any project create + pip install anyscale + anyscale project create Running the Workloads @@ -17,21 +17,21 @@ You can start all the workloads with: .. code-block:: bash - any session start -y run --workload="*" --wheel=https://s3-us-west-2.amazonaws.com/ray-wheels/releases/0.7.5/6da7eff4b20340f92d3fe1160df35caa68922a97/ray-0.7.5-cp36-cp36m-manylinux1_x86_64.whl + anyscale session start -y run --workload="*" --wheel=https://s3-us-west-2.amazonaws.com/ray-wheels/releases/0.7.5/6da7eff4b20340f92d3fe1160df35caa68922a97/ray-0.7.5-cp36-cp36m-manylinux1_x86_64.whl This will start one EC2 instance per workload and will start the workloads running (one per instance). You can start a specific workload by specifying -its name as an argument ``--workload=`` instead of ``"*"``. A list of available options -is available via `any session start run --help`. +its name as an argument ``--workload=`` instead of ``"*"``. A list of +available options is available via `anyscale session start run --help`. Check Workload Statuses ----------------------- To check up on the workloads, run either -``any session --name="*" execute check-load``, which +``anyscale session --name="*" execute check-load``, which will print the load on each machine, or -``any session --name="*" execute show-output``, which +``anyscale session --name="*" execute show-output``, which will print the tail of the output for each workload. To debug workloads that have failed, you may find it useful to ssh to the @@ -43,7 +43,7 @@ Shut Down the Workloads ----------------------- The instances running the workloads can all be killed by running -``any session stop --name "*"``. +``anyscale session stop --name "*"``.
Adding a Workload ----------------- diff --git a/ci/long_running_tests/workloads/serve.py b/ci/long_running_tests/workloads/serve.py index a0963142d..68ad274cc 100644 --- a/ci/long_running_tests/workloads/serve.py +++ b/ci/long_running_tests/workloads/serve.py @@ -57,15 +57,17 @@ for _ in range(5): time.sleep(0.5) connections = int(config.num_replicas * config.max_batch_size * 0.75) -proc = subprocess.Popen( - [ - "./hey_linux_amd64", "-c", - str(connections), "-z", "360m", "http://127.0.0.1:8000/echo" - ], - stdout=PIPE, - stderr=PIPE) -print("started load testing") -proc.wait() -out, err = proc.communicate() -print(out.decode()) -print(err.decode()) + +while True: + proc = subprocess.Popen( + [ + "./hey_linux_amd64", "-c", + str(connections), "-z", "60m", "http://127.0.0.1:8000/echo" + ], + stdout=PIPE, + stderr=PIPE) + print("started load testing") + proc.wait() + out, err = proc.communicate() + print(out.decode()) + print(err.decode()) diff --git a/ci/microbenchmark/ray-project/cluster.yaml b/ci/microbenchmark/ray-project/cluster.yaml new file mode 100644 index 000000000..67b42efb0 --- /dev/null +++ b/ci/microbenchmark/ray-project/cluster.yaml @@ -0,0 +1,55 @@ +cluster_name: ray-release-microbenchmark +min_workers: 0 +max_workers: 0 +target_utilization_fraction: 0.8 +idle_timeout_minutes: 5 + +# Cloud-provider specific configuration. +provider: + type: aws + region: us-west-2 + availability_zone: us-west-2a +auth: + ssh_user: ubuntu + +head_node: + InstanceType: m4.16xlarge + ImageId: ami-06d51e91cea0dac8d # Ubuntu 18.04 + BlockDeviceMappings: + - DeviceName: /dev/sda1 + Ebs: + VolumeSize: 150 + +worker_nodes: + InstanceType: m5.large + ImageId: ami-06d51e91cea0dac8d # Ubuntu 18.04 + BlockDeviceMappings: + - DeviceName: /dev/sda1 + Ebs: + VolumeSize: 150 + + # Run workers on spot by default. Comment this out to use on-demand. + InstanceMarketOptions: + MarketType: spot + +# List of shell commands to run to set up nodes. 
+setup_commands: + # Configure tmux (mouse support) and screen (terminal scrollback). + - echo set-window-option -g mouse on > ~/.tmux.conf + - echo 'termcapinfo xterm* ti@:te@' > ~/.screenrc + +# Custom commands that will be run on the head node after common setup. +head_setup_commands: + # Install Anaconda. + - wget --quiet https://repo.continuum.io/archive/Anaconda3-5.0.1-Linux-x86_64.sh || true + - bash Anaconda3-5.0.1-Linux-x86_64.sh -b -p $HOME/anaconda3 || true + - echo 'export PATH="$HOME/anaconda3/bin:$PATH"' >> ~/.bashrc + +# Custom commands that will be run on worker nodes after common setup. +worker_setup_commands: [] + +# Command to start ray on the head node. You don't need to change this. +head_start_ray_commands: [] + +# Command to start ray on worker nodes. You don't need to change this. +worker_start_ray_commands: [] diff --git a/ci/microbenchmark/ray-project/project.yaml b/ci/microbenchmark/ray-project/project.yaml new file mode 100644 index 000000000..a76e3e3b9 --- /dev/null +++ b/ci/microbenchmark/ray-project/project.yaml @@ -0,0 +1,39 @@ +name: microbenchmark +description: "Ray's microbenchmark" + +cluster: + config: ray-project/cluster.yaml + +commands: + - name: run + help: "Start one microbenchmark trial." + command: | + rm ray-{{ray_version}}-cp36-cp36m-manylinux1_x86_64.whl || true + wget https://s3-us-west-2.amazonaws.com/ray-wheels/{{ray_branch}}/{{commit}}/ray-{{ray_version}}-cp36-cp36m-manylinux1_x86_64.whl + + pip uninstall -y -q ray + pip install -U ray-{{ray_version}}-cp36-cp36m-manylinux1_x86_64.whl + + OMP_NUM_THREADS=64 ray microbenchmark + params: + - name: ray_version # Ray version string. + default: "0.9.0.dev0" + + - name: commit # Ray commit SHA string. + default: "FILL ME IN" + + - name: ray_branch + default: "master" + config: + tmux: true + +# Pathnames for files and directories that should be saved +# in a snapshot but that should not be synced with a session. Pathnames can be relative to the project +# directory or absolute.
Generally, this should be files +# that were created by an active session, such as +# application checkpoints and logs. +output_files: [ + # For example, uncomment this to save the logs from the + # last ray job. + # "/tmp/ray/session_latest", +] \ No newline at end of file diff --git a/ci/regression_test/rllib_regresssion_tests/compact-regression-test.yaml b/ci/regression_test/rllib_regresssion_tests/compact-regression-test.yaml new file mode 100644 index 000000000..2c5210548 --- /dev/null +++ b/ci/regression_test/rllib_regresssion_tests/compact-regression-test.yaml @@ -0,0 +1,145 @@ +# This file runs on a single g3.16xl or p3.16xl node. It is suggested +# to run these in a DLAMI / tensorflow_p36 env. Note that RL runs are +# inherently high variance, so you'll have to check to see if the +# rewards reached seem reasonably in line with previous results. +# +# You can find the reference results here: +# https://github.com/ray-project/ray/tree/master/doc/dev/release_logs +atari-impala: + env: BreakoutNoFrameskip-v4 + run: IMPALA + num_samples: 4 + stop: + time_total_s: 3600 + config: + sample_batch_size: 50 + train_batch_size: 500 + num_workers: 10 + num_envs_per_worker: 5 + clip_rewards: True + lr_schedule: [ + [0, 0.0005], + [20000000, 0.000000000001], + ] + num_gpus: 1 +atari-ppo-tf: + env: BreakoutNoFrameskip-v4 + run: PPO + num_samples: 4 + stop: + time_total_s: 3600 + config: + lambda: 0.95 + kl_coeff: 0.5 + clip_rewards: True + clip_param: 0.1 + vf_clip_param: 10.0 + entropy_coeff: 0.01 + train_batch_size: 5000 + sample_batch_size: 100 + sgd_minibatch_size: 500 + num_sgd_iter: 10 + num_workers: 10 + num_envs_per_worker: 5 + batch_mode: truncate_episodes + observation_filter: NoFilter + vf_share_layers: true + num_gpus: 1 +atari-ppo-torch: + env: BreakoutNoFrameskip-v4 + run: PPO + num_samples: 4 + stop: + time_total_s: 3600 + config: + use_pytorch: true, + lambda: 0.95 + kl_coeff: 0.5 + clip_rewards: True + clip_param: 0.1 + vf_clip_param: 10.0 + 
entropy_coeff: 0.01 + train_batch_size: 5000 + sample_batch_size: 100 + sgd_minibatch_size: 500 + num_sgd_iter: 10 + num_workers: 10 + num_envs_per_worker: 5 + batch_mode: truncate_episodes + observation_filter: NoFilter + vf_share_layers: true + num_gpus: 1 +apex: + env: BreakoutNoFrameskip-v4 + run: APEX + num_samples: 4 + stop: + time_total_s: 3600 + config: + double_q: false + dueling: false + num_atoms: 1 + noisy: false + n_step: 3 + lr: .0001 + adam_epsilon: .00015 + hiddens: [512] + buffer_size: 1000000 + exploration_config: + epsilon_timesteps: 200000 + final_epsilon: 0.01 + prioritized_replay_alpha: 0.5 + final_prioritized_replay_beta: 1.0 + prioritized_replay_beta_annealing_timesteps: 2000000 + num_gpus: 1 + num_workers: 8 + num_envs_per_worker: 8 + sample_batch_size: 20 + train_batch_size: 512 + target_network_update_freq: 50000 + timesteps_per_iteration: 25000 +atari-a2c: + env: BreakoutNoFrameskip-v4 + run: A2C + num_samples: 4 + stop: + time_total_s: 3600 + config: + sample_batch_size: 20 + clip_rewards: True + num_workers: 5 + num_envs_per_worker: 5 + num_gpus: 1 + lr_schedule: [ + [0, 0.0007], + [20000000, 0.000000000001], + ] +atari-basic-dqn: + env: BreakoutNoFrameskip-v4 + run: DQN + num_samples: 4 + stop: + time_total_s: 3600 + config: + double_q: false + dueling: false + num_atoms: 1 + noisy: false + prioritized_replay: false + n_step: 1 + target_network_update_freq: 8000 + lr: .0000625 + adam_epsilon: .00015 + hiddens: [512] + learning_starts: 20000 + buffer_size: 1000000 + sample_batch_size: 4 + train_batch_size: 32 + exploration_config: + epsilon_timesteps: 200000 + final_epsilon: 0.01 + prioritized_replay_alpha: 0.5 + final_prioritized_replay_beta: 1.0 + prioritized_replay_beta_annealing_timesteps: 2000000 + num_gpus: 0.2 + timesteps_per_iteration: 10000 diff --git a/ci/regression_test/rllib_regresssion_tests/ray-project/cluster.yaml b/ci/regression_test/rllib_regresssion_tests/ray-project/cluster.yaml new file mode 100644 index 
000000000..a8ace8e07 --- /dev/null +++ b/ci/regression_test/rllib_regresssion_tests/ray-project/cluster.yaml @@ -0,0 +1,43 @@ +cluster_name: ray-rllib-regression-tests + +min_workers: 0 +max_workers: 0 + +# Cloud-provider specific configuration. +provider: + type: aws + region: us-west-2 + availability_zone: us-west-2a + cache_stopped_nodes: False + +# How Ray will authenticate with newly launched nodes. +auth: + ssh_user: ubuntu + +head_node: + InstanceType: p3.16xlarge + ImageId: ami-07728e9e2742b0662 # Deep Learning AMI (Ubuntu 16.04) + + # Set primary volume to 100 GiB + BlockDeviceMappings: + - DeviceName: /dev/sda1 + Ebs: + VolumeSize: 100 + + +# List of shell commands to run to set up nodes. +setup_commands: + - wget --quiet https://s3-us-west-2.amazonaws.com/ray-wheels/releases/{{ray_version}}/{{commit}}/ray-{{ray_version}}-cp36-cp36m-manylinux1_x86_64.whl + - source activate tensorflow_p36 && pip install -U ray-{{ray_version}}-cp36-cp36m-manylinux1_x86_64.whl + - source activate tensorflow_p36 && pip install ray[rllib] ray[debug] + - source activate tensorflow_p36 && pip install boto3==1.4.8 cython==0.29.0 + +# Command to start ray on the head node. You don't need to change this. +head_start_ray_commands: + - source activate tensorflow_p36 && ray stop + - ulimit -n 65536; source activate tensorflow_p36 && OMP_NUM_THREADS=1 ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml + +# Command to start ray on worker nodes. You don't need to change this.
+worker_start_ray_commands: + - source activate tensorflow_p36 && ray stop + - ulimit -n 65536; source activate tensorflow_p36 && OMP_NUM_THREADS=1 ray start --address=$RAY_HEAD_IP:6379 --object-manager-port=8076 diff --git a/ci/regression_test/rllib_regresssion_tests/ray-project/project.yaml b/ci/regression_test/rllib_regresssion_tests/ray-project/project.yaml new file mode 100644 index 000000000..3c5cd2401 --- /dev/null +++ b/ci/regression_test/rllib_regresssion_tests/ray-project/project.yaml @@ -0,0 +1,53 @@ +# This file is generated by `ray project create`. + +name: rllib_regression_tests + +# description: A short description of the project. +# The URL of the repo this project is part of. +# repo: ... + +cluster: + config: ray-project/cluster.yaml + params: + - name: ray_version # Ray version string. + default: "0.8.2" + + - name: commit # Ray commit SHA string. + default: "f5a1307a608fe5fdbdb04616b22c91f029af329a" + + +environment: + # dockerfile: The dockerfile to be built and ran the commands with. + # dockerimage: The docker image to be used to run the project in, e.g. ubuntu:18.04. + requirements: ray-project/requirements.txt + + shell: # Shell commands to be ran for environment setup. + - echo "Setting up the environment" + +commands: + - name: check-load + command: uptime + help: "Check load of the workload." + + - name: check-gpu + command: nvidia-smi + help: "Check load of the gpu." + + - name: show-output + command: tmux capture-pane -p + help: "Show tail of the workload output." + + - name: run-regression-tests + command: source activate tensorflow_p36 && rllib train -f compact-regression-test.yaml + help: "Run rllib regression tests" + +# Pathnames for files and directories that should be saved +# in a snapshot but that should not be synced with a session. Pathnames can be relative to the project +# directory or absolute. Generally, this should be files +# that were created by an active session, such as +# application checkpoints and logs.
+output_files: [ + # For example, uncomment this to save the logs from the + # last ray job. + # "/tmp/ray/session_latest", +] \ No newline at end of file diff --git a/ci/regression_test/rllib_regresssion_tests/ray-project/requirements.txt b/ci/regression_test/rllib_regresssion_tests/ray-project/requirements.txt new file mode 100644 index 000000000..69bde8cf2 --- /dev/null +++ b/ci/regression_test/rllib_regresssion_tests/ray-project/requirements.txt @@ -0,0 +1 @@ +ray[rllib] \ No newline at end of file diff --git a/ci/regression_test/rllib_stress_tests/atari_impala_xlarge.yaml b/ci/regression_test/rllib_stress_tests/atari_impala_xlarge.yaml new file mode 100644 index 000000000..8dd3dd4ab --- /dev/null +++ b/ci/regression_test/rllib_stress_tests/atari_impala_xlarge.yaml @@ -0,0 +1,24 @@ +# Taken from rllib/tuned_examples/atari_impala_large.yaml + +# Runs on a g3.16xl node with 5 m5.24xl workers +# Takes roughly 10 minutes. x10? +atari-impala: + env: + grid_search: + - BreakoutNoFrameskip-v4 + - BeamRiderNoFrameskip-v4 + - QbertNoFrameskip-v4 + - SpaceInvadersNoFrameskip-v4 + run: IMPALA + stop: + timesteps_total: 30000000 + config: + sample_batch_size: 50 + train_batch_size: 500 + num_workers: 128 + num_envs_per_worker: 5 + clip_rewards: True + lr_schedule: [ + [0, 0.0005], + [20000000, 0.000000000001], + ] \ No newline at end of file diff --git a/ci/stress_tests/application_cluster_template.yaml b/ci/regression_test/rllib_stress_tests/ray-project/cluster.yaml similarity index 77% rename from ci/stress_tests/application_cluster_template.yaml rename to ci/regression_test/rllib_stress_tests/ray-project/cluster.yaml index 833c52f87..73e3ef5b6 100644 --- a/ci/stress_tests/application_cluster_template.yaml +++ b/ci/regression_test/rllib_stress_tests/ray-project/cluster.yaml @@ -3,22 +3,15 @@ #################################################################### # An unique identifier for the head node and workers of this cluster. 
-cluster_name: <<>> +cluster_name: ray-rllib-stress-tests # The minimum number of workers nodes to launch in addition to the head # node. This number should be >= 0. -min_workers: <<>> +min_workers: 9 # The maximum number of workers nodes to launch in addition to the head # node. This takes precedence over min_workers. -max_workers: <<>> - -# This executes all commands on all nodes in the docker container, -# and opens all the necessary ports to support the Ray cluster. -# Empty string means disabled. -docker: - image: "" # e.g., tensorflow/tensorflow:1.5.0-py3 - container_name: "" # e.g. ray_docker +max_workers: 9 # The autoscaler will scale up the cluster to this target fraction of resource # usage. For example, if a cluster of 10 nodes is 100% busy and @@ -35,7 +28,7 @@ provider: type: aws region: us-west-2 availability_zone: us-west-2a - cache_stopped_nodes: false + cache_stopped_nodes: False # How Ray will authenticate with newly launched nodes. auth: @@ -50,10 +43,10 @@ auth: # For more documentation on available fields, see: # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances head_node: - InstanceType: <<>> + InstanceType: p3.16xlarge ImageId: ami-07728e9e2742b0662 # Deep Learning AMI (Ubuntu 16.04) - # You can provision additional disk space with a conf as follows + # Set primary volume to 25 GiB BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: @@ -66,12 +59,19 @@ head_node: # For more documentation on available fields, see: # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances worker_nodes: - InstanceType: <<>> + InstanceType: m4.16xlarge ImageId: ami-07728e9e2742b0662 # Deep Learning AMI (Ubuntu 16.04) + + # Set primary volume to 25 GiB + BlockDeviceMappings: + - DeviceName: /dev/sda1 + Ebs: + VolumeSize: 100 + # Run workers on spot by default. Comment this out to use on-demand. 
-# InstanceMarketOptions: -# MarketType: spot + # InstanceMarketOptions: + # MarketType: spot # Additional options can be found in the boto docs, e.g. # SpotOptions: # MaxPrice: MAX_HOURLY_PRICE @@ -87,17 +87,13 @@ file_mounts: { # List of shell commands to run to set up nodes. setup_commands: - - wget --quiet https://s3-us-west-2.amazonaws.com/ray-wheels/releases/<<>>/<<>>/ray-<<>>-<<>>-manylinux1_x86_64.whl - - source activate tensorflow_p36 && pip install -U ray-<<>>-<<>>-manylinux1_x86_64.whl + - wget --quiet https://s3-us-west-2.amazonaws.com/ray-wheels/releases/{{ray_version}}/{{commit}}/ray-{{ray_version}}-cp36-cp36m-manylinux1_x86_64.whl + - source activate tensorflow_p36 && pip install -U ray-{{ray_version}}-cp36-cp36m-manylinux1_x86_64.whl - source activate tensorflow_p36 && pip install ray[rllib] ray[debug] - # Consider uncommenting these if you also want to run apt-get commands during setup - # - sudo pkill -9 apt-get || true - # - sudo pkill -9 dpkg || true - # - sudo dpkg --configure -a + - source activate tensorflow_p36 && pip install boto3==1.4.8 cython==0.29.0 # Custom commands that will be run on the head node after common setup. -head_setup_commands: - - pip install boto3==1.4.8 # 1.4.8 adds InstanceMarketOptions +head_setup_commands: [] # Custom commands that will be run on worker nodes after common setup. worker_setup_commands: [] diff --git a/ci/regression_test/rllib_stress_tests/ray-project/project.yaml b/ci/regression_test/rllib_stress_tests/ray-project/project.yaml new file mode 100644 index 000000000..48f6685ff --- /dev/null +++ b/ci/regression_test/rllib_stress_tests/ray-project/project.yaml @@ -0,0 +1,49 @@ +# This file is generated by `ray project create`. + +name: rllib_stress_tests + +# description: A short description of the project. +# The URL of the repo this project is part of. +# repo: ... + +cluster: + config: ray-project/cluster.yaml + params: + - name: ray_version # Ray version string. 
+ default: "0.8.2" + + - name: commit # Ray commit SHA string. + default: "f5a1307a608fe5fdbdb04616b22c91f029af329a" + + +environment: + # dockerfile: The dockerfile to be built and ran the commands with. + # dockerimage: The docker image to be used to run the project in, e.g. ubuntu:18.04. + requirements: ray-project/requirements.txt + + shell: # Shell commands to be ran for environment setup. + - echo "Setting up the environment" + +commands: + - name: check-load + command: uptime + help: "Check load of the workload." + + - name: show-output + command: tmux capture-pane -p + help: "Show tail of the workload output." + + - name: run-impala + command: bash run.sh + help: "Run impala stress test" + +# Pathnames for files and directories that should be saved +# in a snapshot but that should not be synced with a session. Pathnames can be relative to the project +# directory or absolute. Generally, this should be files +# that were created by an active session, such as +# application checkpoints and logs. +output_files: [ + # For example, uncomment this to save the logs from the + # last ray job.
+ # "/tmp/ray/session_latest", +] \ No newline at end of file diff --git a/ci/regression_test/rllib_stress_tests/ray-project/requirements.txt b/ci/regression_test/rllib_stress_tests/ray-project/requirements.txt new file mode 100644 index 000000000..69bde8cf2 --- /dev/null +++ b/ci/regression_test/rllib_stress_tests/ray-project/requirements.txt @@ -0,0 +1 @@ +ray[rllib] \ No newline at end of file diff --git a/ci/regression_test/rllib_stress_tests/run.sh b/ci/regression_test/rllib_stress_tests/run.sh new file mode 100644 index 000000000..91229ca52 --- /dev/null +++ b/ci/regression_test/rllib_stress_tests/run.sh @@ -0,0 +1,6 @@ + +source activate tensorflow_p36 + +python3 wait_cluster.py + +rllib train -f atari_impala_xlarge.yaml --ray-address=auto --queue-trials \ No newline at end of file diff --git a/ci/regression_test/rllib_stress_tests/wait_cluster.py b/ci/regression_test/rllib_stress_tests/wait_cluster.py new file mode 100644 index 000000000..e84485437 --- /dev/null +++ b/ci/regression_test/rllib_stress_tests/wait_cluster.py @@ -0,0 +1,10 @@ +import ray +import time + +ray.init(address="auto") + +curr_nodes = 0 +while not curr_nodes > 8: + print("Waiting for more nodes to come up: {}/{}".format(curr_nodes, 8)) + curr_nodes = len(ray.nodes()) + time.sleep(5) diff --git a/ci/stress_tests/ray-project/cluster.yaml b/ci/regression_test/stress_tests/ray-project/cluster.yaml similarity index 98% rename from ci/stress_tests/ray-project/cluster.yaml rename to ci/regression_test/stress_tests/ray-project/cluster.yaml index 8819b906b..c1e811b4a 100644 --- a/ci/stress_tests/ray-project/cluster.yaml +++ b/ci/regression_test/stress_tests/ray-project/cluster.yaml @@ -98,7 +98,7 @@ setup_commands: # - ray/ci/travis/install-bazel.sh - pip install boto3==1.4.8 cython==0.29.0 # - cd ray/python; git checkout master; git pull; pip install -e . 
--verbose - - "pip install https://s3-us-west-2.amazonaws.com/ray-wheels/releases/{{ray_version}}/{{commit}}/ray-{{ray_version}}-cp36-cp36m-manylinux1_x86_64.whl" + - "pip install https://s3-us-west-2.amazonaws.com/ray-wheels/{{ray_branch}}/{{commit}}/ray-{{ray_version}}-cp36-cp36m-manylinux1_x86_64.whl" # Custom commands that will be run on the head node after common setup. head_setup_commands: [] diff --git a/ci/stress_tests/ray-project/project.yaml b/ci/regression_test/stress_tests/ray-project/project.yaml similarity index 87% rename from ci/stress_tests/ray-project/project.yaml rename to ci/regression_test/stress_tests/ray-project/project.yaml index 5f88ff230..2324eacdb 100644 --- a/ci/stress_tests/ray-project/project.yaml +++ b/ci/regression_test/stress_tests/ray-project/project.yaml @@ -6,11 +6,13 @@ cluster: config: ray-project/cluster.yaml params: - name: ray_version # Ray version string. - default: "0.8.1" + default: "0.8.2" - name: commit # Ray commit SHA string. - default: "38ec2e70524a277d5aea307f6c843065ff982da5" + default: "f5a1307a608fe5fdbdb04616b22c91f029af329a" + - name: ray_branch + default: "releases/0.8.2" commands: - name: test_many_tasks diff --git a/ci/stress_tests/ray-project/requirements.txt b/ci/regression_test/stress_tests/ray-project/requirements.txt similarity index 100% rename from ci/stress_tests/ray-project/requirements.txt rename to ci/regression_test/stress_tests/ray-project/requirements.txt diff --git a/ci/stress_tests/test_dead_actors.py b/ci/regression_test/stress_tests/test_dead_actors.py similarity index 100% rename from ci/stress_tests/test_dead_actors.py rename to ci/regression_test/stress_tests/test_dead_actors.py diff --git a/ci/stress_tests/test_many_tasks.py b/ci/regression_test/stress_tests/test_many_tasks.py similarity index 100% rename from ci/stress_tests/test_many_tasks.py rename to ci/regression_test/stress_tests/test_many_tasks.py diff --git a/ci/stress_tests/.gitignore b/ci/stress_tests/.gitignore deleted file 
mode 100644 index 3f2531080..000000000 --- a/ci/stress_tests/.gitignore +++ /dev/null @@ -1,4 +0,0 @@ -*.log -*temporary.yaml -rllib_impala_p36.yaml -sgd_p36.yaml diff --git a/ci/stress_tests/run_application_stress_tests.sh b/ci/stress_tests/run_application_stress_tests.sh deleted file mode 100755 index f4df1ef73..000000000 --- a/ci/stress_tests/run_application_stress_tests.sh +++ /dev/null @@ -1,158 +0,0 @@ -#!/usr/bin/env bash - -# This script should be run as follows: -# ./run_application_stress_tests.sh -# For example, might be 0.7.1 -# and might be bc3b6efdb6933d410563ee70f690855c05f25483. The commit -# should be the latest commit on the branch "releases/". - -# This script runs all of the application tests. -# Currently includes an IMPALA stress test and a SGD stress test on Python 3.6. -# All tests use a separate cluster, and each cluster -# will be destroyed upon test completion (or failure). - -# Note that if the environment variable DEBUG_MODE is detected, -# the clusters will not be automatically shut down after the test runs. - -# This script will exit with code 1 if the test did not run successfully. - -# Show explicitly which commands are currently running. This should only be AFTER -# the private key is placed. -set -x - -ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd) -RESULT_FILE=$ROOT_DIR/"results-$(date '+%Y-%m-%d_%H-%M-%S').log" - -touch "$RESULT_FILE" -echo "Logging to" "$RESULT_FILE" - -if [[ -z "$1" ]]; then - echo "ERROR: The first argument must be the Ray version string." - exit 1 -else - RAY_VERSION=$1 -fi - -if [[ -z "$2" ]]; then - echo "ERROR: The second argument must be the commit hash to test." - exit 1 -else - RAY_COMMIT=$2 -fi - -echo "Testing ray==$RAY_VERSION at commit $RAY_COMMIT." -echo "The wheels used will live under https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_COMMIT/" - -# This function identifies the right string for the Ray wheel. 
-_find_wheel_str(){ - local python_version=$1 - # echo "PYTHON_VERSION", $python_version - local wheel_str="" - if [ "$python_version" == "p27" ]; then - wheel_str="cp27-cp27mu" - else - wheel_str="cp36-cp36m" - fi - echo $wheel_str -} - -# Total time is roughly 25 minutes. -# Actual test runtime is roughly 10 minutes. -test_impala(){ - local PYTHON_VERSION=$1 - local WHEEL_STR=$(_find_wheel_str "$PYTHON_VERSION") - - pushd "$ROOT_DIR" - local TEST_NAME="rllib_impala_$PYTHON_VERSION" - local CLUSTER="$TEST_NAME.yaml" - echo "Creating IMPALA cluster YAML from template." - - cat application_cluster_template.yaml | - sed -e " - s/<<>>/$RAY_VERSION/g; - s/<<>>/$RAY_COMMIT/; - s/<<>>/$TEST_NAME/; - s/<<>>/p3.16xlarge/; - s/<<>>/m4.16xlarge/; - s/<<>>/9/; - s/<<>>/9/; - s/<<>>/$PYTHON_VERSION/; - s/<<>>/$WHEEL_STR/;" > "$CLUSTER" - - echo "Try running IMPALA stress test." - { - RLLIB_DIR=../../python/ray/rllib/ - ray --logging-level=DEBUG up -y "$CLUSTER" && - ray rsync_up "$CLUSTER" $RLLIB_DIR/tuned_examples/ tuned_examples/ && - # HACK: the test will deadlock if it scales up slowly, so we have to wait - # for the cluster to be fully launched first. This is because the first - # trial will occupy all the CPU slots if it can, preventing GPU access. - sleep 200 && - ray --logging-level=DEBUG exec "$CLUSTER" "source activate tensorflow_p36 && rllib train -f tuned_examples/atari-impala-large.yaml --ray-address='localhost:6379' --queue-trials" && - echo "PASS: IMPALA Test for" "$PYTHON_VERSION" >> "$RESULT_FILE" - } || echo "FAIL: IMPALA Test for" "$PYTHON_VERSION" >> "$RESULT_FILE" - - # Tear down cluster. - if [ "$DEBUG_MODE" = "" ]; then - ray down -y "$CLUSTER" - rm "$CLUSTER" - else - echo "Not tearing down cluster" "$CLUSTER" - fi - popd -} - -# Total runtime is about 20 minutes (if the AWS spot instance order is fulfilled). -# Actual test runtime is roughly 10 minutes. 
-test_sgd(){ - local PYTHON_VERSION=$1 - local WHEEL_STR=$(_find_wheel_str $PYTHON_VERSION) - - pushd "$ROOT_DIR" - local TEST_NAME="sgd_$PYTHON_VERSION" - local CLUSTER="$TEST_NAME.yaml" - echo "Creating SGD cluster YAML from template." - - cat application_cluster_template.yaml | - sed -e " - s/<<>>/$RAY_VERSION/g; - s/<<>>/$RAY_COMMIT/; - s/<<>>/$TEST_NAME/; - s/<<>>/p3.16xlarge/; - s/<<>>/p3.16xlarge/; - s/<<>>/3/; - s/<<>>/3/; - s/<<>>/$PYTHON_VERSION/; - s/<<>>/$WHEEL_STR/;" > "$CLUSTER" - - echo "Try running SGD stress test." - { - SGD_DIR=$ROOT_DIR/../../python/ray/util/sgd/ - ray --logging-level=DEBUG up -y "$CLUSTER" && - # TODO: fix submit so that args work - ray rsync_up "$CLUSTER" "$SGD_DIR/mnist_example.py" mnist_example.py && - sleep 1 && - ray --logging-level=DEBUG exec "$CLUSTER" " - python mnist_example.py --address=localhost:6379 --num-iters=2000 --num-workers=8 --devices-per-worker=2 --gpu" && - echo "PASS: SGD Test for" "$PYTHON_VERSION" >> "$RESULT_FILE" - } || echo "FAIL: SGD Test for" "$PYTHON_VERSION" >> "$RESULT_FILE" - - # Tear down cluster. - if [ "$DEBUG_MODE" = "" ]; then - ray down -y "$CLUSTER" - rm "$CLUSTER" - else - echo "Not tearing down cluster" "$CLUSTER" - fi - popd -} - -# RUN TESTS -for PYTHON_VERSION in "p36" -do - test_impala $PYTHON_VERSION -done - -cat "$RESULT_FILE" -cat "$RESULT_FILE" | grep FAIL > test.log -[ ! -s test.log ] || exit 1 diff --git a/ci/stress_tests/run_jenkins_stress_test.sh b/ci/stress_tests/run_jenkins_stress_test.sh deleted file mode 100755 index fd0bc0f9f..000000000 --- a/ci/stress_tests/run_jenkins_stress_test.sh +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/env bash - -# Cause the script to exit if a single command fails. -set -e - -# Show explicitly which commands are currently running. 
-set -x - -MEMORY_SIZE="20G" -SHM_SIZE="20G" - -docker build -q --no-cache -t ray-project/base-deps docker/base-deps - -# Add Ray source -git rev-parse HEAD > ./docker/stress_test/git-rev -git archive -o ./docker/stress_test/ray.tar $(git rev-parse HEAD) -DOCKER_SHA=$(docker build --no-cache -q -t ray-project/stress_test docker/stress_test) - -echo "Using Docker image" $DOCKER_SHA -docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 \ - -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e RAY_AWS_SSH_KEY \ - $DOCKER_SHA \ - bash /ray/ci/stress_tests/run_stress_tests.sh - -# docker run --rm --shm-size=${SHM_SIZE} --memory=${MEMORY_SIZE} --memory-swap=-1 \ -# -e AWS_ACCESS_KEY_ID -e AWS_SECRET_ACCESS_KEY -e RAY_AWS_SSH_KEY \ -# $DOCKER_SHA \ -# bash /ray/ci/stress_tests/run_application_stress_tests.sh diff --git a/ci/stress_tests/run_stress_tests.sh b/ci/stress_tests/run_stress_tests.sh deleted file mode 100755 index f92e8c592..000000000 --- a/ci/stress_tests/run_stress_tests.sh +++ /dev/null @@ -1,61 +0,0 @@ -#!/usr/bin/env bash - -# Show explicitly which commands are currently running. -set -x - -ROOT_DIR=$(cd "$(dirname "${BASH_SOURCE:-$0}")"; pwd) -RESULT_FILE=$ROOT_DIR/results-$(date '+%Y-%m-%d_%H-%M-%S').log - -touch "$RESULT_FILE" -echo "Logging to" "$RESULT_FILE" - -if [[ -z "$1" ]]; then - echo "ERROR: The first argument must be the Ray version string." - exit 1 -else - RAY_VERSION=$1 -fi - -if [[ -z "$2" ]]; then - echo "ERROR: The second argument must be the commit hash to test." - exit 1 -else - RAY_COMMIT=$2 -fi - -echo "Testing ray==$RAY_VERSION at commit $RAY_COMMIT." -echo "The wheels used will live under https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_COMMIT/" - -run_test(){ - local test_name=$1 - - local CLUSTER="stress_testing_config_temporary.yaml" - - cat stress_testing_config.yaml | - sed -e " - s/<<>>/$RAY_VERSION/g; - s/<<>>/$RAY_COMMIT/;" > "$CLUSTER" - - echo "Try running $test_name." 
- { - ray up -y $CLUSTER --cluster-name "$test_name" && - sleep 1 && - ray --logging-level=DEBUG submit "$CLUSTER" --cluster-name "$test_name" "$test_name.py" - } || echo "FAIL: $test_name" >> "$RESULT_FILE" - - # Tear down cluster. - if [ "$DEBUG_MODE" = "" ]; then - ray down -y $CLUSTER --cluster-name "$test_name" - rm "$CLUSTER" - else - echo "Not tearing down cluster" "$CLUSTER" - fi -} - -pushd "$ROOT_DIR" - run_test test_many_tasks - run_test test_dead_actors -popd - -cat "$RESULT_FILE" -[ ! -s "$RESULT_FILE" ] || exit 1 diff --git a/ci/stress_tests/stress_testing_config.yaml b/ci/stress_tests/stress_testing_config.yaml deleted file mode 100644 index e3b6e1134..000000000 --- a/ci/stress_tests/stress_testing_config.yaml +++ /dev/null @@ -1,117 +0,0 @@ -#################################################################### -# All nodes in this cluster will auto-terminate in 1 hour -#################################################################### - -# An unique identifier for the head node and workers of this cluster. -cluster_name: stress-testing - -# The minimum number of workers nodes to launch in addition to the head -# node. This number should be >= 0. -min_workers: 105 - -# The maximum number of workers nodes to launch in addition to the head -# node. This takes precedence over min_workers. -max_workers: 105 - -# The autoscaler will scale up the cluster to this target fraction of resource -# usage. For example, if a cluster of 10 nodes is 100% busy and -# target_utilization is 0.8, it would resize the cluster to 13. This fraction -# can be decreased to increase the aggressiveness of upscaling. -# This value must be less than 1.0 for scaling to happen. -target_utilization_fraction: 0.8 - -# If a node is idle for this many minutes, it will be removed. -idle_timeout_minutes: 5 - -# Cloud-provider specific configuration. 
-provider: - type: aws - region: us-west-2 - availability_zone: us-west-2a - cache_stopped_nodes: False - -# How Ray will authenticate with newly launched nodes. -auth: - ssh_user: ubuntu -# By default Ray creates a new private keypair, but you can also use your own. -# If you do so, make sure to also set "KeyName" in the head and worker node -# configurations below. -# ssh_private_key: /path/to/your/key.pem - -# Provider-specific config for the head node, e.g. instance type. By default -# Ray will auto-configure unspecified fields such as SubnetId and KeyName. -# For more documentation on available fields, see: -# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances -head_node: - InstanceType: m4.16xlarge - ImageId: ami-06d51e91cea0dac8d # Ubuntu 18.04 - - # Set primary volume to 25 GiB - BlockDeviceMappings: - - DeviceName: /dev/sda1 - Ebs: - VolumeSize: 100 - - # Additional options in the boto docs. - -# Provider-specific config for worker nodes, e.g. instance type. By default -# Ray will auto-configure unspecified fields such as SubnetId and KeyName. -# For more documentation on available fields, see: -# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances -worker_nodes: - InstanceType: m4.large - ImageId: ami-06d51e91cea0dac8d # Ubuntu 18.04 - - # Set primary volume to 25 GiB - BlockDeviceMappings: - - DeviceName: /dev/sda1 - Ebs: - VolumeSize: 100 - - # Run workers on spot by default. Comment this out to use on-demand. - InstanceMarketOptions: - MarketType: spot - # Additional options can be found in the boto docs, e.g. - # SpotOptions: - # MaxPrice: MAX_HOURLY_PRICE - - # Additional options in the boto docs. - -# Files or directories to copy to the head and worker nodes. The format is a -# dictionary from REMOTE_PATH: LOCAL_PATH, e.g. 
-file_mounts: { -# "/path1/on/remote/machine": "/path1/on/local/machine", -# "/path2/on/remote/machine": "/path2/on/local/machine", -} - -# List of shell commands to run to set up nodes. -setup_commands: - # Uncomment these if you want to build ray from source. - # - sudo apt-get -qq update - # - sudo apt-get install -y build-essential curl unzip - # Install Anaconda. - - wget --quiet https://repo.continuum.io/archive/Anaconda3-5.0.1-Linux-x86_64.sh || true - - bash Anaconda3-5.0.1-Linux-x86_64.sh -b -p $HOME/anaconda3 || true - - echo 'export PATH="$HOME/anaconda3/bin:$PATH"' >> ~/.bashrc - # # Build Ray. - # - git clone https://github.com/ray-project/ray || true - # - ray/ci/travis/install-bazel.sh - - pip install boto3==1.4.8 cython==0.29.0 - # - cd ray/python; git checkout master; git pull; pip install -e . --verbose - - pip install https://s3-us-west-2.amazonaws.com/ray-wheels/releases/<<>>/<<>>/ray-<<>>-cp36-cp36m-manylinux1_x86_64.whl - -# Custom commands that will be run on the head node after common setup. -head_setup_commands: [] - -# Custom commands that will be run on worker nodes after common setup. -worker_setup_commands: [] - -# Command to start ray on the head node. You don't need to change this. -head_start_ray_commands: - - ray stop - - ulimit -n 65536; ray start --head --num-redis-shards=5 --redis-port=6379 --autoscaling-config=~/ray_bootstrap_config.yaml - -# Command to start ray on worker nodes. You don't need to change this. -worker_start_ray_commands: - - ray stop - - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379 --num-gpus=100 diff --git a/doc/dev/RELEASE_PROCESS.rst b/doc/dev/RELEASE_PROCESS.rst index 98c2fa044..ced6d2d81 100644 --- a/doc/dev/RELEASE_PROCESS.rst +++ b/doc/dev/RELEASE_PROCESS.rst @@ -24,9 +24,10 @@ This document describes the process for creating new releases. For a new micro release (e.g., 0.7.1): No action is required. -4. **Testing:** Before releasing, the following sets of tests should be run. 
The results - of each of these tests for previous releases are checked in under ``doc/dev/release_tests``, - and should be compared against to identify any regressions. +4. **Testing:** Before releasing, the following sets of tests should be run. + The results of each of these tests for previous releases are checked in + under ``doc/dev/release_tests``, and should be compared against to identify + any regressions. 1. Long-running tests @@ -38,16 +39,16 @@ This document describes the process for creating new releases. These tests should run for at least 24 hours (printing new iterations and CPU load stable in the AWS console). - The last hundred lines or so printed by each test should be checked in under - ``doc/dev/release_logs/``. + 2. Multi-node regression tests - 2. Stress tests + Follow the same instructions as for the long-running stress tests. The large-scale distributed + regression tests identify potential performance regressions in a distributed environment. + The following tests should be run: - .. code-block:: bash - - ray/ci/stress_tests/run_stress_tests.sh - ray/ci/stress_tests/run_application_stress_tests.sh - rllib train -f rllib/tuned_examples/compact-regression-test.yaml + - ``ci/regression_test/rllib_regression-tests`` runs the compact regression test for RLlib. + - ``ci/regression_test/rllib_stress_tests`` runs a multi-node 8hr IMPALA trial. + - ``ci/regression_test/stress_tests`` contains two tests: ``many_tasks`` and ``dead_actors``. + Each of the tests runs on 105 spot instances. Make sure that these pass. For the RLlib regression tests, see the comment on the file for the pass criteria. For the rest, it will be obvious if they passed. @@ -59,12 +60,9 @@ This document describes the process for creating new releases. 3. Microbenchmarks - .. code-block:: bash - - ray microbenchmark - - Run `ray microbenchmark` on an `m4.16xl` instance running `Ubuntu 18.04` with `Python 3` to get the - latest microbenchmark numbers. + Run the ``ci/microbenchmark`` with the commit.
Under the hood, the session will + run `ray microbenchmark` on an `m4.16xl` instance running `Ubuntu 18.04` with `Python 3` + to get the latest microbenchmark numbers. The results should be checked in under ``doc/dev/release_logs/``. @@ -82,10 +80,11 @@ This document describes the process for creating new releases. changes/updates/bugfixes and their PR numbers. Once you have a draft, send it out to other Ray developers (especially those who contributed heavily during this release) for feedback. At the end of the release note, you should also - add a list of contributors. + add a list of contributors. Make sure Ray, Tune, RLlib, and Autoscaler are + capitalized correctly. Run ``doc/dev/get_contributors.py`` to generate the list of commits corresponding - to this release and the formatted list of contributors. + to this release and the formatted list of contributors. You will need to provide a GitHub personal access token (github.com -> settings -> developer settings -> personal access tokens). @@ -107,14 +106,16 @@ This document describes the process for creating new releases. export RAY_HASH=... # e.g., 618147f57fb40368448da3b2fb4fd213828fa12b export RAY_VERSION=...
# e.g., 0.7.0 - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp27-cp27mu-manylinux1_x86_64.whl + + # Linux Wheels pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp35-cp35m-manylinux1_x86_64.whl pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp36-cp36m-manylinux1_x86_64.whl pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp37-cp37m-manylinux1_x86_64.whl - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp27-cp27m-macosx_10_6_intel.whl - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp35-cp35m-macosx_10_6_intel.whl - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp36-cp36m-macosx_10_6_intel.whl - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp37-cp37m-macosx_10_6_intel.whl + + # Mac Wheels + pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp35-cp35m-macosx_10_13_intel.whl + pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp36-cp36m-macosx_10_13_intel.whl + pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp37-cp37m-macosx_10_13_intel.whl 8. **Upload to PyPI Test:** Upload the wheels to the PyPI test site using ``twine``. @@ -164,9 +165,14 @@ This document describes the process for creating new releases. pip install -U ray -10. **Improve the release process:** Find some way to improve the release +10. 
**Create a point release on readthedocs page:** In the `read the docs project page`_, + mark the release branch as "active" so there is a point release for the documentation. + Ask @richardliaw to add you if you don't have access. + +11. **Improve the release process:** Find some way to improve the release process so that whoever manages the release next will have an easier time. .. _`sample PR for bumping a minor release version`: https://github.com/ray-project/ray/pull/6303 .. _`sample commit for bumping the release branch version`: https://github.com/ray-project/ray/commit/a39325d818339970e51677708d5596f4b8f790ce .. _`GitHub release`: https://github.com/ray-project/ray/releases +.. _`read the docs project page`: https://readthedocs.org/projects/ray/ diff --git a/doc/dev/download_wheels.sh b/doc/dev/download_wheels.sh index 79b0cb134..ad1a3b456 100644 --- a/doc/dev/download_wheels.sh +++ b/doc/dev/download_wheels.sh @@ -1,6 +1,6 @@ wget https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp35-cp35m-manylinux1_x86_64.whl wget https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp36-cp36m-manylinux1_x86_64.whl wget https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp37-cp37m-manylinux1_x86_64.whl -wget https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp35-cp35m-macosx_10_6_intel.whl -wget https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp36-cp36m-macosx_10_6_intel.whl -wget https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp37-cp37m-macosx_10_6_intel.whl +wget https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp35-cp35m-macosx_10_13_intel.whl +wget
https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp36-cp36m-macosx_10_13_intel.whl +wget https://s3-us-west-2.amazonaws.com/ray-wheels/releases/$RAY_VERSION/$RAY_HASH/ray-$RAY_VERSION-cp37-cp37m-macosx_10_13_intel.whl diff --git a/doc/dev/release_logs/0.8.2/microbenchmark.txt b/doc/dev/release_logs/0.8.2/microbenchmark.txt new file mode 100644 index 000000000..674de2477 --- /dev/null +++ b/doc/dev/release_logs/0.8.2/microbenchmark.txt @@ -0,0 +1,18 @@ +# NOTE: Make sure to run this with OMP_NUM_THREADS=64, otherwise the put gigabytes per +# seconds will be reduced. Put latency was reduced due to extra ipc call to raylet +# for ref counting. + +single client get calls per second 11743.14 +- 2062.85 +single client put calls per second 3133.08 +- 89.81 +single client put gigabytes per second 10.33 +- 7.96 +multi client put calls per second 3590.16 +- 22.04 +multi client put gigabytes per second 23.38 +- 0.63 +single client tasks sync per second 1263.59 +- 63.16 +single client tasks async per second 13959.14 +- 393.16 +multi client tasks async per second 42285.81 +- 238.55 +1:1 actor calls sync per second 2159.21 +- 112.97 +1:1 actor calls async per second 7048.53 +- 63.8 +1:1 actor calls concurrent per second 6167.01 +- 75.67 +1:n actor calls async per second 12241.67 +- 62.13 +n:n actor calls async per second 41766.33 +- 672.14 +n:n actor calls with arg async per second 13134.22 +- 71.68 \ No newline at end of file diff --git a/doc/dev/release_logs/0.8.2/rllib_regression.txt b/doc/dev/release_logs/0.8.2/rllib_regression.txt new file mode 100644 index 000000000..2fa3a9651 --- /dev/null +++ b/doc/dev/release_logs/0.8.2/rllib_regression.txt @@ -0,0 +1,36 @@ +== Status == +Memory usage on this node: 43.4/480.3 GiB +Using FIFO scheduling algorithm. 
+Resources requested: 0/64 CPUs, 0.0/8 GPUs, 0.0/440.23 GiB heap, 0.0/12.84 GiB objects +Result logdir: /home/ubuntu/ray_results/apex +Result logdir: /home/ubuntu/ray_results/atari-a2c +Result logdir: /home/ubuntu/ray_results/atari-basic-dqn +Result logdir: /home/ubuntu/ray_results/atari-impala +Result logdir: /home/ubuntu/ray_results/atari-ppo-tf +Result logdir: /home/ubuntu/ray_results/atari-ppo-torch +Number of trials: 24 (24 TERMINATED) +Table truncated to 20 rows. 4 trials (4 TERMINATED) not shown. ++--------------------------------------+------------+-------+----------+------------------+---------+--------+ +| Trial name | status | loc | reward | total time (s) | ts | iter | +|--------------------------------------+------------+-------+----------+------------------+---------+--------| +| A2C_BreakoutNoFrameskip-v4_c8ad5a48 | TERMINATED | | 139.19 | 3606.77 | 3686000 | 352 | +| A2C_BreakoutNoFrameskip-v4_c8ad1c54 | TERMINATED | | 75.56 | 3601.57 | 2932000 | 349 | +| A2C_BreakoutNoFrameskip-v4_c8acd28a | TERMINATED | | 131.97 | 3603.39 | 2928000 | 349 | +| A2C_BreakoutNoFrameskip-v4_c8ac8d16 | TERMINATED | | 105.42 | 3601.03 | 2901500 | 349 | +| DQN_BreakoutNoFrameskip-v4_c8af8a02 | TERMINATED | | 15.81 | 3665.65 | 270000 | 27 | +| DQN_BreakoutNoFrameskip-v4_c8af079e | TERMINATED | | 11.32 | 3612.1 | 270000 | 27 | +| APEX_BreakoutNoFrameskip-v4_c8ac4694 | TERMINATED | | 50.56 | 3627.89 | 5786880 | 115 | +| DQN_BreakoutNoFrameskip-v4_c8ae61ae | TERMINATED | | 7.14 | 3620.61 | 270000 | 27 | +| DQN_BreakoutNoFrameskip-v4_c8adbcea | TERMINATED | | 11.24 | 3640.35 | 270000 | 27 | +| APEX_BreakoutNoFrameskip-v4_c8abef3c | TERMINATED | | 94.5 | 3625.19 | 5820800 | 115 | +| PPO_BreakoutNoFrameskip-v4_c8ab0572 | TERMINATED | | 25.26 | 3603.23 | 1335000 | 267 | +| PPO_BreakoutNoFrameskip-v4_c8aabf36 | TERMINATED | | 18.2 | 3603.36 | 1300000 | 260 | +| APEX_BreakoutNoFrameskip-v4_c8abaa86 | TERMINATED | | 90.98 | 3627.03 | 7350400 | 116 | +| 
PPO_BreakoutNoFrameskip-v4_c8aa6f5e | TERMINATED | | 17.01 | 3611.01 | 1555000 | 311 | +| PPO_BreakoutNoFrameskip-v4_c8aa27e2 | TERMINATED | | 22.41 | 3609.64 | 1545000 | 309 | +| PPO_BreakoutNoFrameskip-v4_c8a9e39a | TERMINATED | | 61.25 | 3602.17 | 4475000 | 895 | +| PPO_BreakoutNoFrameskip-v4_c8a97978 | TERMINATED | | 28.19 | 3601.33 | 4415000 | 883 | +| PPO_BreakoutNoFrameskip-v4_c8a904ca | TERMINATED | | 41.3 | 3600.42 | 4515000 | 903 | +| APEX_BreakoutNoFrameskip-v4_c8ab5108 | TERMINATED | | 62.46 | 3626.37 | 5091840 | 114 | +| PPO_BreakoutNoFrameskip-v4_c8a88004 | TERMINATED | | 60.44 | 3602.52 | 3380000 | 676 | ++--------------------------------------+------------+-------+----------+------------------+---------+--------+ diff --git a/doc/dev/release_logs/0.8.2/stress_tests/application_stress_test.txt b/doc/dev/release_logs/0.8.2/stress_tests/application_stress_test.txt new file mode 100644 index 000000000..62d1aa7c4 --- /dev/null +++ b/doc/dev/release_logs/0.8.2/stress_tests/application_stress_test.txt @@ -0,0 +1,14 @@ +== Status == +Memory usage on this node: 34.6/480.3 GiB +Using FIFO scheduling algorithm. 
+Resources requested: 0/640 CPUs, 0/8 GPUs, 0.0/2541.21 GiB heap, 0.0/128.42 GiB objects +Result logdir: /home/ubuntu/ray_results/atari-impala +Number of trials: 4 (4 TERMINATED) ++---------------------------------------------+------------+-------+-----------------------------+----------+------------------+----------+--------+ +| Trial name | status | loc | env | reward | total time (s) | ts | iter | +|---------------------------------------------+------------+-------+-----------------------------+----------+------------------+----------+--------| +| IMPALA_BreakoutNoFrameskip-v4_2565545c | TERMINATED | | BreakoutNoFrameskip-v4 | 451.07 | 22555.3 | 30039500 | 381 | +| IMPALA_BeamRiderNoFrameskip-v4_2565e804 | TERMINATED | | BeamRiderNoFrameskip-v4 | 3124.8 | 24121.2 | 30057000 | 408 | +| IMPALA_QbertNoFrameskip-v4_256671de | TERMINATED | | QbertNoFrameskip-v4 | 8388.25 | 25163.5 | 30080000 | 453 | +| IMPALA_SpaceInvadersNoFrameskip-v4_256725ac | TERMINATED | | SpaceInvadersNoFrameskip-v4 | 780.65 | 23148.1 | 30026500 | 384 | ++---------------------------------------------+------------+-------+-----------------------------+----------+------------------+----------+--------+ \ No newline at end of file diff --git a/doc/dev/release_logs/0.8.2/stress_tests/test_dead_actors.txt b/doc/dev/release_logs/0.8.2/stress_tests/test_dead_actors.txt new file mode 100644 index 000000000..cab48a589 --- /dev/null +++ b/doc/dev/release_logs/0.8.2/stress_tests/test_dead_actors.txt @@ -0,0 +1,4 @@ +Finished in: 98.49777579307556s +Average iteration time: 0.9849753308296204s +Max iteration time: 2.9459526538848877s +Min iteration time: 0.08075928688049316s \ No newline at end of file diff --git a/doc/dev/release_logs/0.8.2/stress_tests/test_many_tasks.txt b/doc/dev/release_logs/0.8.2/stress_tests/test_many_tasks.txt new file mode 100644 index 000000000..be7b57a14 --- /dev/null +++ b/doc/dev/release_logs/0.8.2/stress_tests/test_many_tasks.txt @@ -0,0 +1,15 @@ +Stage 0 results: + Total 
time: 22.579216480255127 +Stage 1 results: + Total time: 154.41431832313538 + Average iteration time: 15.441423058509827 + Max iteration time: 15.943994760513306 + Min iteration time: 15.029884099960327 +Stage 2 results: + Total time: 646.7662391662598 + Average iteration time: 129.35279755592347 + Max iteration time: 134.80017256736755 + Min iteration time: 121.44297170639038 +Stage 3 results: + Actor creation time: 0.0635519027709961 + Total time: 3464.0461547374725 \ No newline at end of file