diff --git a/ci/long_running_tests/.gitignore b/ci/long_running_tests/.gitignore new file mode 100644 index 000000000..573e4c209 --- /dev/null +++ b/ci/long_running_tests/.gitignore @@ -0,0 +1 @@ +config_temporary.yaml diff --git a/ci/long_running_tests/config.yaml b/ci/long_running_tests/config.yaml index e623d4ea6..fd6722951 100644 --- a/ci/long_running_tests/config.yaml +++ b/ci/long_running_tests/config.yaml @@ -14,23 +14,19 @@ auth: head_node: InstanceType: m5.xlarge - ImageId: ami-0def3275 # Default Ubuntu 16.04 AMI. - - # Set primary volume to 25 GiB + ImageId: ami-0888a3b5189309429 # DLAMI 7/1/19 BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: - VolumeSize: 50 + VolumeSize: 150 worker_nodes: InstanceType: m5.large - ImageId: ami-0def3275 # Default Ubuntu 16.04 AMI. - - # Set primary volume to 25 GiB + ImageId: ami-0888a3b5189309429 # DLAMI 7/1/19 BlockDeviceMappings: - DeviceName: /dev/sda1 Ebs: - VolumeSize: 50 + VolumeSize: 150 # Run workers on spot by default. Comment this out to use on-demand. InstanceMarketOptions: @@ -38,24 +34,15 @@ worker_nodes: # List of shell commands to run to set up nodes. setup_commands: - - sudo apt-get update - - sudo apt-get install -y build-essential curl unzip - # Install Anaconda. - - wget https://repo.continuum.io/archive/Anaconda3-5.0.1-Linux-x86_64.sh || true - - bash Anaconda3-5.0.1-Linux-x86_64.sh -b -p $HOME/anaconda3 || true - - echo 'export PATH="$HOME/anaconda3/bin:$PATH"' >> ~/.bashrc - - echo 'termcapinfo xterm* ti@:te@' >> ~/.screenrc - # Some Python dependencies. - - pip install boto3==1.4.8 cython==0.29.0 - # Uncomment the following if you wish to install Ray instead. + # Install nightly Ray wheels. + - source activate tensorflow_p36 && pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/<<>>/<<>>/ray-<<>>-cp36-cp36m-manylinux1_x86_64.whl + - source activate tensorflow_p36 && pip install ray[rllib] ray[debug] gym[atari] + # Uncomment the following if you wish to build Ray instead. + # - sudo apt-get update + # - sudo apt-get install -y build-essential curl unzip # - git clone https://github.com/ray-project/ray || true # - ray/ci/travis/install-bazel.sh - # - cd ray/python; git checkout master; git pull; pip install -e . --verbose - # Install nightly Ray wheels. - - wget https://s3-us-west-2.amazonaws.com/ray-wheels/<<>>/<<>>/ray-<<>>-cp36-cp36m-manylinux1_x86_64.whl - - pip install ray-<<>>-cp36-cp36m-manylinux1_x86_64.whl[rllib,debug] - - pip install tensorflow - - pip install -U dask # fix error importing lz4 + # - cd ray/python; git checkout master; git pull; source activate tensorflow_p36 && pip install -e . --verbose # Custom commands that will be run on the head node after common setup. head_setup_commands: [] diff --git a/ci/long_running_tests/start_workloads.sh b/ci/long_running_tests/start_workloads.sh index 1dda7f6d6..fde3db30f 100755 --- a/ci/long_running_tests/start_workloads.sh +++ b/ci/long_running_tests/start_workloads.sh @@ -57,7 +57,7 @@ for workload_file in "$ROOT_DIR"/workloads/*; do # Clean up previous runs if relevant. ray exec $CLUSTER_CONFIG --cluster-name="$workload_name" "ray stop; rm -r /tmp/ray; tmux kill-server | true" # Start the workload. - ray exec $CLUSTER_CONFIG --cluster-name="$workload_name" "python $file_name" --tmux + ray exec $CLUSTER_CONFIG --cluster-name="$workload_name" "source activate tensorflow_p36 && python $file_name" --tmux ) & done # Wait for child processes to finish. @@ -70,17 +70,6 @@ popd echo "" echo "" -echo "To kill the instances, use the following commands." -echo "" -for workload_file in "$ROOT_DIR"/workloads/*; do - file_name=$(basename -- "$workload_file") - workload_name="${file_name%.*}" - echo " ray down -y $ROOT_DIR/$CLUSTER_CONFIG --cluster-name=$workload_name" -done - -echo "" -echo "" - echo "Use the following commands to attach to the relevant drivers." echo "" for workload_file in "$ROOT_DIR"/workloads/*; do diff --git a/python/ray/autoscaler/updater.py b/python/ray/autoscaler/updater.py index d42bf041a..5c344ad31 100644 --- a/python/ray/autoscaler/updater.py +++ b/python/ray/autoscaler/updater.py @@ -226,15 +226,13 @@ class NodeUpdater(object): m = "{}: Initialization commands completed".format(self.node_id) with LogTimer("NodeUpdater: {}".format(m)): - with open("/dev/null", "w") as redirect: - for cmd in self.initialization_commands: - self.ssh_cmd(cmd, redirect=redirect) + for cmd in self.initialization_commands: + self.ssh_cmd(cmd) m = "{}: Setup commands completed".format(self.node_id) with LogTimer("NodeUpdater: {}".format(m)): - with open("/dev/null", "w") as redirect: - for cmd in self.setup_commands: - self.ssh_cmd(cmd, redirect=redirect) + for cmd in self.setup_commands: + self.ssh_cmd(cmd) def rsync_up(self, source, target, redirect=None, check_error=True): logger.info("NodeUpdater: "