[autoscaler] docker run options (#3921)

Adds support for docker options, allowing for use of nvidia-docker.

Closes #2657.
This commit is contained in:
Kristian Hartikainen
2019-02-13 12:26:28 -08:00
committed by Richard Liaw
parent 4347ab644e
commit 729d0b2825
10 changed files with 419 additions and 101 deletions
+27 -16
View File
@@ -90,6 +90,7 @@ CLUSTER_CONFIG_SCHEMA = {
{
"image": (str, OPTIONAL), # e.g. tensorflow/tensorflow:1.5.0-py3
"container_name": (str, OPTIONAL), # e.g., ray_docker
"run_options": (list, OPTIONAL),
},
OPTIONAL),
@@ -102,7 +103,12 @@ CLUSTER_CONFIG_SCHEMA = {
# Map of remote paths to local paths, e.g. {"/tmp/data": "/my/local/data"}
"file_mounts": (dict, OPTIONAL),
# List of common shell commands to run to initialize nodes.
# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
"initialization_commands": (list, OPTIONAL),
# List of common shell commands to run to setup nodes.
"setup_commands": (list, OPTIONAL),
# Commands that will be run on the head node after common setup.
@@ -527,13 +533,16 @@ class StandardAutoscaler(object):
"{}: No heartbeat in {}s, "
"restarting Ray to recover...".format(node_id, delta))
updater = NodeUpdaterThread(
node_id,
self.config["provider"],
self.provider,
self.config["auth"],
self.config["cluster_name"], {},
with_head_node_ip(self.config["worker_start_ray_commands"]),
self.runtime_hash,
node_id=node_id,
provider_config=self.config["provider"],
provider=self.provider,
auth_config=self.config["auth"],
cluster_name=self.config["cluster_name"],
file_mounts={},
initialization_commands=[],
setup_commands=with_head_node_ip(
self.config["worker_start_ray_commands"]),
runtime_hash=self.runtime_hash,
process_runner=self.process_runner,
use_internal_ip=True)
updater.start()
@@ -561,14 +570,16 @@ class StandardAutoscaler(object):
def spawn_updater(self, node_id, init_commands):
updater = NodeUpdaterThread(
node_id,
self.config["provider"],
self.provider,
self.config["auth"],
self.config["cluster_name"],
self.config["file_mounts"],
with_head_node_ip(init_commands),
self.runtime_hash,
node_id=node_id,
provider_config=self.config["provider"],
provider=self.provider,
auth_config=self.config["auth"],
cluster_name=self.config["cluster_name"],
file_mounts=self.config["file_mounts"],
initialization_commands=with_head_node_ip(
self.config["initialization_commands"]),
setup_commands=with_head_node_ip(init_commands),
runtime_hash=self.runtime_hash,
process_runner=self.process_runner,
use_internal_ip=True)
updater.start()
+8 -2
View File
@@ -20,6 +20,7 @@ initial_workers: 0
docker:
image: "" # e.g., tensorflow/tensorflow:1.5.0-py3
container_name: "" # e.g. ray_docker
run_options: [] # Extra options to pass into "docker run"
# The autoscaler will scale up the cluster to this target fraction of resource
# usage. For example, if a cluster of 10 nodes is 100% busy and
@@ -54,7 +55,7 @@ auth:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
head_node:
InstanceType: m5.large
ImageId: ami-3b6bce43 # Amazon Deep Learning AMI (Ubuntu)
ImageId: ami-0b294f219d14e6a82 # Deep Learning AMI (Ubuntu) Version 21.0
# You can provision additional disk space with a conf as follows
BlockDeviceMappings:
@@ -70,7 +71,7 @@ head_node:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
worker_nodes:
InstanceType: m5.large
ImageId: ami-3b6bce43 # Amazon Deep Learning AMI (Ubuntu)
ImageId: ami-0b294f219d14e6a82 # Deep Learning AMI (Ubuntu) Version 21.0
# Run workers on spot by default. Comment this out to use on-demand.
InstanceMarketOptions:
@@ -88,6 +89,11 @@ file_mounts: {
# "/path2/on/remote/machine": "/path2/on/local/machine",
}
# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
initialization_commands: []
# List of shell commands to run to set up nodes.
setup_commands:
# Note: if you're developing Ray, you probably want to create an AMI that
@@ -0,0 +1,114 @@
# An unique identifier for the head node and workers of this cluster.
cluster_name: gpu-docker
# The minimum number of workers nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 2
# The initial number of worker nodes to launch in addition to the head
# node. When the cluster is first brought up (or when it is refreshed with a
# subsequent `ray up`) this number of nodes will be started.
initial_workers: 0
# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
image: "tensorflow/tensorflow:1.12.0-gpu-py3"
container_name: "ray-nvidia-docker-test" # e.g. ray_docker
run_options:
- --runtime=nvidia
# The autoscaler will scale up the cluster to this target fraction of resource
# usage. For example, if a cluster of 10 nodes is 100% busy and
# target_utilization is 0.8, it would resize the cluster to 13. This fraction
# can be decreased to increase the aggressiveness of upscaling.
# This value must be less than 1.0 for scaling to happen.
target_utilization_fraction: 0.8
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
# Cloud-provider specific configuration.
provider:
type: aws
region: us-west-2
# Availability zone(s), comma-separated, that nodes may be launched in.
# Nodes are currently spread between zones by a round-robin approach,
# however this implementation detail should not be relied upon.
availability_zone: us-west-2a,us-west-2b
# How Ray will authenticate with newly launched nodes.
auth:
ssh_user: ubuntu
# By default Ray creates a new private keypair, but you can also use your own.
# If you do so, make sure to also set "KeyName" in the head and worker node
# configurations below.
# ssh_private_key: /path/to/your/key.pem
# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
head_node:
InstanceType: p2.xlarge
ImageId: ami-0b294f219d14e6a82 # Deep Learning AMI (Ubuntu) Version 21.0
# You can provision additional disk space with a conf as follows
BlockDeviceMappings:
- DeviceName: /dev/sda1
Ebs:
VolumeSize: 100
# Additional options in the boto docs.
# Provider-specific config for worker nodes, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
worker_nodes:
InstanceType: m5.large
ImageId: ami-0b294f219d14e6a82 # Deep Learning AMI (Ubuntu) Version 21.0
# Run workers on spot by default. Comment this out to use on-demand.
InstanceMarketOptions:
MarketType: spot
# Additional options can be found in the boto docs, e.g.
# SpotOptions:
# MaxPrice: MAX_HOURLY_PRICE
# Additional options in the boto docs.
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
# "/path1/on/remote/machine": "/path1/on/local/machine",
# "/path2/on/remote/machine": "/path2/on/local/machine",
}
# List of shell commands to run to set up nodes.
setup_commands:
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.6.3-cp27-cp27mu-manylinux1_x86_64.whl
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.6.3-cp35-cp35m-manylinux1_x86_64.whl
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.6.3-cp36-cp36m-manylinux1_x86_64.whl
# Custom commands that will be run on the head node after common setup.
head_setup_commands:
- pip install boto3==1.4.8 # 1.4.8 adds InstanceMarketOptions
# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []
# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --redis-address=$RAY_HEAD_IP:6379 --object-manager-port=8076
+63 -40
View File
@@ -26,6 +26,7 @@ from ray.autoscaler.tags import TAG_RAY_NODE_TYPE, TAG_RAY_LAUNCH_CONFIG, \
TAG_RAY_NODE_NAME
from ray.autoscaler.updater import NodeUpdaterThread
from ray.autoscaler.log_timer import LogTimer
from ray.autoscaler.docker import with_docker_exec
logger = logging.getLogger(__name__)
@@ -130,9 +131,16 @@ def kill_node(config_file, yes, override_cluster_name):
node = random.choice(nodes)
logger.info("kill_node: Terminating worker {}".format(node))
updater = NodeUpdaterThread(node, config["provider"], provider,
config["auth"], config["cluster_name"],
config["file_mounts"], [], "")
updater = NodeUpdaterThread(
node_id=node,
provider_config=config["provider"],
provider=provider,
auth_config=config["auth"],
cluster_name=config["cluster_name"],
file_mounts=config["file_mounts"],
initialization_commands=[],
setup_commands=[],
runtime_hash="")
_exec(updater, "ray stop", False, False)
@@ -222,14 +230,15 @@ def get_or_create_head_node(config, config_file, no_restart, restart_only, yes,
config["head_start_ray_commands"])
updater = NodeUpdaterThread(
head_node,
config["provider"],
provider,
config["auth"],
config["cluster_name"],
config["file_mounts"],
init_commands,
runtime_hash,
node_id=head_node,
provider_config=config["provider"],
provider=provider,
auth_config=config["auth"],
cluster_name=config["cluster_name"],
file_mounts=config["file_mounts"],
initialization_commands=config["initialization_commands"],
setup_commands=init_commands,
runtime_hash=runtime_hash,
)
updater.start()
updater.join()
@@ -247,19 +256,16 @@ def get_or_create_head_node(config, config_file, no_restart, restart_only, yes,
provider.external_ip(head_node)))
monitor_str = "tail -n 100 -f /tmp/ray/session_*/logs/monitor*"
for s in init_commands:
if ("ray start" in s and "docker exec" in s
and "--autoscaling-config" in s):
monitor_str = "docker exec {} /bin/sh -c {}".format(
config["docker"]["container_name"], quote(monitor_str))
use_docker = bool(config["docker"]["container_name"])
if override_cluster_name:
modifiers = " --cluster-name={}".format(
quote(override_cluster_name))
else:
modifiers = ""
print("To monitor auto-scaling activity, you can run:\n\n"
" ray exec {} {}{}\n".format(config_file, quote(monitor_str),
modifiers))
" ray exec {} {}{}{}\n".format(
config_file, "--docker " if use_docker else " ",
quote(monitor_str), modifiers))
print("To open a console on the cluster:\n\n"
" ray attach {}{}\n".format(config_file, modifiers))
print("To ssh manually to the cluster, run:\n\n"
@@ -292,17 +298,18 @@ def attach_cluster(config_file, start, use_tmux, override_cluster_name, new):
else:
cmd = "screen -L -xRR"
exec_cluster(config_file, cmd, False, False, False, start,
exec_cluster(config_file, cmd, False, False, False, False, start,
override_cluster_name, None)
def exec_cluster(config_file, cmd, screen, tmux, stop, start,
def exec_cluster(config_file, cmd, docker, screen, tmux, stop, start,
override_cluster_name, port_forward):
"""Runs a command on the specified cluster.
Arguments:
config_file: path to the cluster yaml
cmd: command to run
docker: whether to run command in docker container of config
screen: whether to run in a screen
tmux: whether to run in a tmux session
stop: whether to stop the cluster after command run
@@ -316,25 +323,41 @@ def exec_cluster(config_file, cmd, screen, tmux, stop, start,
if override_cluster_name is not None:
config["cluster_name"] = override_cluster_name
config = _bootstrap_config(config)
head_node = _get_head_node(
config, config_file, override_cluster_name, create_if_needed=start)
provider = get_node_provider(config["provider"], config["cluster_name"])
try:
updater = NodeUpdaterThread(
head_node,
config["provider"],
provider,
config["auth"],
config["cluster_name"],
config["file_mounts"],
[],
"",
node_id=head_node,
provider_config=config["provider"],
provider=provider,
auth_config=config["auth"],
cluster_name=config["cluster_name"],
file_mounts=config["file_mounts"],
initialization_commands=[],
setup_commands=[],
runtime_hash="",
)
def wrap_docker(command):
container_name = config["docker"]["container_name"]
if not container_name:
raise ValueError("Docker container not specified in config.")
return with_docker_exec(
[command], container_name=container_name)[0]
cmd = wrap_docker(cmd) if docker else cmd
if stop:
cmd += (
"; ray stop; ray teardown ~/ray_bootstrap_config.yaml --yes "
"--workers-only; sudo shutdown -h now")
shutdown_cmd = (
"ray stop; ray teardown ~/ray_bootstrap_config.yaml "
"--yes --workers-only")
if docker:
shutdown_cmd = wrap_docker(shutdown_cmd)
cmd += ("; {}; sudo shutdown -h now".format(shutdown_cmd))
_exec(
updater,
cmd,
@@ -378,7 +401,6 @@ def _exec(updater, cmd, screen, tmux, expect_error=False, port_forward=None):
cmd = " ".join(cmd)
updater.ssh_cmd(
cmd,
verbose=False,
allocate_tty=True,
expect_error=expect_error,
port_forward=port_forward)
@@ -405,14 +427,15 @@ def rsync(config_file, source, target, override_cluster_name, down):
provider = get_node_provider(config["provider"], config["cluster_name"])
try:
updater = NodeUpdaterThread(
head_node,
config["provider"],
provider,
config["auth"],
config["cluster_name"],
config["file_mounts"],
[],
"",
node_id=head_node,
provider_config=config["provider"],
provider=provider,
auth_config=config["auth"],
cluster_name=config["cluster_name"],
file_mounts=config["file_mounts"],
initialization_commands=[],
setup_commands=[],
runtime_hash="",
)
if down:
rsync = updater.rsync_down
+18 -22
View File
@@ -17,6 +17,8 @@ def dockerize_if_needed(config):
return config
docker_image = config["docker"].get("image")
cname = config["docker"].get("container_name")
run_options = config["docker"].get("run_options", [])
ssh_user = config["auth"]["ssh_user"]
if not docker_image:
if cname:
logger.warning(
@@ -26,10 +28,11 @@ def dockerize_if_needed(config):
else:
assert cname, "Must provide container name!"
docker_mounts = {dst: dst for dst in config["file_mounts"]}
config["setup_commands"] = (
docker_install_cmds() + docker_start_cmds(
config["auth"]["ssh_user"], docker_image, docker_mounts, cname) +
with_docker_exec(config["setup_commands"], container_name=cname))
docker_start_cmds(ssh_user, docker_image, docker_mounts, cname,
run_options) + with_docker_exec(
config["setup_commands"], container_name=cname))
config["head_setup_commands"] = with_docker_exec(
config["head_setup_commands"], container_name=cname)
@@ -58,13 +61,6 @@ def with_docker_exec(cmds, container_name, env_vars=None):
]
def docker_install_cmds():
return [
aptwait_cmd() + " && sudo apt-get update",
aptwait_cmd() + " && sudo apt-get install -y docker.io"
]
def aptwait_cmd():
return ("while sudo fuser"
" /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock"
@@ -72,13 +68,8 @@ def aptwait_cmd():
"do echo 'Waiting for release of dpkg/apt locks'; sleep 5; done")
def docker_start_cmds(user, image, mount, cname):
def docker_start_cmds(user, image, mount, cname, user_options):
cmds = []
cmds.append("sudo kill -SIGUSR1 $(pidof dockerd) || true")
cmds.append("sudo service docker start")
cmds.append("sudo usermod -a -G docker {}".format(user))
cmds.append("docker rm -f {} || true".format(cname))
cmds.append("docker pull {}".format(image))
# create flags
# ports for the redis, object manager, and tune client
@@ -94,16 +85,21 @@ def docker_start_cmds(user, image, mount, cname):
env_flags = " ".join(
["-e {name}={val}".format(name=k, val=v) for k, v in env_vars.items()])
user_options_str = " ".join(user_options)
# docker run command
docker_check = [
"docker", "inspect", "-f", "'{{.State.Running}}'", cname, "||"
]
docker_run = [
"docker", "run", "--rm", "--name {}".format(cname), "-d", "-it",
port_flags, mount_flags, env_flags, "--net=host", image, "bash"
port_flags, mount_flags, env_flags, user_options_str, "--net=host",
image, "bash"
]
cmds.append(" ".join(docker_check + docker_run))
docker_update = [
" && ".join(("apt-get -y update", "apt-get -y upgrade",
"apt-get install -y git wget cmake psmisc"))
]
cmds.append(" ".join(docker_run))
docker_update = []
docker_update.append("apt-get -y update")
docker_update.append("apt-get -y upgrade")
docker_update.append("apt-get install -y git wget cmake psmisc")
cmds.extend(with_docker_exec(docker_update, container_name=cname))
return cmds
+8 -7
View File
@@ -20,6 +20,7 @@ initial_workers: 0
docker:
image: "" # e.g., tensorflow/tensorflow:1.5.0-py3
container_name: "" # e.g. ray_docker
run_options: [] # Extra options to pass into "docker run"
# The autoscaler will scale up the cluster to this target fraction of resource
@@ -66,9 +67,9 @@ head_node:
# Additional options can be found in in the compute docs at
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
# If the network interface is specified as below in both head and worker
# If the network interface is specified as below in both head and worker
# nodes, the manual network config is used. Otherwise an existing subnet is
# used. To use a shared subnet, ask the subnet owner to grant permission
# used. To use a shared subnet, ask the subnet owner to grant permission
# for 'compute.subnetworks.use' to the ray autoscaler account...
# networkInterfaces:
# - kind: compute#networkInterface
@@ -100,6 +101,11 @@ file_mounts: {
# "/path2/on/remote/machine": "/path2/on/local/machine",
}
# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is setup.
initialization_commands: []
# List of shell commands to run to set up nodes.
setup_commands:
# Note: if you're developing Ray, you probably want to create an AMI that
@@ -107,11 +113,6 @@ setup_commands:
# below with a git checkout <your_sha> (and possibly a recompile).
# - echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc
- >-
sudo apt-get update
&& sudo apt-get install -y
psmisc
# Install Anaconda.
- >-
wget https://repo.continuum.io/archive/Anaconda3-5.0.1-Linux-x86_64.sh -O ~/anaconda3.sh
@@ -0,0 +1,159 @@
# An unique identifier for the head node and workers of this cluster.
cluster_name: gpu-docker
# The minimum number of workers nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0
# The maximum number of workers nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 2
# The initial number of worker nodes to launch in addition to the head
# node. When the cluster is first brought up (or when it is refreshed with a
# subsequent `ray up`) this number of nodes will be started.
initial_workers: 0
# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
image: "tensorflow/tensorflow:1.12.0-gpu-py3"
container_name: "ray-nvidia-docker-test" # e.g. ray_docker
run_options:
- --runtime=nvidia
# The autoscaler will scale up the cluster to this target fraction of resource
# usage. For example, if a cluster of 10 nodes is 100% busy and
# target_utilization is 0.8, it would resize the cluster to 13. This fraction
# can be decreased to increase the aggressiveness of upscaling.
# This value must be less than 1.0 for scaling to happen.
target_utilization_fraction: 0.8
# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5
# Cloud-provider specific configuration.
provider:
type: gcp
region: us-west1
availability_zone: us-west1-b
project_id: <project_id> # Globally unique project id
# How Ray will authenticate with newly launched nodes.
auth:
ssh_user: ubuntu
# By default Ray creates a new private keypair, but you can also use your own.
# If you do so, make sure to also set "KeyName" in the head and worker node
# configurations below. This requires that you have added the key into the
# project wide meta-data.
# ssh_private_key: /path/to/your/key.pem
# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
# For more documentation on available fields, see:
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
head_node:
machineType: custom-6-16384
disks:
- boot: true
autoDelete: true
type: PERSISTENT
initializeParams:
diskSizeGb: 50
# See https://cloud.google.com/compute/docs/images for more images
sourceImage: projects/deeplearning-platform-release/global/images/family/tf-latest-gpu
guestAccelerators:
- acceleratorType: projects/<project_id>/zones/us-west1-b/acceleratorTypes/nvidia-tesla-k80
acceleratorCount: 1
metadata:
items:
- key: install-nvidia-driver
value: "True"
scheduling:
- onHostMaintenance: TERMINATE
# Additional options can be found in in the compute docs at
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
worker_nodes:
machineType: n1-standard-2
disks:
- boot: true
autoDelete: true
type: PERSISTENT
initializeParams:
diskSizeGb: 50
# See https://cloud.google.com/compute/docs/images for more images
sourceImage: projects/deeplearning-platform-release/global/images/family/tf-latest-gpu
guestAccelerators:
- acceleratorType: projects/<project_id>/zones/us-west1-b/acceleratorTypes/nvidia-tesla-k80
acceleratorCount: 1
metadata:
items:
- key: install-nvidia-driver
value: "True"
# Run workers on preemtible instance by default.
# Comment this out to use on-demand.
scheduling:
- preemptible: true
- onHostMaintenance: TERMINATE
# Additional options can be found in in the compute docs at
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
# "/path1/on/remote/machine": "/path1/on/local/machine",
# "/path2/on/remote/machine": "/path2/on/local/machine",
}
initialization_commands:
# Wait until nvidia drivers are installed
- >-
timeout 300 bash -c "
command -v nvidia-smi && nvidia-smi
until [ \$? -eq 0 ]; do
command -v nvidia-smi && nvidia-smi
done"
# List of shell commands to run to set up nodes.
setup_commands:
# Note: if you're developing Ray, you probably want to create an AMI that
# has your Ray repo pre-cloned. Then, you can replace the pip installs
# below with a git checkout <your_sha> (and possibly a recompile).
# - echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc
# Install ray
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.6.3-cp27-cp27mu-manylinux1_x86_64.whl
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.6.3-cp35-cp35m-manylinux1_x86_64.whl
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.6.3-cp36-cp36m-manylinux1_x86_64.whl
# Custom commands that will be run on the head node after common setup.
head_setup_commands:
- pip install google-api-python-client==1.7.8
# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []
# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
- ray stop
- >-
ulimit -n 65536;
ray start
--head
--redis-port=6379
--object-manager-port=8076
--autoscaling-config=~/ray_bootstrap_config.yaml
# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
- ray stop
- >-
ulimit -n 65536;
ray start
--redis-address=$RAY_HEAD_IP:6379
--object-manager-port=8076
+13 -11
View File
@@ -49,7 +49,8 @@ class NodeUpdater(object):
auth_config,
cluster_name,
file_mounts,
setup_cmds,
initialization_commands,
setup_commands,
runtime_hash,
process_runner=subprocess,
use_internal_ip=False):
@@ -66,7 +67,8 @@ class NodeUpdater(object):
remote: os.path.expanduser(local)
for remote, local in file_mounts.items()
}
self.setup_cmds = setup_cmds
self.initialization_commands = initialization_commands
self.setup_commands = setup_commands
self.runtime_hash = runtime_hash
def get_caller(self, check_error):
@@ -215,13 +217,15 @@ class NodeUpdater(object):
self.provider.set_node_tags(self.node_id,
{TAG_RAY_NODE_STATUS: "setting-up"})
m = "{}: Initialization commands completed".format(self.node_id)
with LogTimer("NodeUpdater: {}".format(m)):
for cmd in self.initialization_commands:
self.ssh_cmd(cmd, redirect=open("/dev/null", "w"))
m = "{}: Setup commands completed".format(self.node_id)
with LogTimer("NodeUpdater: {}".format(m)):
for cmd in self.setup_cmds:
self.ssh_cmd(
cmd,
# verbose=True,
redirect=open("/dev/null", "w"))
for cmd in self.setup_commands:
self.ssh_cmd(cmd, redirect=open("/dev/null", "w"))
def rsync_up(self, source, target, redirect=None, check_error=True):
self.set_ssh_ip_if_required()
@@ -253,7 +257,6 @@ class NodeUpdater(object):
cmd,
connect_timeout=120,
redirect=None,
verbose=False,
allocate_tty=False,
emulate_interactive=True,
expect_error=False,
@@ -261,9 +264,8 @@ class NodeUpdater(object):
self.set_ssh_ip_if_required()
if verbose:
logger.info("NodeUpdater: "
"Running {} on {}...".format(cmd, self.ssh_ip))
logger.info("NodeUpdater: Running {} on {}...".format(
cmd, self.ssh_ip))
ssh = ["ssh"]
if allocate_tty:
ssh.append("-tt")
+8 -3
View File
@@ -627,6 +627,11 @@ def submit(cluster_config_file, screen, tmux, stop, start, cluster_name,
@cli.command()
@click.argument("cluster_config_file", required=True, type=str)
@click.argument("cmd", required=True, type=str)
@click.option(
"--docker",
is_flag=True,
default=False,
help="Runs command in the docker container specified in cluster_config.")
@click.option(
"--stop",
is_flag=True,
@@ -652,9 +657,9 @@ def submit(cluster_config_file, screen, tmux, stop, start, cluster_name,
help="Override the configured cluster name.")
@click.option(
"--port-forward", required=False, type=int, help="Port to forward.")
def exec_cmd(cluster_config_file, cmd, screen, tmux, stop, start, cluster_name,
port_forward):
exec_cluster(cluster_config_file, cmd, screen, tmux, stop, start,
def exec_cmd(cluster_config_file, cmd, docker, screen, tmux, stop, start,
cluster_name, port_forward):
exec_cluster(cluster_config_file, cmd, docker, screen, tmux, stop, start,
cluster_name, port_forward)