mirror of
https://github.com/wassname/ray.git
synced 2026-07-01 18:04:09 +08:00
[autoscaler] docker run options (#3921)
Adds support for docker options, allowing for use of nvidia-docker. Closes #2657.
This commit is contained in:
committed by
Richard Liaw
parent
4347ab644e
commit
729d0b2825
@@ -90,6 +90,7 @@ CLUSTER_CONFIG_SCHEMA = {
|
||||
{
|
||||
"image": (str, OPTIONAL), # e.g. tensorflow/tensorflow:1.5.0-py3
|
||||
"container_name": (str, OPTIONAL), # e.g., ray_docker
|
||||
"run_options": (list, OPTIONAL),
|
||||
},
|
||||
OPTIONAL),
|
||||
|
||||
@@ -102,7 +103,12 @@ CLUSTER_CONFIG_SCHEMA = {
|
||||
# Map of remote paths to local paths, e.g. {"/tmp/data": "/my/local/data"}
|
||||
"file_mounts": (dict, OPTIONAL),
|
||||
|
||||
# List of common shell commands to run to initialize nodes.
|
||||
# List of commands that will be run before `setup_commands`. If docker is
|
||||
# enabled, these commands will run outside the container and before docker
|
||||
# is setup.
|
||||
"initialization_commands": (list, OPTIONAL),
|
||||
|
||||
# List of common shell commands to run to setup nodes.
|
||||
"setup_commands": (list, OPTIONAL),
|
||||
|
||||
# Commands that will be run on the head node after common setup.
|
||||
@@ -527,13 +533,16 @@ class StandardAutoscaler(object):
|
||||
"{}: No heartbeat in {}s, "
|
||||
"restarting Ray to recover...".format(node_id, delta))
|
||||
updater = NodeUpdaterThread(
|
||||
node_id,
|
||||
self.config["provider"],
|
||||
self.provider,
|
||||
self.config["auth"],
|
||||
self.config["cluster_name"], {},
|
||||
with_head_node_ip(self.config["worker_start_ray_commands"]),
|
||||
self.runtime_hash,
|
||||
node_id=node_id,
|
||||
provider_config=self.config["provider"],
|
||||
provider=self.provider,
|
||||
auth_config=self.config["auth"],
|
||||
cluster_name=self.config["cluster_name"],
|
||||
file_mounts={},
|
||||
initialization_commands=[],
|
||||
setup_commands=with_head_node_ip(
|
||||
self.config["worker_start_ray_commands"]),
|
||||
runtime_hash=self.runtime_hash,
|
||||
process_runner=self.process_runner,
|
||||
use_internal_ip=True)
|
||||
updater.start()
|
||||
@@ -561,14 +570,16 @@ class StandardAutoscaler(object):
|
||||
|
||||
def spawn_updater(self, node_id, init_commands):
|
||||
updater = NodeUpdaterThread(
|
||||
node_id,
|
||||
self.config["provider"],
|
||||
self.provider,
|
||||
self.config["auth"],
|
||||
self.config["cluster_name"],
|
||||
self.config["file_mounts"],
|
||||
with_head_node_ip(init_commands),
|
||||
self.runtime_hash,
|
||||
node_id=node_id,
|
||||
provider_config=self.config["provider"],
|
||||
provider=self.provider,
|
||||
auth_config=self.config["auth"],
|
||||
cluster_name=self.config["cluster_name"],
|
||||
file_mounts=self.config["file_mounts"],
|
||||
initialization_commands=with_head_node_ip(
|
||||
self.config["initialization_commands"]),
|
||||
setup_commands=with_head_node_ip(init_commands),
|
||||
runtime_hash=self.runtime_hash,
|
||||
process_runner=self.process_runner,
|
||||
use_internal_ip=True)
|
||||
updater.start()
|
||||
|
||||
@@ -20,6 +20,7 @@ initial_workers: 0
|
||||
docker:
|
||||
image: "" # e.g., tensorflow/tensorflow:1.5.0-py3
|
||||
container_name: "" # e.g. ray_docker
|
||||
run_options: [] # Extra options to pass into "docker run"
|
||||
|
||||
# The autoscaler will scale up the cluster to this target fraction of resource
|
||||
# usage. For example, if a cluster of 10 nodes is 100% busy and
|
||||
@@ -54,7 +55,7 @@ auth:
|
||||
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
|
||||
head_node:
|
||||
InstanceType: m5.large
|
||||
ImageId: ami-3b6bce43 # Amazon Deep Learning AMI (Ubuntu)
|
||||
ImageId: ami-0b294f219d14e6a82 # Deep Learning AMI (Ubuntu) Version 21.0
|
||||
|
||||
# You can provision additional disk space with a conf as follows
|
||||
BlockDeviceMappings:
|
||||
@@ -70,7 +71,7 @@ head_node:
|
||||
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
|
||||
worker_nodes:
|
||||
InstanceType: m5.large
|
||||
ImageId: ami-3b6bce43 # Amazon Deep Learning AMI (Ubuntu)
|
||||
ImageId: ami-0b294f219d14e6a82 # Deep Learning AMI (Ubuntu) Version 21.0
|
||||
|
||||
# Run workers on spot by default. Comment this out to use on-demand.
|
||||
InstanceMarketOptions:
|
||||
@@ -88,6 +89,11 @@ file_mounts: {
|
||||
# "/path2/on/remote/machine": "/path2/on/local/machine",
|
||||
}
|
||||
|
||||
# List of commands that will be run before `setup_commands`. If docker is
|
||||
# enabled, these commands will run outside the container and before docker
|
||||
# is setup.
|
||||
initialization_commands: []
|
||||
|
||||
# List of shell commands to run to set up nodes.
|
||||
setup_commands:
|
||||
# Note: if you're developing Ray, you probably want to create an AMI that
|
||||
|
||||
@@ -0,0 +1,114 @@
|
||||
# An unique identifier for the head node and workers of this cluster.
|
||||
cluster_name: gpu-docker
|
||||
|
||||
# The minimum number of workers nodes to launch in addition to the head
|
||||
# node. This number should be >= 0.
|
||||
min_workers: 0
|
||||
|
||||
# The maximum number of workers nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers.
|
||||
max_workers: 2
|
||||
|
||||
# The initial number of worker nodes to launch in addition to the head
|
||||
# node. When the cluster is first brought up (or when it is refreshed with a
|
||||
# subsequent `ray up`) this number of nodes will be started.
|
||||
initial_workers: 0
|
||||
|
||||
# This executes all commands on all nodes in the docker container,
|
||||
# and opens all the necessary ports to support the Ray cluster.
|
||||
# Empty string means disabled.
|
||||
docker:
|
||||
image: "tensorflow/tensorflow:1.12.0-gpu-py3"
|
||||
container_name: "ray-nvidia-docker-test" # e.g. ray_docker
|
||||
run_options:
|
||||
- --runtime=nvidia
|
||||
|
||||
# The autoscaler will scale up the cluster to this target fraction of resource
|
||||
# usage. For example, if a cluster of 10 nodes is 100% busy and
|
||||
# target_utilization is 0.8, it would resize the cluster to 13. This fraction
|
||||
# can be decreased to increase the aggressiveness of upscaling.
|
||||
# This value must be less than 1.0 for scaling to happen.
|
||||
target_utilization_fraction: 0.8
|
||||
|
||||
# If a node is idle for this many minutes, it will be removed.
|
||||
idle_timeout_minutes: 5
|
||||
|
||||
# Cloud-provider specific configuration.
|
||||
provider:
|
||||
type: aws
|
||||
region: us-west-2
|
||||
# Availability zone(s), comma-separated, that nodes may be launched in.
|
||||
# Nodes are currently spread between zones by a round-robin approach,
|
||||
# however this implementation detail should not be relied upon.
|
||||
availability_zone: us-west-2a,us-west-2b
|
||||
|
||||
# How Ray will authenticate with newly launched nodes.
|
||||
auth:
|
||||
ssh_user: ubuntu
|
||||
# By default Ray creates a new private keypair, but you can also use your own.
|
||||
# If you do so, make sure to also set "KeyName" in the head and worker node
|
||||
# configurations below.
|
||||
# ssh_private_key: /path/to/your/key.pem
|
||||
|
||||
# Provider-specific config for the head node, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
|
||||
# For more documentation on available fields, see:
|
||||
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
|
||||
head_node:
|
||||
InstanceType: p2.xlarge
|
||||
ImageId: ami-0b294f219d14e6a82 # Deep Learning AMI (Ubuntu) Version 21.0
|
||||
|
||||
# You can provision additional disk space with a conf as follows
|
||||
BlockDeviceMappings:
|
||||
- DeviceName: /dev/sda1
|
||||
Ebs:
|
||||
VolumeSize: 100
|
||||
|
||||
# Additional options in the boto docs.
|
||||
|
||||
# Provider-specific config for worker nodes, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
|
||||
# For more documentation on available fields, see:
|
||||
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
|
||||
worker_nodes:
|
||||
InstanceType: m5.large
|
||||
ImageId: ami-0b294f219d14e6a82 # Deep Learning AMI (Ubuntu) Version 21.0
|
||||
|
||||
# Run workers on spot by default. Comment this out to use on-demand.
|
||||
InstanceMarketOptions:
|
||||
MarketType: spot
|
||||
# Additional options can be found in the boto docs, e.g.
|
||||
# SpotOptions:
|
||||
# MaxPrice: MAX_HOURLY_PRICE
|
||||
|
||||
# Additional options in the boto docs.
|
||||
|
||||
# Files or directories to copy to the head and worker nodes. The format is a
|
||||
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
|
||||
file_mounts: {
|
||||
# "/path1/on/remote/machine": "/path1/on/local/machine",
|
||||
# "/path2/on/remote/machine": "/path2/on/local/machine",
|
||||
}
|
||||
|
||||
# List of shell commands to run to set up nodes.
|
||||
setup_commands:
|
||||
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.6.3-cp27-cp27mu-manylinux1_x86_64.whl
|
||||
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.6.3-cp35-cp35m-manylinux1_x86_64.whl
|
||||
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.6.3-cp36-cp36m-manylinux1_x86_64.whl
|
||||
|
||||
# Custom commands that will be run on the head node after common setup.
|
||||
head_setup_commands:
|
||||
- pip install boto3==1.4.8 # 1.4.8 adds InstanceMarketOptions
|
||||
|
||||
# Custom commands that will be run on worker nodes after common setup.
|
||||
worker_setup_commands: []
|
||||
|
||||
# Command to start ray on the head node. You don't need to change this.
|
||||
head_start_ray_commands:
|
||||
- ray stop
|
||||
- ulimit -n 65536; ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml
|
||||
|
||||
# Command to start ray on worker nodes. You don't need to change this.
|
||||
worker_start_ray_commands:
|
||||
- ray stop
|
||||
- ulimit -n 65536; ray start --redis-address=$RAY_HEAD_IP:6379 --object-manager-port=8076
|
||||
@@ -26,6 +26,7 @@ from ray.autoscaler.tags import TAG_RAY_NODE_TYPE, TAG_RAY_LAUNCH_CONFIG, \
|
||||
TAG_RAY_NODE_NAME
|
||||
from ray.autoscaler.updater import NodeUpdaterThread
|
||||
from ray.autoscaler.log_timer import LogTimer
|
||||
from ray.autoscaler.docker import with_docker_exec
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -130,9 +131,16 @@ def kill_node(config_file, yes, override_cluster_name):
|
||||
node = random.choice(nodes)
|
||||
logger.info("kill_node: Terminating worker {}".format(node))
|
||||
|
||||
updater = NodeUpdaterThread(node, config["provider"], provider,
|
||||
config["auth"], config["cluster_name"],
|
||||
config["file_mounts"], [], "")
|
||||
updater = NodeUpdaterThread(
|
||||
node_id=node,
|
||||
provider_config=config["provider"],
|
||||
provider=provider,
|
||||
auth_config=config["auth"],
|
||||
cluster_name=config["cluster_name"],
|
||||
file_mounts=config["file_mounts"],
|
||||
initialization_commands=[],
|
||||
setup_commands=[],
|
||||
runtime_hash="")
|
||||
|
||||
_exec(updater, "ray stop", False, False)
|
||||
|
||||
@@ -222,14 +230,15 @@ def get_or_create_head_node(config, config_file, no_restart, restart_only, yes,
|
||||
config["head_start_ray_commands"])
|
||||
|
||||
updater = NodeUpdaterThread(
|
||||
head_node,
|
||||
config["provider"],
|
||||
provider,
|
||||
config["auth"],
|
||||
config["cluster_name"],
|
||||
config["file_mounts"],
|
||||
init_commands,
|
||||
runtime_hash,
|
||||
node_id=head_node,
|
||||
provider_config=config["provider"],
|
||||
provider=provider,
|
||||
auth_config=config["auth"],
|
||||
cluster_name=config["cluster_name"],
|
||||
file_mounts=config["file_mounts"],
|
||||
initialization_commands=config["initialization_commands"],
|
||||
setup_commands=init_commands,
|
||||
runtime_hash=runtime_hash,
|
||||
)
|
||||
updater.start()
|
||||
updater.join()
|
||||
@@ -247,19 +256,16 @@ def get_or_create_head_node(config, config_file, no_restart, restart_only, yes,
|
||||
provider.external_ip(head_node)))
|
||||
|
||||
monitor_str = "tail -n 100 -f /tmp/ray/session_*/logs/monitor*"
|
||||
for s in init_commands:
|
||||
if ("ray start" in s and "docker exec" in s
|
||||
and "--autoscaling-config" in s):
|
||||
monitor_str = "docker exec {} /bin/sh -c {}".format(
|
||||
config["docker"]["container_name"], quote(monitor_str))
|
||||
use_docker = bool(config["docker"]["container_name"])
|
||||
if override_cluster_name:
|
||||
modifiers = " --cluster-name={}".format(
|
||||
quote(override_cluster_name))
|
||||
else:
|
||||
modifiers = ""
|
||||
print("To monitor auto-scaling activity, you can run:\n\n"
|
||||
" ray exec {} {}{}\n".format(config_file, quote(monitor_str),
|
||||
modifiers))
|
||||
" ray exec {} {}{}{}\n".format(
|
||||
config_file, "--docker " if use_docker else " ",
|
||||
quote(monitor_str), modifiers))
|
||||
print("To open a console on the cluster:\n\n"
|
||||
" ray attach {}{}\n".format(config_file, modifiers))
|
||||
print("To ssh manually to the cluster, run:\n\n"
|
||||
@@ -292,17 +298,18 @@ def attach_cluster(config_file, start, use_tmux, override_cluster_name, new):
|
||||
else:
|
||||
cmd = "screen -L -xRR"
|
||||
|
||||
exec_cluster(config_file, cmd, False, False, False, start,
|
||||
exec_cluster(config_file, cmd, False, False, False, False, start,
|
||||
override_cluster_name, None)
|
||||
|
||||
|
||||
def exec_cluster(config_file, cmd, screen, tmux, stop, start,
|
||||
def exec_cluster(config_file, cmd, docker, screen, tmux, stop, start,
|
||||
override_cluster_name, port_forward):
|
||||
"""Runs a command on the specified cluster.
|
||||
|
||||
Arguments:
|
||||
config_file: path to the cluster yaml
|
||||
cmd: command to run
|
||||
docker: whether to run command in docker container of config
|
||||
screen: whether to run in a screen
|
||||
tmux: whether to run in a tmux session
|
||||
stop: whether to stop the cluster after command run
|
||||
@@ -316,25 +323,41 @@ def exec_cluster(config_file, cmd, screen, tmux, stop, start,
|
||||
if override_cluster_name is not None:
|
||||
config["cluster_name"] = override_cluster_name
|
||||
config = _bootstrap_config(config)
|
||||
|
||||
head_node = _get_head_node(
|
||||
config, config_file, override_cluster_name, create_if_needed=start)
|
||||
|
||||
provider = get_node_provider(config["provider"], config["cluster_name"])
|
||||
try:
|
||||
updater = NodeUpdaterThread(
|
||||
head_node,
|
||||
config["provider"],
|
||||
provider,
|
||||
config["auth"],
|
||||
config["cluster_name"],
|
||||
config["file_mounts"],
|
||||
[],
|
||||
"",
|
||||
node_id=head_node,
|
||||
provider_config=config["provider"],
|
||||
provider=provider,
|
||||
auth_config=config["auth"],
|
||||
cluster_name=config["cluster_name"],
|
||||
file_mounts=config["file_mounts"],
|
||||
initialization_commands=[],
|
||||
setup_commands=[],
|
||||
runtime_hash="",
|
||||
)
|
||||
|
||||
def wrap_docker(command):
|
||||
container_name = config["docker"]["container_name"]
|
||||
if not container_name:
|
||||
raise ValueError("Docker container not specified in config.")
|
||||
return with_docker_exec(
|
||||
[command], container_name=container_name)[0]
|
||||
|
||||
cmd = wrap_docker(cmd) if docker else cmd
|
||||
|
||||
if stop:
|
||||
cmd += (
|
||||
"; ray stop; ray teardown ~/ray_bootstrap_config.yaml --yes "
|
||||
"--workers-only; sudo shutdown -h now")
|
||||
shutdown_cmd = (
|
||||
"ray stop; ray teardown ~/ray_bootstrap_config.yaml "
|
||||
"--yes --workers-only")
|
||||
if docker:
|
||||
shutdown_cmd = wrap_docker(shutdown_cmd)
|
||||
cmd += ("; {}; sudo shutdown -h now".format(shutdown_cmd))
|
||||
|
||||
_exec(
|
||||
updater,
|
||||
cmd,
|
||||
@@ -378,7 +401,6 @@ def _exec(updater, cmd, screen, tmux, expect_error=False, port_forward=None):
|
||||
cmd = " ".join(cmd)
|
||||
updater.ssh_cmd(
|
||||
cmd,
|
||||
verbose=False,
|
||||
allocate_tty=True,
|
||||
expect_error=expect_error,
|
||||
port_forward=port_forward)
|
||||
@@ -405,14 +427,15 @@ def rsync(config_file, source, target, override_cluster_name, down):
|
||||
provider = get_node_provider(config["provider"], config["cluster_name"])
|
||||
try:
|
||||
updater = NodeUpdaterThread(
|
||||
head_node,
|
||||
config["provider"],
|
||||
provider,
|
||||
config["auth"],
|
||||
config["cluster_name"],
|
||||
config["file_mounts"],
|
||||
[],
|
||||
"",
|
||||
node_id=head_node,
|
||||
provider_config=config["provider"],
|
||||
provider=provider,
|
||||
auth_config=config["auth"],
|
||||
cluster_name=config["cluster_name"],
|
||||
file_mounts=config["file_mounts"],
|
||||
initialization_commands=[],
|
||||
setup_commands=[],
|
||||
runtime_hash="",
|
||||
)
|
||||
if down:
|
||||
rsync = updater.rsync_down
|
||||
|
||||
@@ -17,6 +17,8 @@ def dockerize_if_needed(config):
|
||||
return config
|
||||
docker_image = config["docker"].get("image")
|
||||
cname = config["docker"].get("container_name")
|
||||
run_options = config["docker"].get("run_options", [])
|
||||
ssh_user = config["auth"]["ssh_user"]
|
||||
if not docker_image:
|
||||
if cname:
|
||||
logger.warning(
|
||||
@@ -26,10 +28,11 @@ def dockerize_if_needed(config):
|
||||
else:
|
||||
assert cname, "Must provide container name!"
|
||||
docker_mounts = {dst: dst for dst in config["file_mounts"]}
|
||||
|
||||
config["setup_commands"] = (
|
||||
docker_install_cmds() + docker_start_cmds(
|
||||
config["auth"]["ssh_user"], docker_image, docker_mounts, cname) +
|
||||
with_docker_exec(config["setup_commands"], container_name=cname))
|
||||
docker_start_cmds(ssh_user, docker_image, docker_mounts, cname,
|
||||
run_options) + with_docker_exec(
|
||||
config["setup_commands"], container_name=cname))
|
||||
|
||||
config["head_setup_commands"] = with_docker_exec(
|
||||
config["head_setup_commands"], container_name=cname)
|
||||
@@ -58,13 +61,6 @@ def with_docker_exec(cmds, container_name, env_vars=None):
|
||||
]
|
||||
|
||||
|
||||
def docker_install_cmds():
|
||||
return [
|
||||
aptwait_cmd() + " && sudo apt-get update",
|
||||
aptwait_cmd() + " && sudo apt-get install -y docker.io"
|
||||
]
|
||||
|
||||
|
||||
def aptwait_cmd():
|
||||
return ("while sudo fuser"
|
||||
" /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock"
|
||||
@@ -72,13 +68,8 @@ def aptwait_cmd():
|
||||
"do echo 'Waiting for release of dpkg/apt locks'; sleep 5; done")
|
||||
|
||||
|
||||
def docker_start_cmds(user, image, mount, cname):
|
||||
def docker_start_cmds(user, image, mount, cname, user_options):
|
||||
cmds = []
|
||||
cmds.append("sudo kill -SIGUSR1 $(pidof dockerd) || true")
|
||||
cmds.append("sudo service docker start")
|
||||
cmds.append("sudo usermod -a -G docker {}".format(user))
|
||||
cmds.append("docker rm -f {} || true".format(cname))
|
||||
cmds.append("docker pull {}".format(image))
|
||||
|
||||
# create flags
|
||||
# ports for the redis, object manager, and tune client
|
||||
@@ -94,16 +85,21 @@ def docker_start_cmds(user, image, mount, cname):
|
||||
env_flags = " ".join(
|
||||
["-e {name}={val}".format(name=k, val=v) for k, v in env_vars.items()])
|
||||
|
||||
user_options_str = " ".join(user_options)
|
||||
# docker run command
|
||||
docker_check = [
|
||||
"docker", "inspect", "-f", "'{{.State.Running}}'", cname, "||"
|
||||
]
|
||||
docker_run = [
|
||||
"docker", "run", "--rm", "--name {}".format(cname), "-d", "-it",
|
||||
port_flags, mount_flags, env_flags, "--net=host", image, "bash"
|
||||
port_flags, mount_flags, env_flags, user_options_str, "--net=host",
|
||||
image, "bash"
|
||||
]
|
||||
cmds.append(" ".join(docker_check + docker_run))
|
||||
docker_update = [
|
||||
" && ".join(("apt-get -y update", "apt-get -y upgrade",
|
||||
"apt-get install -y git wget cmake psmisc"))
|
||||
]
|
||||
cmds.append(" ".join(docker_run))
|
||||
docker_update = []
|
||||
docker_update.append("apt-get -y update")
|
||||
docker_update.append("apt-get -y upgrade")
|
||||
docker_update.append("apt-get install -y git wget cmake psmisc")
|
||||
cmds.extend(with_docker_exec(docker_update, container_name=cname))
|
||||
return cmds
|
||||
|
||||
|
||||
@@ -20,6 +20,7 @@ initial_workers: 0
|
||||
docker:
|
||||
image: "" # e.g., tensorflow/tensorflow:1.5.0-py3
|
||||
container_name: "" # e.g. ray_docker
|
||||
run_options: [] # Extra options to pass into "docker run"
|
||||
|
||||
|
||||
# The autoscaler will scale up the cluster to this target fraction of resource
|
||||
@@ -66,9 +67,9 @@ head_node:
|
||||
# Additional options can be found in in the compute docs at
|
||||
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
|
||||
|
||||
# If the network interface is specified as below in both head and worker
|
||||
# If the network interface is specified as below in both head and worker
|
||||
# nodes, the manual network config is used. Otherwise an existing subnet is
|
||||
# used. To use a shared subnet, ask the subnet owner to grant permission
|
||||
# used. To use a shared subnet, ask the subnet owner to grant permission
|
||||
# for 'compute.subnetworks.use' to the ray autoscaler account...
|
||||
# networkInterfaces:
|
||||
# - kind: compute#networkInterface
|
||||
@@ -100,6 +101,11 @@ file_mounts: {
|
||||
# "/path2/on/remote/machine": "/path2/on/local/machine",
|
||||
}
|
||||
|
||||
# List of commands that will be run before `setup_commands`. If docker is
|
||||
# enabled, these commands will run outside the container and before docker
|
||||
# is setup.
|
||||
initialization_commands: []
|
||||
|
||||
# List of shell commands to run to set up nodes.
|
||||
setup_commands:
|
||||
# Note: if you're developing Ray, you probably want to create an AMI that
|
||||
@@ -107,11 +113,6 @@ setup_commands:
|
||||
# below with a git checkout <your_sha> (and possibly a recompile).
|
||||
# - echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc
|
||||
|
||||
- >-
|
||||
sudo apt-get update
|
||||
&& sudo apt-get install -y
|
||||
psmisc
|
||||
|
||||
# Install Anaconda.
|
||||
- >-
|
||||
wget https://repo.continuum.io/archive/Anaconda3-5.0.1-Linux-x86_64.sh -O ~/anaconda3.sh
|
||||
|
||||
@@ -0,0 +1,159 @@
|
||||
# An unique identifier for the head node and workers of this cluster.
|
||||
cluster_name: gpu-docker
|
||||
|
||||
# The minimum number of workers nodes to launch in addition to the head
|
||||
# node. This number should be >= 0.
|
||||
min_workers: 0
|
||||
|
||||
# The maximum number of workers nodes to launch in addition to the head
|
||||
# node. This takes precedence over min_workers.
|
||||
max_workers: 2
|
||||
|
||||
# The initial number of worker nodes to launch in addition to the head
|
||||
# node. When the cluster is first brought up (or when it is refreshed with a
|
||||
# subsequent `ray up`) this number of nodes will be started.
|
||||
initial_workers: 0
|
||||
|
||||
# This executes all commands on all nodes in the docker container,
|
||||
# and opens all the necessary ports to support the Ray cluster.
|
||||
# Empty string means disabled.
|
||||
docker:
|
||||
image: "tensorflow/tensorflow:1.12.0-gpu-py3"
|
||||
container_name: "ray-nvidia-docker-test" # e.g. ray_docker
|
||||
run_options:
|
||||
- --runtime=nvidia
|
||||
|
||||
|
||||
# The autoscaler will scale up the cluster to this target fraction of resource
|
||||
# usage. For example, if a cluster of 10 nodes is 100% busy and
|
||||
# target_utilization is 0.8, it would resize the cluster to 13. This fraction
|
||||
# can be decreased to increase the aggressiveness of upscaling.
|
||||
# This value must be less than 1.0 for scaling to happen.
|
||||
target_utilization_fraction: 0.8
|
||||
|
||||
# If a node is idle for this many minutes, it will be removed.
|
||||
idle_timeout_minutes: 5
|
||||
|
||||
# Cloud-provider specific configuration.
|
||||
provider:
|
||||
type: gcp
|
||||
region: us-west1
|
||||
availability_zone: us-west1-b
|
||||
project_id: <project_id> # Globally unique project id
|
||||
|
||||
# How Ray will authenticate with newly launched nodes.
|
||||
auth:
|
||||
ssh_user: ubuntu
|
||||
# By default Ray creates a new private keypair, but you can also use your own.
|
||||
# If you do so, make sure to also set "KeyName" in the head and worker node
|
||||
# configurations below. This requires that you have added the key into the
|
||||
# project wide meta-data.
|
||||
# ssh_private_key: /path/to/your/key.pem
|
||||
|
||||
# Provider-specific config for the head node, e.g. instance type. By default
|
||||
# Ray will auto-configure unspecified fields such as subnets and ssh-keys.
|
||||
# For more documentation on available fields, see:
|
||||
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
|
||||
head_node:
|
||||
machineType: custom-6-16384
|
||||
disks:
|
||||
- boot: true
|
||||
autoDelete: true
|
||||
type: PERSISTENT
|
||||
initializeParams:
|
||||
diskSizeGb: 50
|
||||
# See https://cloud.google.com/compute/docs/images for more images
|
||||
sourceImage: projects/deeplearning-platform-release/global/images/family/tf-latest-gpu
|
||||
guestAccelerators:
|
||||
- acceleratorType: projects/<project_id>/zones/us-west1-b/acceleratorTypes/nvidia-tesla-k80
|
||||
acceleratorCount: 1
|
||||
metadata:
|
||||
items:
|
||||
- key: install-nvidia-driver
|
||||
value: "True"
|
||||
scheduling:
|
||||
- onHostMaintenance: TERMINATE
|
||||
|
||||
# Additional options can be found in in the compute docs at
|
||||
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
|
||||
|
||||
worker_nodes:
|
||||
machineType: n1-standard-2
|
||||
disks:
|
||||
- boot: true
|
||||
autoDelete: true
|
||||
type: PERSISTENT
|
||||
initializeParams:
|
||||
diskSizeGb: 50
|
||||
# See https://cloud.google.com/compute/docs/images for more images
|
||||
sourceImage: projects/deeplearning-platform-release/global/images/family/tf-latest-gpu
|
||||
guestAccelerators:
|
||||
- acceleratorType: projects/<project_id>/zones/us-west1-b/acceleratorTypes/nvidia-tesla-k80
|
||||
acceleratorCount: 1
|
||||
metadata:
|
||||
items:
|
||||
- key: install-nvidia-driver
|
||||
value: "True"
|
||||
# Run workers on preemtible instance by default.
|
||||
# Comment this out to use on-demand.
|
||||
scheduling:
|
||||
- preemptible: true
|
||||
- onHostMaintenance: TERMINATE
|
||||
|
||||
# Additional options can be found in in the compute docs at
|
||||
# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert
|
||||
|
||||
# Files or directories to copy to the head and worker nodes. The format is a
|
||||
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
|
||||
file_mounts: {
|
||||
# "/path1/on/remote/machine": "/path1/on/local/machine",
|
||||
# "/path2/on/remote/machine": "/path2/on/local/machine",
|
||||
}
|
||||
|
||||
initialization_commands:
|
||||
# Wait until nvidia drivers are installed
|
||||
- >-
|
||||
timeout 300 bash -c "
|
||||
command -v nvidia-smi && nvidia-smi
|
||||
until [ \$? -eq 0 ]; do
|
||||
command -v nvidia-smi && nvidia-smi
|
||||
done"
|
||||
|
||||
# List of shell commands to run to set up nodes.
|
||||
setup_commands:
|
||||
# Note: if you're developing Ray, you probably want to create an AMI that
|
||||
# has your Ray repo pre-cloned. Then, you can replace the pip installs
|
||||
# below with a git checkout <your_sha> (and possibly a recompile).
|
||||
# - echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc
|
||||
|
||||
# Install ray
|
||||
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.6.3-cp27-cp27mu-manylinux1_x86_64.whl
|
||||
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.6.3-cp35-cp35m-manylinux1_x86_64.whl
|
||||
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.6.3-cp36-cp36m-manylinux1_x86_64.whl
|
||||
|
||||
# Custom commands that will be run on the head node after common setup.
|
||||
head_setup_commands:
|
||||
- pip install google-api-python-client==1.7.8
|
||||
|
||||
# Custom commands that will be run on worker nodes after common setup.
|
||||
worker_setup_commands: []
|
||||
|
||||
# Command to start ray on the head node. You don't need to change this.
|
||||
head_start_ray_commands:
|
||||
- ray stop
|
||||
- >-
|
||||
ulimit -n 65536;
|
||||
ray start
|
||||
--head
|
||||
--redis-port=6379
|
||||
--object-manager-port=8076
|
||||
--autoscaling-config=~/ray_bootstrap_config.yaml
|
||||
|
||||
# Command to start ray on worker nodes. You don't need to change this.
|
||||
worker_start_ray_commands:
|
||||
- ray stop
|
||||
- >-
|
||||
ulimit -n 65536;
|
||||
ray start
|
||||
--redis-address=$RAY_HEAD_IP:6379
|
||||
--object-manager-port=8076
|
||||
@@ -49,7 +49,8 @@ class NodeUpdater(object):
|
||||
auth_config,
|
||||
cluster_name,
|
||||
file_mounts,
|
||||
setup_cmds,
|
||||
initialization_commands,
|
||||
setup_commands,
|
||||
runtime_hash,
|
||||
process_runner=subprocess,
|
||||
use_internal_ip=False):
|
||||
@@ -66,7 +67,8 @@ class NodeUpdater(object):
|
||||
remote: os.path.expanduser(local)
|
||||
for remote, local in file_mounts.items()
|
||||
}
|
||||
self.setup_cmds = setup_cmds
|
||||
self.initialization_commands = initialization_commands
|
||||
self.setup_commands = setup_commands
|
||||
self.runtime_hash = runtime_hash
|
||||
|
||||
def get_caller(self, check_error):
|
||||
@@ -215,13 +217,15 @@ class NodeUpdater(object):
|
||||
self.provider.set_node_tags(self.node_id,
|
||||
{TAG_RAY_NODE_STATUS: "setting-up"})
|
||||
|
||||
m = "{}: Initialization commands completed".format(self.node_id)
|
||||
with LogTimer("NodeUpdater: {}".format(m)):
|
||||
for cmd in self.initialization_commands:
|
||||
self.ssh_cmd(cmd, redirect=open("/dev/null", "w"))
|
||||
|
||||
m = "{}: Setup commands completed".format(self.node_id)
|
||||
with LogTimer("NodeUpdater: {}".format(m)):
|
||||
for cmd in self.setup_cmds:
|
||||
self.ssh_cmd(
|
||||
cmd,
|
||||
# verbose=True,
|
||||
redirect=open("/dev/null", "w"))
|
||||
for cmd in self.setup_commands:
|
||||
self.ssh_cmd(cmd, redirect=open("/dev/null", "w"))
|
||||
|
||||
def rsync_up(self, source, target, redirect=None, check_error=True):
|
||||
self.set_ssh_ip_if_required()
|
||||
@@ -253,7 +257,6 @@ class NodeUpdater(object):
|
||||
cmd,
|
||||
connect_timeout=120,
|
||||
redirect=None,
|
||||
verbose=False,
|
||||
allocate_tty=False,
|
||||
emulate_interactive=True,
|
||||
expect_error=False,
|
||||
@@ -261,9 +264,8 @@ class NodeUpdater(object):
|
||||
|
||||
self.set_ssh_ip_if_required()
|
||||
|
||||
if verbose:
|
||||
logger.info("NodeUpdater: "
|
||||
"Running {} on {}...".format(cmd, self.ssh_ip))
|
||||
logger.info("NodeUpdater: Running {} on {}...".format(
|
||||
cmd, self.ssh_ip))
|
||||
ssh = ["ssh"]
|
||||
if allocate_tty:
|
||||
ssh.append("-tt")
|
||||
|
||||
@@ -627,6 +627,11 @@ def submit(cluster_config_file, screen, tmux, stop, start, cluster_name,
|
||||
@cli.command()
|
||||
@click.argument("cluster_config_file", required=True, type=str)
|
||||
@click.argument("cmd", required=True, type=str)
|
||||
@click.option(
|
||||
"--docker",
|
||||
is_flag=True,
|
||||
default=False,
|
||||
help="Runs command in the docker container specified in cluster_config.")
|
||||
@click.option(
|
||||
"--stop",
|
||||
is_flag=True,
|
||||
@@ -652,9 +657,9 @@ def submit(cluster_config_file, screen, tmux, stop, start, cluster_name,
|
||||
help="Override the configured cluster name.")
|
||||
@click.option(
|
||||
"--port-forward", required=False, type=int, help="Port to forward.")
|
||||
def exec_cmd(cluster_config_file, cmd, screen, tmux, stop, start, cluster_name,
|
||||
port_forward):
|
||||
exec_cluster(cluster_config_file, cmd, screen, tmux, stop, start,
|
||||
def exec_cmd(cluster_config_file, cmd, docker, screen, tmux, stop, start,
|
||||
cluster_name, port_forward):
|
||||
exec_cluster(cluster_config_file, cmd, docker, screen, tmux, stop, start,
|
||||
cluster_name, port_forward)
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user