mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 19:00:36 +08:00
Update cluster guide (#347)
* clarify cluster setup instructions * update multinode documentation, update cluster script, fix minor bug in worker.py * clarify cluster documentation and fix update_user_code
This commit is contained in:
committed by
Robert Nishihara
parent
de200ff912
commit
3ee0fd8f34
+28
-19
@@ -10,7 +10,6 @@ parser = argparse.ArgumentParser(description="Parse information about the cluste
|
||||
parser.add_argument("--nodes", type=str, required=True, help="Test file with node IP addresses, one line per address.")
|
||||
parser.add_argument("--key-file", type=str, required=True, help="Path to the file that contains the private key.")
|
||||
parser.add_argument("--username", type=str, required=True, help="User name for logging in.")
|
||||
parser.add_argument("--installation-directory", type=str, required=True, help="The directory in which to install Ray.")
|
||||
|
||||
class RayCluster(object):
|
||||
"""A class for setting up, starting, and stopping Ray on a cluster.
|
||||
@@ -130,7 +129,7 @@ class RayCluster(object):
|
||||
""".format(self.installation_directory, self.installation_directory)
|
||||
self._run_command_over_ssh_on_all_nodes_in_parallel(install_ray_command)
|
||||
|
||||
def start_ray(self, user_source_directory, num_workers_per_node=10):
|
||||
def start_ray(self, user_source_directory=None, num_workers_per_node=10):
|
||||
"""Start Ray on a cluster.
|
||||
|
||||
This method is used to start Ray on a cluster. It will ssh to the head node,
|
||||
@@ -139,13 +138,14 @@ class RayCluster(object):
|
||||
workers.
|
||||
|
||||
Args:
|
||||
user_source_directory (str): The path to the local directory containing the
|
||||
user's source code. Files and directories in this directory can be used
|
||||
as modules in remote functions.
|
||||
user_source_directory (Optional[str]): The path to the local directory
|
||||
containing the user's source code. If provided, files and directories in
|
||||
this directory can be used as modules in remote functions.
|
||||
num_workers_per_node (int): The number workers to start on each node.
|
||||
"""
|
||||
# First update the worker code on the nodes.
|
||||
remote_user_source_directory = self._update_user_code(user_source_directory)
|
||||
if user_source_directory is not None:
|
||||
remote_user_source_directory = self._update_user_code(user_source_directory)
|
||||
|
||||
scripts_directory = os.path.join(self.installation_directory, "ray/scripts")
|
||||
# Start the scheduler
|
||||
@@ -160,16 +160,18 @@ class RayCluster(object):
|
||||
# Start the workers on each node
|
||||
# The triple backslashes are used for two rounds of escaping, something like \\\" -> \" -> "
|
||||
start_workers_commands = []
|
||||
remote_user_source_directory_str = "\\\"{}\\\"".format(remote_user_source_directory) if user_source_directory is not None else "None"
|
||||
for i, node_ip_address in enumerate(self.node_ip_addresses):
|
||||
start_workers_command = """
|
||||
cd "{}";
|
||||
source ../setup-env.sh;
|
||||
python -c "import ray; ray.services.start_node(\\\"{}:10001\\\", \\\"{}\\\", {}, user_source_directory=\\\"{}\\\")" > start_workers.out 2> start_workers.err < /dev/null &
|
||||
""".format(scripts_directory, self.node_private_ip_addresses[0], self.node_private_ip_addresses[i], num_workers_per_node, remote_user_source_directory)
|
||||
""".format(scripts_directory, self.node_private_ip_addresses[0], self.node_private_ip_addresses[i], num_workers_per_node, remote_user_source_directory_str)
|
||||
start_workers_commands.append(start_workers_command)
|
||||
self._run_command_over_ssh_on_all_nodes_in_parallel(start_workers_commands)
|
||||
|
||||
setup_env_path = os.path.join(self.installation_directory, "ray/setup-env.sh")
|
||||
cd_location = remote_user_source_directory if user_source_directory is not None else os.path.join(self.installation_directory, "ray")
|
||||
print """
|
||||
The cluster has been started. You can attach to the cluster by sshing to the head node with the following command.
|
||||
|
||||
@@ -178,13 +180,13 @@ class RayCluster(object):
|
||||
Then run the following commands.
|
||||
|
||||
cd {}
|
||||
source {}
|
||||
source {} # Add Ray to your Python path.
|
||||
|
||||
Then within a Python interpreter, run the following commands.
|
||||
Then within a Python interpreter or script, run the following commands.
|
||||
|
||||
import ray
|
||||
ray.init(scheduler_address="{}:10001", objstore_address="{}:20001", driver_address="{}:30001")
|
||||
""".format(self.key_file, self.username, self.node_ip_addresses[0], remote_user_source_directory, setup_env_path, self.node_private_ip_addresses[0], self.node_private_ip_addresses[0], self.node_private_ip_addresses[0])
|
||||
""".format(self.key_file, self.username, self.node_ip_addresses[0], cd_location, setup_env_path, self.node_private_ip_addresses[0], self.node_private_ip_addresses[0], self.node_private_ip_addresses[0])
|
||||
|
||||
def stop_ray(self):
|
||||
"""Kill all of the processes in the Ray cluster.
|
||||
@@ -195,33 +197,39 @@ class RayCluster(object):
|
||||
kill_cluster_command = "killall scheduler objstore python > /dev/null 2> /dev/null"
|
||||
self._run_command_over_ssh_on_all_nodes_in_parallel(kill_cluster_command)
|
||||
|
||||
def update_ray(self):
|
||||
def update_ray(self, branch=None):
|
||||
"""Pull the latest Ray source code and rebuild Ray.
|
||||
|
||||
This method is used for updating the Ray source code on a Ray cluster. It
|
||||
will ssh to each node, will pull the latest source code from the Ray
|
||||
repository, and will rerun the build script (though currently it will not
|
||||
rebuild the third party libraries).
|
||||
|
||||
Args:
|
||||
branch (Optional[str]): The branch to check out. If omitted, then stay on
|
||||
the current branch.
|
||||
"""
|
||||
ray_directory = os.path.join(self.installation_directory, "ray")
|
||||
change_branch_command = "git checkout -f {}".format(branch) if branch is not None else ""
|
||||
update_cluster_command = """
|
||||
cd "{}" &&
|
||||
{}
|
||||
git fetch &&
|
||||
git reset --hard "@{{upstream}}" -- &&
|
||||
(make -C "./build" clean || rm -rf "./build") &&
|
||||
./build.sh
|
||||
""".format(ray_directory)
|
||||
""".format(ray_directory, change_branch_command)
|
||||
self._run_command_over_ssh_on_all_nodes_in_parallel(update_cluster_command)
|
||||
|
||||
def _update_user_code(self, user_source_directory):
|
||||
"""Update the user's source code on each node in the cluster.
|
||||
|
||||
This method is used to update the user's source code on each node in the
|
||||
cluster. The local user_source_directory will be copied under ray_source_files in
|
||||
the ray installation directory on the worker node. For example, if the ray
|
||||
installation directory is "/d/e/f" and we call _update_source_code("~/a/b/c"),
|
||||
then the contents of "~/a/b/c" on the local machine will be copied to
|
||||
"/d/e/f/user_source_files/c" on each node in the cluster.
|
||||
cluster. The local user_source_directory will be copied under
|
||||
ray_source_files in the home directory on the worker node. For example, if
|
||||
we call _update_source_code("~/a/b/c"), then the contents of "~/a/b/c" on
|
||||
the local machine will be copied to "~/user_source_files/c" on each
|
||||
node in the cluster.
|
||||
|
||||
Args:
|
||||
user_source_directory (str): The path on the local machine to the directory
|
||||
@@ -236,7 +244,7 @@ class RayCluster(object):
|
||||
raise Exception("Directory {} does not exist.".format(user_source_directory))
|
||||
# If user_source_directory is "/a/b/c", then local_directory_name is "c".
|
||||
local_directory_name = os.path.split(os.path.realpath(user_source_directory))[1]
|
||||
remote_directory = os.path.join(self.installation_directory, "user_source_files", local_directory_name)
|
||||
remote_directory = os.path.join("user_source_files", local_directory_name)
|
||||
# Remove and recreate the directory on the node.
|
||||
recreate_directory_command = """
|
||||
rm -r "{}";
|
||||
@@ -290,7 +298,8 @@ if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
username = args.username
|
||||
key_file = args.key_file
|
||||
installation_directory = args.installation_directory
|
||||
# Install Ray in the user's home directory on the cluster.
|
||||
installation_directory = "$HOME"
|
||||
node_ip_addresses = []
|
||||
node_private_ip_addresses = []
|
||||
for line in open(args.nodes).readlines():
|
||||
|
||||
Reference in New Issue
Block a user