Update cluster guide (#347)

* clarify cluster setup instructions

* update multinode documentation, update cluster script, fix minor bug in worker.py

* clarify cluster documentation and fix update_user_code
This commit is contained in:
Johann Schleier-Smith
2016-08-04 09:14:20 -07:00
committed by Robert Nishihara
parent de200ff912
commit 3ee0fd8f34
4 changed files with 106 additions and 55 deletions
+28 -19
View File
@@ -10,7 +10,6 @@ parser = argparse.ArgumentParser(description="Parse information about the cluste
parser.add_argument("--nodes", type=str, required=True, help="Test file with node IP addresses, one line per address.")
parser.add_argument("--key-file", type=str, required=True, help="Path to the file that contains the private key.")
parser.add_argument("--username", type=str, required=True, help="User name for logging in.")
parser.add_argument("--installation-directory", type=str, required=True, help="The directory in which to install Ray.")
class RayCluster(object):
"""A class for setting up, starting, and stopping Ray on a cluster.
@@ -130,7 +129,7 @@ class RayCluster(object):
""".format(self.installation_directory, self.installation_directory)
self._run_command_over_ssh_on_all_nodes_in_parallel(install_ray_command)
def start_ray(self, user_source_directory, num_workers_per_node=10):
def start_ray(self, user_source_directory=None, num_workers_per_node=10):
"""Start Ray on a cluster.
This method is used to start Ray on a cluster. It will ssh to the head node,
@@ -139,13 +138,14 @@ class RayCluster(object):
workers.
Args:
user_source_directory (str): The path to the local directory containing the
user's source code. Files and directories in this directory can be used
as modules in remote functions.
user_source_directory (Optional[str]): The path to the local directory
containing the user's source code. If provided, files and directories in
this directory can be used as modules in remote functions.
num_workers_per_node (int): The number workers to start on each node.
"""
# First update the worker code on the nodes.
remote_user_source_directory = self._update_user_code(user_source_directory)
if user_source_directory is not None:
remote_user_source_directory = self._update_user_code(user_source_directory)
scripts_directory = os.path.join(self.installation_directory, "ray/scripts")
# Start the scheduler
@@ -160,16 +160,18 @@ class RayCluster(object):
# Start the workers on each node
# The triple backslashes are used for two rounds of escaping, something like \\\" -> \" -> "
start_workers_commands = []
remote_user_source_directory_str = "\\\"{}\\\"".format(remote_user_source_directory) if user_source_directory is not None else "None"
for i, node_ip_address in enumerate(self.node_ip_addresses):
start_workers_command = """
cd "{}";
source ../setup-env.sh;
python -c "import ray; ray.services.start_node(\\\"{}:10001\\\", \\\"{}\\\", {}, user_source_directory=\\\"{}\\\")" > start_workers.out 2> start_workers.err < /dev/null &
""".format(scripts_directory, self.node_private_ip_addresses[0], self.node_private_ip_addresses[i], num_workers_per_node, remote_user_source_directory)
""".format(scripts_directory, self.node_private_ip_addresses[0], self.node_private_ip_addresses[i], num_workers_per_node, remote_user_source_directory_str)
start_workers_commands.append(start_workers_command)
self._run_command_over_ssh_on_all_nodes_in_parallel(start_workers_commands)
setup_env_path = os.path.join(self.installation_directory, "ray/setup-env.sh")
cd_location = remote_user_source_directory if user_source_directory is not None else os.path.join(self.installation_directory, "ray")
print """
The cluster has been started. You can attach to the cluster by sshing to the head node with the following command.
@@ -178,13 +180,13 @@ class RayCluster(object):
Then run the following commands.
cd {}
source {}
source {} # Add Ray to your Python path.
Then within a Python interpreter, run the following commands.
Then within a Python interpreter or script, run the following commands.
import ray
ray.init(scheduler_address="{}:10001", objstore_address="{}:20001", driver_address="{}:30001")
""".format(self.key_file, self.username, self.node_ip_addresses[0], remote_user_source_directory, setup_env_path, self.node_private_ip_addresses[0], self.node_private_ip_addresses[0], self.node_private_ip_addresses[0])
""".format(self.key_file, self.username, self.node_ip_addresses[0], cd_location, setup_env_path, self.node_private_ip_addresses[0], self.node_private_ip_addresses[0], self.node_private_ip_addresses[0])
def stop_ray(self):
"""Kill all of the processes in the Ray cluster.
@@ -195,33 +197,39 @@ class RayCluster(object):
kill_cluster_command = "killall scheduler objstore python > /dev/null 2> /dev/null"
self._run_command_over_ssh_on_all_nodes_in_parallel(kill_cluster_command)
def update_ray(self):
def update_ray(self, branch=None):
"""Pull the latest Ray source code and rebuild Ray.
This method is used for updating the Ray source code on a Ray cluster. It
will ssh to each node, will pull the latest source code from the Ray
repository, and will rerun the build script (though currently it will not
rebuild the third party libraries).
Args:
branch (Optional[str]): The branch to check out. If omitted, then stay on
the current branch.
"""
ray_directory = os.path.join(self.installation_directory, "ray")
change_branch_command = "git checkout -f {}".format(branch) if branch is not None else ""
update_cluster_command = """
cd "{}" &&
{}
git fetch &&
git reset --hard "@{{upstream}}" -- &&
(make -C "./build" clean || rm -rf "./build") &&
./build.sh
""".format(ray_directory)
""".format(ray_directory, change_branch_command)
self._run_command_over_ssh_on_all_nodes_in_parallel(update_cluster_command)
def _update_user_code(self, user_source_directory):
"""Update the user's source code on each node in the cluster.
This method is used to update the user's source code on each node in the
cluster. The local user_source_directory will be copied under ray_source_files in
the ray installation directory on the worker node. For example, if the ray
installation directory is "/d/e/f" and we call _update_source_code("~/a/b/c"),
then the contents of "~/a/b/c" on the local machine will be copied to
"/d/e/f/user_source_files/c" on each node in the cluster.
cluster. The local user_source_directory will be copied under
ray_source_files in the home directory on the worker node. For example, if
we call _update_source_code("~/a/b/c"), then the contents of "~/a/b/c" on
the local machine will be copied to "~/user_source_files/c" on each
node in the cluster.
Args:
user_source_directory (str): The path on the local machine to the directory
@@ -236,7 +244,7 @@ class RayCluster(object):
raise Exception("Directory {} does not exist.".format(user_source_directory))
# If user_source_directory is "/a/b/c", then local_directory_name is "c".
local_directory_name = os.path.split(os.path.realpath(user_source_directory))[1]
remote_directory = os.path.join(self.installation_directory, "user_source_files", local_directory_name)
remote_directory = os.path.join("user_source_files", local_directory_name)
# Remove and recreate the directory on the node.
recreate_directory_command = """
rm -r "{}";
@@ -290,7 +298,8 @@ if __name__ == "__main__":
args = parser.parse_args()
username = args.username
key_file = args.key_file
installation_directory = args.installation_directory
# Install Ray in the user's home directory on the cluster.
installation_directory = "$HOME"
node_ip_addresses = []
node_private_ip_addresses = []
for line in open(args.nodes).readlines():