From 60fcbeebaab724b6422598bc38f66979d7753958 Mon Sep 17 00:00:00 2001 From: rkube Date: Fri, 18 Sep 2020 17:55:56 -0400 Subject: [PATCH] Streamlined slurm script and removed references to redis_password (#10827) Co-authored-by: Ralph Kube Co-authored-by: Richard Liaw Resolved Conflicts: doc/source/cluster/slurm.rst --- doc/source/cluster/slurm.rst | 52 +++++++++++++++--------------------- 1 file changed, 21 insertions(+), 31 deletions(-) diff --git a/doc/source/cluster/slurm.rst b/doc/source/cluster/slurm.rst index 9f033c165..38729bf46 100644 --- a/doc/source/cluster/slurm.rst +++ b/doc/source/cluster/slurm.rst @@ -21,57 +21,48 @@ Here are some community-contributed templates for using SLURM with Ray: .. _`Template script`: https://gist.github.com/pengzhenghao/b348db1075101a9b986c4cdfea13dcd6 + Starter SLURM script -------------------- .. code-block:: bash #!/bin/bash - #SBATCH --job-name=test #SBATCH --cpus-per-task=5 #SBATCH --mem-per-cpu=1GB - #SBATCH --nodes=3 - #SBATCH --tasks-per-node 1 + #SBATCH --nodes=4 + #SBATCH --tasks-per-node=1 + #SBATCH --time=00:30:00 + #SBATCH --reservation=test - worker_num=2 # Must be one less that the total number of nodes + let "worker_num=(${SLURM_NTASKS} - 1)" - # module load Langs/Python/3.6.4 # This will vary depending on your environment - # source venv/bin/activate - - nodes=$(scontrol show hostnames $SLURM_JOB_NODELIST) # Getting the node names - nodes_array=( $nodes ) - - node1=${nodes_array[0]} - - ip_prefix=$(srun --nodes=1 --ntasks=1 -w $node1 hostname --ip-address) # Making address - suffix=':6379' - ip_head=$ip_prefix$suffix - redis_password=$(uuidgen) + # Define the total number of CPU cores available to ray + let "total_cores=${worker_num} * ${SLURM_CPUS_PER_TASK}" + suffix='6379' + ip_head=`hostname`:$suffix export ip_head # Exporting for latter access by trainer.py - srun --nodes=1 --ntasks=1 -w $node1 ray start --block --head --redis-port=6379 --redis-password=$redis_password & # Starting the head + # Start the ray head node on the node that executes this script by specifying --nodes=1 and --nodelist=`hostname` + # We are using 1 task on this node and 5 CPUs (Threads). Have the dashboard listen to 0.0.0.0 to bind it to all + # network interfaces. This allows to access the dashboard through port-forwarding: + # Let's say the hostname=cluster-node-500 To view the dashboard on localhost:8265, set up an ssh-tunnel like this: (assuming the firewall allows it) + # $ ssh -N -f -L 8265:cluster-node-500:8265 user@big-cluster + srun --nodes=1 --ntasks=1 --cpus-per-task=${SLURM_CPUS_PER_TASK} --nodelist=`hostname` ray start --head --block --dashboard-host 0.0.0.0 --port=6379 --num-cpus ${SLURM_CPUS_PER_TASK} & sleep 5 # Make sure the head successfully starts before any worker does, otherwise # the worker will not be able to connect to redis. In case of longer delay, # adjust the sleeptime above to ensure proper order. - - # Now we execute worker_num worker nodes on all nodes in the allocation except hostname by + + # Now we execute worker_num worker nodes on all nodes in the allocation except hostname by # specifying --nodes=${worker_num} and --exclude=`hostname`. Use 1 task per node, so worker_num tasks in total # (--ntasks=${worker_num}) and 5 CPUs per task (--cps-per-task=${SLURM_CPUS_PER_TASK}). srun --nodes=${worker_num} --ntasks=${worker_num} --cpus-per-task=${SLURM_CPUS_PER_TASK} --exclude=`hostname` ray start --address $ip_head --block --num-cpus ${SLURM_CPUS_PER_TASK} & sleep 5 - for (( i=1; i<=$worker_num; i++ )) - do - node2=${nodes_array[$i]} - srun --nodes=1 --ntasks=1 -w $node2 ray start --block --address=$ip_head --redis-password=$redis_password & # Starting the workers - # Flag --block will keep ray process alive on each compute node. - sleep 5 - done - - python -u trainer.py $redis_password 15 # Pass the total number of allocated CPUs + python -u trainer.py ${total_cores} # Pass the total number of allocated CPUs .. code-block:: python @@ -82,10 +73,9 @@ Starter SLURM script import time import ray - redis_password = sys.argv[1] - num_cpus = int(sys.argv[2]) + num_cpus = int(sys.argv[1]) - ray.init(address=os.environ["ip_head"], _redis_password=redis_password) + ray.init(address=os.environ["ip_head"]) print("Nodes in the Ray cluster:") print(ray.nodes())