mirror of
https://github.com/wassname/ray.git
synced 2026-06-27 19:32:11 +08:00
Streamlined slurm script and removed references to redis_password (#10827)
Co-authored-by: Ralph Kube <ralph.kube@uit.not> Co-authored-by: Richard Liaw <rliaw@berkeley.edu> Resolved Conflicts: doc/source/cluster/slurm.rst
This commit is contained in:
@@ -21,57 +21,48 @@ Here are some community-contributed templates for using SLURM with Ray:
|
||||
|
||||
.. _`Template script`: https://gist.github.com/pengzhenghao/b348db1075101a9b986c4cdfea13dcd6
|
||||
|
||||
|
||||
Starter SLURM script
|
||||
--------------------
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
#!/bin/bash
|
||||
|
||||
#SBATCH --job-name=test
|
||||
#SBATCH --cpus-per-task=5
|
||||
#SBATCH --mem-per-cpu=1GB
|
||||
#SBATCH --nodes=3
|
||||
#SBATCH --tasks-per-node 1
|
||||
#SBATCH --nodes=4
|
||||
#SBATCH --tasks-per-node=1
|
||||
#SBATCH --time=00:30:00
|
||||
#SBATCH --reservation=test
|
||||
|
||||
worker_num=2 # Must be one less than the total number of nodes
|
||||
let "worker_num=(${SLURM_NTASKS} - 1)"
|
||||
|
||||
# module load Langs/Python/3.6.4 # This will vary depending on your environment
|
||||
# source venv/bin/activate
|
||||
|
||||
nodes=$(scontrol show hostnames $SLURM_JOB_NODELIST) # Getting the node names
|
||||
nodes_array=( $nodes )
|
||||
|
||||
node1=${nodes_array[0]}
|
||||
|
||||
ip_prefix=$(srun --nodes=1 --ntasks=1 -w $node1 hostname --ip-address) # Making address
|
||||
suffix=':6379'
|
||||
ip_head=$ip_prefix$suffix
|
||||
redis_password=$(uuidgen)
|
||||
# Define the total number of CPU cores available to ray
|
||||
let "total_cores=${worker_num} * ${SLURM_CPUS_PER_TASK}"
|
||||
|
||||
suffix='6379'
|
||||
ip_head=`hostname`:$suffix
|
||||
export ip_head # Exporting for later access by trainer.py
|
||||
|
||||
srun --nodes=1 --ntasks=1 -w $node1 ray start --block --head --redis-port=6379 --redis-password=$redis_password & # Starting the head
|
||||
# Start the ray head node on the node that executes this script by specifying --nodes=1 and --nodelist=`hostname`
|
||||
# We are using 1 task on this node and 5 CPUs (Threads). Have the dashboard listen to 0.0.0.0 to bind it to all
|
||||
# network interfaces. This allows accessing the dashboard through port-forwarding:
|
||||
# Let's say the hostname=cluster-node-500. To view the dashboard on localhost:8265, set up an ssh-tunnel like this (assuming the firewall allows it):
|
||||
# $ ssh -N -f -L 8265:cluster-node-500:8265 user@big-cluster
|
||||
srun --nodes=1 --ntasks=1 --cpus-per-task=${SLURM_CPUS_PER_TASK} --nodelist=`hostname` ray start --head --block --dashboard-host 0.0.0.0 --port=6379 --num-cpus ${SLURM_CPUS_PER_TASK} &
|
||||
sleep 5
|
||||
# Make sure the head successfully starts before any worker does, otherwise
|
||||
# the worker will not be able to connect to redis. In case of longer delay,
|
||||
# adjust the sleep time above to ensure proper order.
|
||||
|
||||
# Now we execute worker_num worker nodes on all nodes in the allocation except hostname by
|
||||
|
||||
# Now we execute worker_num worker nodes on all nodes in the allocation except hostname by
|
||||
# specifying --nodes=${worker_num} and --exclude=`hostname`. Use 1 task per node, so worker_num tasks in total
|
||||
# (--ntasks=${worker_num}) and 5 CPUs per task (--cpus-per-task=${SLURM_CPUS_PER_TASK}).
|
||||
srun --nodes=${worker_num} --ntasks=${worker_num} --cpus-per-task=${SLURM_CPUS_PER_TASK} --exclude=`hostname` ray start --address $ip_head --block --num-cpus ${SLURM_CPUS_PER_TASK} &
|
||||
sleep 5
|
||||
|
||||
for (( i=1; i<=$worker_num; i++ ))
|
||||
do
|
||||
node2=${nodes_array[$i]}
|
||||
srun --nodes=1 --ntasks=1 -w $node2 ray start --block --address=$ip_head --redis-password=$redis_password & # Starting the workers
|
||||
# Flag --block will keep ray process alive on each compute node.
|
||||
sleep 5
|
||||
done
|
||||
|
||||
python -u trainer.py $redis_password 15 # Pass the total number of allocated CPUs
|
||||
python -u trainer.py ${total_cores} # Pass the total number of allocated CPUs
|
||||
|
||||
.. code-block:: python
|
||||
|
||||
@@ -82,10 +73,9 @@ Starter SLURM script
|
||||
import time
|
||||
import ray
|
||||
|
||||
redis_password = sys.argv[1]
|
||||
num_cpus = int(sys.argv[2])
|
||||
num_cpus = int(sys.argv[1])
|
||||
|
||||
ray.init(address=os.environ["ip_head"], _redis_password=redis_password)
|
||||
ray.init(address=os.environ["ip_head"])
|
||||
|
||||
print("Nodes in the Ray cluster:")
|
||||
print(ray.nodes())
|
||||
|
||||
Reference in New Issue
Block a user