#SBATCH --cpus-per-task=10           # number of cores per task
#SBATCH --gres=gpu:8                 # number of gpus per node
#SBATCH --time 20:00:00              # maximum execution time (HH:MM:SS)
#SBATCH --output=%x-%j.out           # output file name
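
# distributed rendezvous setup: every node must agree on the address and
# port of the master (rank 0), which lives on the first node in the allocation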
export GPUS_PER_NODE=8
export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
export MASTER_PORT=9901
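
# srun runs this command once per node; on each node, torch.distributed.run
# spawns one worker process per GPU. The single quotes keep $SLURM_PROCID from
# being expanded until the command actually runs on its node.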
srun --jobid $SLURM_JOBID bash -c 'python -m torch.distributed.run \
    --nproc_per_node $GPUS_PER_NODE --nnodes $SLURM_NNODES --node_rank $SLURM_PROCID \
    --master_addr $MASTER_ADDR --master_port $MASTER_PORT \
    your_program.py --deepspeed ds_config.json'
You can then schedule the multi-node deployment with the following command, which launches training on all nodes at once.
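Assuming the script above is saved as `launch.slurm` (the filename here is illustrative), submit it with `sbatch`:

```bash
sbatch launch.slurm
```

SLURM allocates the requested nodes and runs the batch script on the first one; the `srun` command inside it then starts the `torch.distributed.run` launcher on every node in the allocation simultaneously.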