```bash
#SBATCH --nodes=2                    # number of nodes (adjust to your cluster)
#SBATCH --ntasks-per-node=1          # crucial - only 1 task per node; the launcher spawns the per-GPU processes
#SBATCH --cpus-per-task=10           # number of cores per task
#SBATCH --gres=gpu:8                 # number of gpus
#SBATCH --time 20:00:00              # maximum execution time (HH:MM:SS)
#SBATCH --output=%x-%j.out           # output file name

export GPUS_PER_NODE=8
export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
export MASTER_PORT=9901

srun --jobid $SLURM_JOBID bash -c 'python -m torch.distributed.run \
    --nproc_per_node $GPUS_PER_NODE --nnodes $SLURM_NNODES --node_rank $SLURM_PROCID \
    --master_addr $MASTER_ADDR --master_port $MASTER_PORT \
    your_program.py --deepspeed ds_config.json'
```

You can then schedule the multi-node deployment with the following command, which launches training on all nodes simultaneously.
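Assuming the script above is saved as `launch.slurm` (the filename is illustrative; any name works), submit it with:

```bash
sbatch launch.slurm
```

`sbatch` hands the job to the scheduler; once the allocation starts, `srun` inside the script runs one launcher per allocated node, and `torch.distributed.run` in turn spawns one worker process per GPU on each node.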
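Finally, the `--deepspeed ds_config.json` flag in the script above points at a DeepSpeed configuration file. As a minimal sketch of what such a file might contain (assuming the Hugging Face `Trainer` integration, where `"auto"` values are filled in from the Trainer's own arguments; the ZeRO stage shown is only an example):

```json
{
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
    "fp16": {
        "enabled": "auto"
    },
    "zero_optimization": {
        "stage": 2
    }
}
```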