```bash
#SBATCH --nodes=2                    # number of nodes (adjust to your cluster)
#SBATCH --ntasks-per-node=1          # crucial - only 1 task per node; the launcher spawns the per-GPU processes
#SBATCH --cpus-per-task=10           # number of cores per task
#SBATCH --gres=gpu:8                 # number of gpus
#SBATCH --time 20:00:00              # maximum execution time (HH:MM:SS)
#SBATCH --output=%x-%j.out           # output file name

export GPUS_PER_NODE=8
export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
export MASTER_PORT=9901

srun --jobid $SLURM_JOBID bash -c 'python -m torch.distributed.run \
    --nproc_per_node $GPUS_PER_NODE --nnodes $SLURM_NNODES --node_rank $SLURM_PROCID \
    --master_addr $MASTER_ADDR --master_port $MASTER_PORT \
    your_program.py --deepspeed ds_config.json'
```

You can then schedule the multi-node deployment with the following command, which launches training on all nodes simultaneously.
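Assuming the script above is saved as `launch.slurm` (the filename is illustrative; any name works), submit it with:

```bash
sbatch launch.slurm
```

`sbatch` hands the job to the scheduler; once the allocation starts, `srun` inside the script runs one launcher per allocated node, and `torch.distributed.run` in turn spawns one worker process per GPU on each node.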
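Finally, the `--deepspeed ds_config.json` flag in the script above points at a DeepSpeed configuration file. As a minimal sketch of what such a file might contain (assuming the Hugging Face `Trainer` integration, where `"auto"` values are filled in from the Trainer's own arguments; the ZeRO stage shown is only an example):

```json
{
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
    "fp16": {
        "enabled": "auto"
    },
    "zero_optimization": {
        "stage": 2
    }
}
```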