#!/bin/bash -l
#PBS -A NCIS0010
#PBS -N wx6h_pred
#PBS -l walltime=12:00:00
#PBS -l select=8:ncpus=64:ngpus=4
#PBS -q main
#PBS -j oe
#PBS -k eod
#PBS -r n
# Load modules
module purge
module load gcc craype cray-mpich cuda cudnn/8.8.1.3-12 conda
conda activate /glade/u/home/schreck/.conda/envs/credit-derecho
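# Optional sanity check (assumes the credit-derecho env provides torch):
# confirm PyTorch imports and that the node's GPUs are visible before the
# expensive multi-node launch.
python -c "import torch; print('torch', torch.__version__, '| GPUs visible:', torch.cuda.device_count())"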
# Export environment variables
export LSCRATCH=/glade/derecho/scratch/schreck/
export LOGLEVEL=INFO
export NCCL_DEBUG=INFO
export CUDA_VISIBLE_DEVICES=0,1,2,3
# NCCL bootstrap over the Slingshot high-speed network interfaces
export NCCL_SOCKET_IFNAME=hsn
# GPU-aware Cray MPICH
export MPICH_GPU_MANAGED_MEMORY_SUPPORT_ENABLED=1
export MPICH_OFI_NIC_POLICY=GPU
export MPICH_GPU_SUPPORT_ENABLED=1
export MPICH_RDMA_ENABLED_CUDA=1
# NCCL tuning; NCCL_NET selects the aws-ofi-nccl (libfabric) plugin
export NCCL_IB_DISABLE=1
export NCCL_CROSS_NIC=1
export NCCL_NCHANNELS_PER_NET_PEER=4
export NCCL_NET="AWS Libfabric"
export NCCL_NET_GDR_LEVEL=PHB
# libfabric CXI provider tuning
export FI_CXI_DISABLE_HOST_REGISTER=1
export FI_CXI_OPTIMIZED_MRS=false
export FI_MR_CACHE_MONITOR=userfaultfd
export FI_CXI_DEFAULT_CQ_SIZE=131072
# Print the run configuration
echo "Number of nodes: 8"
echo "Number of GPUs per node: 4"
echo "Total number of GPUs: 32"
# Log in to WandB if needed (supply your own API key)
# wandb login <your-wandb-api-key>
# Launch the distributed job: one torchrun launcher per node via mpiexec
nodes=( $(cat "$PBS_NODEFILE") )
echo "nodes: ${nodes[*]}"
# Find the head node's IP address (used as the rendezvous endpoint)
head_node=${nodes[0]}
head_node_ip=$(ssh "$head_node" hostname -i | awk '{print $1}')
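# A possible alternative that avoids the ssh round-trip (assumes the node
# hostnames resolve through the standard resolver):
# head_node_ip=$(getent hosts "$head_node" | awk '{print $1}')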
# MASTER_ADDR=$head_node_ip MASTER_PORT=1234 mpiexec -n 32 --ppn 4 --cpu-bind none python /glade/u/home/ksha/miles-physics/applications/rollout_to_netcdf.py -c model_multi.yml --backend nccl
mpiexec -n 8 --ppn 1 --cpu-bind none \
    torchrun --nnodes=8 --nproc-per-node=4 --rdzv-backend=c10d --rdzv-endpoint=$head_node_ip \
    /glade/u/home/ksha/miles-physics/applications/rollout_to_netcdf.py -c model_multi.yml
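# A sketch of a variant launch that pins the rendezvous port and ties the
# rendezvous id to the PBS job; the explicit port and the use of $PBS_JOBID
# are assumptions, not part of the original configuration.
# mpiexec -n 8 --ppn 1 --cpu-bind none \
#     torchrun --nnodes=8 --nproc-per-node=4 --rdzv-backend=c10d \
#     --rdzv-endpoint=$head_node_ip:29400 --rdzv-id=$PBS_JOBID \
#     /glade/u/home/ksha/miles-physics/applications/rollout_to_netcdf.py -c model_multi.yml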
# Alternative environment (older setup): nvhpc-based module stack with a locally
# built nccl-ofi-plugin; the remaining exports and the launch line largely
# duplicate the active configuration above.
# module purge
# module load nvhpc cuda cray-mpich conda
# conda activate /glade/work/ksha/miniconda3/envs/credit
# export NCCL_HOME=/glade/u/home/dhoward/work/nccl-ofi-plugin/install
# export LD_LIBRARY_PATH=$NCCL_HOME/lib:$NCCL_HOME/plugin/lib:$LD_LIBRARY_PATH