# To deploy on 1 GPU:
#
# deepspeed --num_gpus 1 t0.py
# or:
# python -m torch.distributed.run --nproc_per_node=1 t0.py
#
# To deploy on 2 GPUs:
#
# deepspeed --num_gpus 2 t0.py
# or:
# python -m torch.distributed.run --nproc_per_node=2 t0.py
from transformers import AutoTokenizer, AutoConfig, AutoModelForSeq2SeqLM
from transformers.integrations import HfDeepSpeedConfig
import deepspeed
import os
import torch

os.environ["TOKENIZERS_PARALLELISM"] = "false"  # to avoid warnings about parallelism in tokenizers
# distributed setup
local_rank = int(os.getenv("LOCAL_RANK", "0"))
world_size = int(os.getenv("WORLD_SIZE", "1"))
torch.cuda.set_device(local_rank)
deepspeed.init_distributed()  # initializes torch.distributed (NCCL backend by default)
model_name = "bigscience/T0_3B"

config = AutoConfig.from_pretrained(model_name)
model_hidden_size = config.d_model  # typically used to size the ZeRO-3 buckets in ds_config
# batch size has to be divisible by world_size, but can be bigger than world_size
train_batch_size = 1 * world_size
# ds_config notes
#
# - enable bf16 if you use an Ampere or newer GPU - this will run in mixed precision and
#   will be faster.
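#
# For illustration only - a hedged sketch of how that bf16 toggle might be expressed as a
# standalone dict following the DeepSpeed config schema ("bf16" / "enabled" keys); this is
# NOT the full ds_config, and the variable name below is just an example. Detecting support
# at runtime lets pre-Ampere GPUs fall back to disabled:
example_bf16_section = {"bf16": {"enabled": torch.cuda.is_bf16_supported()}}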