# inherits from base_experiment.yaml
# most of the options have docs at https://lightning.ai/docs/pytorch/stable/common/trainer.html

defaults:
  - base_experiment
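
# Values in this file (and those inherited from base_experiment) can be overridden from
# the command line with Hydra-style dotted paths, assuming this project composes configs
# with Hydra. A hypothetical invocation (the actual entry-point script name will differ):
#   python train.py training.lr=3e-4 training.batch_size=32 num_nodes=2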

tasks: [training] # tasks to run sequentially; change this when your project has multiple stages and you only want to run a subset of them
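# e.g. tasks: [training, test] would run the training stage and then the test stage,
# assuming the project defines a task matching the test section below.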
num_nodes: 1 # number of GPU servers (nodes) used in large-scale distributed training

training:
  precision: 16-mixed # floating-point precision; 16-mixed is faster while 32 is more numerically stable
  compile: False # whether to compile the model with torch.compile
  lr: 0.001 # learning rate
  batch_size: 16 # training batch size per GPU; effective batch size is this number * GPUs per node * nodes when using distributed training
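  # e.g. batch_size: 16 on 2 nodes with 4 GPUs each gives an effective batch size of 16 * 4 * 2 = 128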
  max_epochs: 1000 # set to -1 to train forever
  max_steps: -1 # maximum number of training steps; set to -1 to disable, otherwise training stops when either max_steps or max_epochs is reached first
  max_time: null # set to something like "00:12:00:00" to enable
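  # e.g. max_time: "00:12:00:00" stops training after 12 hours; the string format is days:hours:minutes:seconds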
  data:
    num_workers: 4 # number of dataloader worker processes for data preprocessing
    shuffle: True # whether training data will be shuffled
  optim:
    accumulate_grad_batches: 1 # accumulate gradients for n batches before backprop
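    # e.g. accumulate_grad_batches: 4 with batch_size: 16 takes one optimizer step per 4 * 16 = 64 samples per GPU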
    gradient_clip_val: 0 # clip gradients with norm above this value, set to 0 to disable
  checkpointing:
    # these are arguments to pytorch lightning's callback, `ModelCheckpoint` class
    every_n_train_steps: 5000 # save a checkpoint every n train steps
    every_n_epochs: null # mutually exclusive with ``every_n_train_steps`` and ``train_time_interval``
    train_time_interval: null # in the format "00:12:00:00"; mutually exclusive with ``every_n_train_steps`` and ``every_n_epochs``.
    enable_version_counter: False # if ``False``, later checkpoints will overwrite previous ones instead of getting a version suffix
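    # e.g. to checkpoint once per epoch instead of every n steps, set every_n_train_steps: null and every_n_epochs: 1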

validation:
  precision: 16-mixed
  compile: False # whether to compile the model with torch.compile
  batch_size: 16 # validation batch size per GPU; effective batch size is this number * GPUs per node * nodes when using distributed training
  val_every_n_step: 2000 # how often validation runs; can be a float (fraction of a training epoch), an int (number of training steps), or null (if val_every_n_epoch is set)
  val_every_n_epoch: null # run validation every n epochs instead; requires val_every_n_step to be null
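  # e.g. val_every_n_step: 0.25 validates four times per training epoch, while 2000 validates every 2000 training steps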
  limit_batch: null # if null, run through the full validation set; otherwise limit the number of batches used for validation
  inference_mode: True # whether to run validation under torch.inference_mode (enable_grad won't work inside it!)
  data:
    num_workers: 4 # number of dataloader worker processes for validation data preprocessing
    shuffle: False # whether validation data will be shuffled

test:
  precision: 16-mixed
  compile: False # whether to compile the model with torch.compile
  batch_size: 4 # test batch size per GPU; effective batch size is this number * GPUs per node * nodes when using distributed training
  limit_batch: null # if null, run through the full test set; otherwise limit the number of batches used for testing
  data:
    num_workers: 4 # number of dataloader worker processes for test data preprocessing
    shuffle: False # whether test data will be shuffled