GPT-tiny.yaml
output_dir: outputs/GPT-tiny # <- where the output files are written
tokenizer_encoding: gpt2 # <- the tokenizer encoding, used by tiktoken (YOU SHOULD NOT CHANGE THIS)
model_config:
  n_embd: 256 # <- dimension of token and positional embeddings
  n_head: 16 # <- number of attention heads in multi-head attention
  n_positions: 128 # <- maximum number of tokens the model can take as input
  n_layer: 4 # <- number of decoder blocks
device: auto # <- which device to put the model on (YOU DO NOT NEED TO CHANGE THIS)
batch_size: 32 # <- number of sequences to feed into the model at a time
seq_len: 128 # <- length of each sequence during training and evaluation; must be <= model_config.n_positions
num_warmup_steps: 2000 # <- number of warmup steps in cosine annealing
num_training_steps: 90000 # <- number of training steps in cosine annealing
grad_accumulation_steps: 1 # <- number of gradient-accumulation micro-steps before each model update
min_lr: 1e-4 # <- minimum learning rate in cosine annealing
max_lr: 1e-3 # <- maximum learning rate in cosine annealing
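
A minimal sketch of how this file can be consumed, assuming PyYAML and tiktoken are installed (the file path is taken from the config above; the surrounding training code is not shown here):

```python
import yaml      # PyYAML
import tiktoken

# Load the config (path assumed to match this file's location).
with open("GPT-tiny.yaml") as f:
    cfg = yaml.safe_load(f)

# tokenizer_encoding is passed straight to tiktoken; "gpt2" is a built-in encoding.
enc = tiktoken.get_encoding(cfg["tokenizer_encoding"])
print(enc.encode("hello world"))          # -> GPT-2 BPE token ids

# Nested model hyperparameters come back as a plain dict.
print(cfg["model_config"]["n_embd"])      # -> 256

# Gotcha: PyYAML resolves exponent-only literals like 1e-4 as strings,
# so cast the learning rates before using them.
min_lr, max_lr = float(cfg["min_lr"]), float(cfg["max_lr"])
```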
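The warmup and learning-rate fields describe cosine annealing with linear warmup. One common formulation is sketched below, assuming the rate rises linearly to max_lr over num_warmup_steps and then decays along a cosine curve to min_lr by num_training_steps (the actual trainer may handle the boundaries slightly differently):

```python
import math

def lr_at_step(step: int, *, max_lr: float, min_lr: float,
               num_warmup_steps: int, num_training_steps: int) -> float:
    if step < num_warmup_steps:
        # Linear warmup from 0 to max_lr.
        return max_lr * (step + 1) / num_warmup_steps
    if step >= num_training_steps:
        # Past the end of the schedule, hold at the floor.
        return min_lr
    # Cosine decay from max_lr to min_lr over the remaining steps.
    progress = (step - num_warmup_steps) / (num_training_steps - num_warmup_steps)
    return min_lr + 0.5 * (max_lr - min_lr) * (1.0 + math.cos(math.pi * progress))

# With the values above: lr reaches 1e-3 at step 2000, then anneals to 1e-4 by step 90000.
print(lr_at_step(2000, max_lr=1e-3, min_lr=1e-4,
                 num_warmup_steps=2000, num_training_steps=90000))
```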