---
# config.yaml
runtime: null
# Model file locations; each stays null until a path is supplied.
# GPT NeMo file path.
gpt_model_file: null
# ONNX file path.
onnx_model_file: null
# TRT engine file path.
trt_engine_file: null
# Parameters for loading from a checkpoint.
# Folder that contains a .ckpt file.
checkpoint_dir: null
# Name of the .ckpt file inside checkpoint_dir.
checkpoint_name: null
# .yaml file that holds the hyperparameters of the checkpoint.
hparams_file: null
batch_size: 1
use_cache: True
# Export the ONNX model with only one input.
use_one_input: False
# Prompts for GPT inference.
prompts:
  - "How are you?"
  - "TensorRT is a Deep Learning compiler used for deep learning."
# Run mode; can be changed to accuracy or benchmark.
mode: 'inference'
# Settings used when mode is 'inference'.
inference:
  # Use greedy decoding when True; use sampling otherwise.
  greedy: True
  # The number of highest-probability vocabulary tokens to keep for
  # top-k filtering.
  top_k: 0
  # If set to a float < 1, only the most probable tokens with probabilities
  # that add up to top_p or higher are kept for generation.
  top_p: 0.9
  # Sampling temperature.
  temperature: 1.0
  # Add the BOS token at the beginning of the prompt.
  add_BOS: True
  # The maximum length of the sequence to be generated.
  tokens_to_generate: 30
  # Whether to return the log prob for all the tokens in the vocab.
  all_probs: False
  # Repetition penalty; 1.0 means no penalty.
  repetition_penalty: 1.2
  # The minimum length of the sequence to be generated.
  min_tokens_to_generate: 0
  # Compute the logprob of all the input text; a very special case of
  # running inference (default False).
  compute_logprob: False
  seed: 1234
# Settings used when mode is 'accuracy'.
accuracy:
  dataset: Lambada
  metric: Perplexity
  # Comma-separated values kept as one string; quoted so the scalar is
  # never re-typed by a parser.
  top_n: '1,3,5'
  tokens_to_generate: 5
# Settings used when mode is 'benchmark'.
benchmark:
  input_seq_len: 20
  output_seq_len: 20
# Options for the NeMo-to-ONNX export.
onnx_export_options:
  runtime_check: False
  verbose: False
  onnx_opset: 17
  do_constant_folding: True
  cache_support: False
  # Prune the ONNX model for the Sparse Tensor Cores 2:4 pattern.
  prune: False
  device: 'cuda'
  check_tolerance: 0.01
  use_fp8_storage: False
  quantize_bmms: False
# Options for the ONNX-to-TRT export.
trt_export_options:
  # The optimized sequence length.
  opt_seq_len: 128
  use_tf32: True
  use_fp16: False
  use_fp8: False
  use_bf16: False
  # Enabling strongly typed mode invalidates the `use_[fp8|fp16|bf16]` flags.
  use_strongly_typed: True
  # Enable sparsity in the TRT engine builder.
  sparse: False
  timing_cache: 'functional.cache'
# Trainer settings.
trainer:
  devices: 1
  num_nodes: 1
  accelerator: gpu
  # Logger is provided by exp_manager.
  logger: False
  # One of 16, 32, or bf16.
  precision: 32
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
# Used for encoder and decoder models (0 for others).
pipeline_model_parallel_split_rank: 0
# Model architecture.
model:
  # The max sequence length for the attention mask.
  max_seq_len: 256
  encoder_seq_length: 2048
  # Interpolated from the sibling encoder_seq_length key.
  max_position_embeddings: ${.encoder_seq_length}
  num_layers: 24
  hidden_size: 4096
  nb_heads: 32
  head_size: 128
  vocab_size: 50304