# demo/NeMo/config.yaml

# Runtime selection and paths to the model artifacts. All four keys are left
# unset (null) in this file.
# NOTE(review): the accepted `runtime` values are not visible in this file;
# confirm them against the code that loads this config.
runtime: null
gpt_model_file: null # Path to the GPT NeMo model file
onnx_model_file: null # Path to the ONNX model file
trt_engine_file: null # Path to the TensorRT (TRT) engine file

# Parameters for loading from a checkpoint. All three keys are left unset
# (null) in this file.
# NOTE(review): presumably an alternative to `gpt_model_file`; confirm which
# source takes precedence in the loading code.
checkpoint_dir: null # Path to a folder that contains a .ckpt file
checkpoint_name: null # Name of the .ckpt file within `checkpoint_dir`
hparams_file: null # Path to a .yaml file that contains the hyperparameters of the checkpoint

# General run settings.
batch_size: 1
# NOTE(review): presumably toggles the key/value cache during generation;
# confirm against the consuming code.
use_cache: true
# Export the ONNX model with only one input.
use_one_input: false
# Prompts for GPT inference.
prompts:
  - 'How are you?'
  - 'TensorRT is a Deep Learning compiler used for deep learning.'

# Run mode. May be changed to 'accuracy' or 'benchmark'; each mode name has a
# same-named top-level section in this file.
mode: 'inference'

# Text-generation settings.
inference:
  # Greedy-decoding switch.
  # NOTE(review): the key name suggests true = greedy decoding and
  # false = sampling; confirm against the generation code.
  greedy: true
  # Number of highest-probability vocabulary tokens to keep for top-k
  # filtering.
  top_k: 0
  # If set to a float < 1, only the most probable tokens whose probabilities
  # add up to top_p or higher are kept for generation.
  top_p: 0.9
  # Sampling temperature.
  temperature: 1.0
  # Add the BOS token at the beginning of the prompt.
  add_BOS: true
  # Maximum length of the sequence to be generated.
  tokens_to_generate: 30
  # Whether to return the log probabilities for all tokens in the vocabulary.
  all_probs: false
  # Repetition penalty; 1.0 means no penalty.
  repetition_penalty: 1.2
  # Minimum length of the sequence to be generated.
  min_tokens_to_generate: 0
  # Compute the log probabilities of all the input text. This is a very
  # special case of running inference; the default is false.
  compute_logprob: false
  # NOTE(review): presumably the random seed used for generation; confirm.
  seed: 1234

# Accuracy-evaluation settings (presumably read when `mode` is 'accuracy';
# confirm against the consuming code).
accuracy:
  dataset: Lambada
  metric: Perplexity
  # Comma-separated list stored as ONE string. It is quoted so that every
  # loader yields the text '1,3,5'; some YAML implementations accept ',' as a
  # digit separator in plain scalars and would return an integer instead.
  top_n: '1,3,5'
  # NOTE(review): separate from `inference.tokens_to_generate`; presumably the
  # generation length used during accuracy runs. Confirm.
  tokens_to_generate: 5

# Benchmark settings (presumably read when `mode` is 'benchmark'; confirm
# against the consuming code).
benchmark:
  input_seq_len: 20 # input sequence length; unit (tokens?) is not stated here, TODO confirm
  output_seq_len: 20 # output sequence length; same unit as input_seq_len

# Options for the NeMo -> ONNX export.
onnx_export_options:
  runtime_check: false
  verbose: false
  # ONNX opset version to export with.
  onnx_opset: 17
  do_constant_folding: true
  # NOTE(review): the top-level `use_cache` key is enabled while this flag is
  # disabled; confirm how the two interact.
  cache_support: false
  # Prune the ONNX model for the Sparse Tensor Cores 2:4 pattern.
  prune: false
  device: 'cuda'
  # NOTE(review): presumably the numeric tolerance used when `runtime_check`
  # is enabled; confirm.
  check_tolerance: 0.01
  use_fp8_storage: false
  quantize_bmms: false

# Options for the ONNX -> TensorRT (TRT) engine export.
trt_export_options:
  # Sequence length the engine is optimized for.
  opt_seq_len: 128
  use_tf32: true
  use_fp16: false
  use_fp8: false
  use_bf16: false
  # Enabling strongly typed mode invalidates the `use_[fp8|fp16|bf16]` flags.
  use_strongly_typed: true
  # Enable sparsity in the TRT engine builder.
  sparse: false
  # NOTE(review): presumably the path of the builder timing-cache file;
  # confirm.
  timing_cache: 'functional.cache'

# Trainer settings.
# NOTE(review): presumably forwarded to the framework's trainer object;
# confirm against the consuming code.
trainer:
  devices: 1
  num_nodes: 1
  accelerator: gpu
  # Disabled here; the logger is provided by exp_manager.
  logger: false
  # Accepted values: 16, 32, or bf16.
  precision: 32

# Model-parallel layout. Both parallel sizes are 1 in this file.
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1
pipeline_model_parallel_split_rank: 0 # used for encoder-decoder models (0 for others)

# Model architecture
model:
  max_seq_len: 256 # maximum sequence length for the attention mask
  encoder_seq_length: 2048
  # Relative interpolation: takes the value of the sibling key
  # `encoder_seq_length` in this same mapping.
  # NOTE(review): this is OmegaConf-style syntax and assumes the file is
  # loaded through OmegaConf/Hydra; a plain YAML loader would yield the
  # literal string instead. Confirm.
  max_position_embeddings: ${.encoder_seq_length}
  num_layers: 24
  hidden_size: 4096 # equals nb_heads * head_size (32 * 128) for the values in this file
  nb_heads: 32
  head_size: 128
  vocab_size: 50304