from llm_counts.utils.config import *
from llm_counts.visual_analyzer import LLMAnalyzerVisual
# floor() is used in print_all_llm_analyzer(); imported explicitly in case the
# star import above does not re-export it.
from math import floor
import time

####################################################################################################################
def runTime(func):
    """Decorator: print the wall-clock run time of the wrapped function."""
    def wrapper(*args, **kwargs):
        start = time.time()
        res = func(*args, **kwargs)
        end = time.time()
        print("=" * 80)
        print("function name: %s" % func.__name__)
        print("run time: %.4fs" % (end - start))
        print("=" * 80)
        return res
    return wrapper
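
# Illustrative usage sketch of the decorator above (the function below is
# hypothetical and not part of this module):
#
#   @runTime
#   def dummy_workload(n):
#       return sum(range(n))
#
#   dummy_workload(10_000_000)  # prints the function name and its elapsed time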
####################################################################################################################
def print_list(items):
    """Print a one-dimensional list, one item per line.

    :param items: list of printable items
    :return: None
    """
    for item in items:
        print(item)
####################################################################################################################
def llm_profile(model_name: str,
                gpu_name: str = "a100-sxm-40gb",
                bytes_per_param: int = BYTES_FP16,
                bs: int = 20,
                seq_len: int = 1024,
                generate_len: int = 1024,
                ds_zero: int = 0,
                dp_size: int = 1,
                tp_size: int = 8,
                pp_size: int = 1,
                sp_size: int = 1,
                use_kv_cache: bool = True,
                act_dtype_bytes: int = BYTES_FP16,
                kv_cache_bytes: int = BYTES_FP16,
                flops_efficiency: float = FLOPS_EFFICIENCY,
                hbm_memory_efficiency: float = HBM_MEMORY_EFFICIENCY,
                intra_node_memory_efficiency: float = INTRA_NODE_MEMORY_EFFICIENCY,
                inter_node_memory_efficiency: float = INTER_NODE_MEMORY_EFFICIENCY,
                mode: str = "inference",
                print_flag: bool = True,
                visual_flag: bool = True,
                ):
"""Returns dict of the total floating-point operations, MACs, parameters and latency of a llm.
Args:
model_name (str, optional): model name to query the pre-defined `model_configs.json`. Defaults to "llama-13b".
gpu_name (str, optional): gpu name to query the pre-defined `model_configs.json`. Defaults to "v100-sxm2-32gb".
bs (int, optional): _description_. Defaults to 1.
seq_len (int, optional): batch size per GPU.. Defaults to 522.
generate_len (int, optional): The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt. Defaults to 1526.
ds_zero (int, optional): which DeepSpeed ZeRO stage to use.. Defaults to 0.
dp_size (int, optional): data parallelism size. Defaults to 1.
tp_size (int, optional): tensor parallelism size. Defaults to 1.
pp_size (int, optional): pipeline parallelism size. Defaults to 1.
sp_size (int, optional): sequence parallelism size. Defaults to 1.
use_kv_cache (bool, optional): Whether or not the model should use the past last key/values attentions (if applicable to the model) to
speed up decoding. Defaults to True.
layernorm_dtype_bytes (int, optional): number of bytes in the data type for the layernorm activations.. Defaults to BYTES_FP16.
kv_cache_bytes (int, optional): number of bytes in the data type for the kv_cache. Defaults to None.
flops_efficiency (float, optional): flops efficiency, ranging from 0 to 1. Defaults to None.
hbm_memory_efficiency (float, optional): GPU HBM memory efficiency, ranging from 0 to 1. Defaults to HBM_MEMORY_EFFICIENCY.
intra_node_memory_efficiency (_type_, optional): intra-node memory efficiency, ranging from 0 to 1.. Defaults to INTRA_NODE_MEMORY_EFFICIENCY.
inter_node_memory_efficiency (_type_, optional): inter-node memory efficiency, ranging from 0 to 1.. Defaults to INTER_NODE_MEMORY_EFFICIENCY.
mode (str, optional): model training or inference. Defaults to "inference".
Returns:
dict: a summary dictionary of the inference analysis
"""
    model_config, gpu_config = get_model_and_gpu_config_by_name(model_name, gpu_name)

    parallelism_config = ParallelismConfig(tp_size=tp_size, pp_size=pp_size,
                                           dp_size=dp_size, sp_size=sp_size)

    inference_config = InferenceConfig(bs=bs, seq_len=seq_len,
                                       generate_len=generate_len, use_kv_cache=use_kv_cache,
                                       bytes_per_param=bytes_per_param,
                                       act_dtype_bytes=act_dtype_bytes,
                                       kv_cache_bytes=kv_cache_bytes)

    gpu_efficiency_config = GPUEfficiencyConfig(flops_efficiency=flops_efficiency,
                                                hbm_memory_efficiency=hbm_memory_efficiency,
                                                intra_node_memory_efficiency=intra_node_memory_efficiency,
                                                inter_node_memory_efficiency=inter_node_memory_efficiency)

    llm_configs = LLMConfigs(model_config=model_config, gpu_config=gpu_config,
                             parallelism_config=parallelism_config,
                             inference_config=inference_config,
                             gpu_efficiency_config=gpu_efficiency_config)

    profiler = LLMAnalyzerVisual(llm_configs)
    max_batch_total_tokens = profiler.infer_profile(bs=bs, seq_len=seq_len,
                                                    generate_len=generate_len,
                                                    use_kv_cache=use_kv_cache,
                                                    act_dtype_bytes=act_dtype_bytes,
                                                    flops_efficiency=flops_efficiency,
                                                    hbm_memory_efficiency=hbm_memory_efficiency,
                                                    print_flag=print_flag)
    return max_batch_total_tokens
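
# Illustrative single-call sketch of llm_profile (argument values are examples only):
#
#   max_batch_total_tokens = llm_profile(model_name="llama-13b", gpu_name="a100-sxm-40gb",
#                                        tp_size=2, bs=8, seq_len=1024, generate_len=512,
#                                        print_flag=False)
#   print(max_batch_total_tokens)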
####################################################################################################################
def print_all_llm_analyzer():
    model_name_list = ["llama-7b", "llama-13b", "llama-65b", "llama2-70b", "internlm-20b"]
    gpu_name_list = ["a30-sxm-24gb", "a40-pcie-48gb", "a100-sxm-40gb", "a100-sxm-80gb",
                     "910b-64gb", "v100-sxm-32gb", "t4-pcie-15gb"]
    tp_nums_list = [1, 2, 4, 8]
    tgi_service_dict_list = []
    seq_len, generate_len = 1024, 1024

    for model_name in model_name_list:
        if model_name in ["llama2-70b", "internlm-20b"]:
            seq_len, generate_len = 1024, 1024

        for gpu_name in gpu_name_list:
            for tp_size in tp_nums_list:
                try:
                    max_batch_total_tokens = int(llm_profile(model_name=model_name, gpu_name=gpu_name,
                                                             tp_size=tp_size, seq_len=seq_len,
                                                             generate_len=generate_len, print_flag=False))
                except Exception as e:
                    print(f"model_name: {model_name}, gpu_name: {gpu_name}, tp_size: {tp_size}, error: {e}")
                    continue

                tgi_service_dict = {
                    "model_name": model_name,
                    "gpu_name": gpu_name,
                    "tp_size": tp_size,
                    "max_batch_total_tokens": max_batch_total_tokens,
                    "max_bs": floor(max_batch_total_tokens / (seq_len + generate_len)),
                }
                tgi_service_dict_list.append(tgi_service_dict)

    print("================================== TGI+LightLLM service max_batch_total_tokens params list =============================")
    print_list(tgi_service_dict_list)
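
# Illustrative sketch: sweep every pre-defined model / GPU / TP-size combination
# above and print one summary dict per configuration.
#
#   print_all_llm_analyzer()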

if __name__ == "__main__":
    # llm_profile(model_name="llama-7b", tp_size=1, print_flag=True, visual_flag=True)
    llm_profile(model_name="llama2-70b", tp_size=8, print_flag=True, visual_flag=True)