forked from LambdaLabsML/Open-Sora
-
Notifications
You must be signed in to change notification settings - Fork 1
/
nvtop_all.py
59 lines (47 loc) · 2.17 KB
/
nvtop_all.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import concurrent.futures
import subprocess
import pandas as pd
import sys
def _ssh_output_lines(node_name, remote_command, timeout=None):
    """Run ``remote_command`` on ``node_name`` via ssh; return its non-blank stdout lines."""
    completed = subprocess.run(
        ["ssh", node_name, remote_command],
        capture_output=True,
        text=True,
        check=True,
        timeout=timeout,
    )
    return [line.strip() for line in completed.stdout.splitlines() if line.strip()]


def get_gpu_info(node_name, timeout=None):
    """Query one node over SSH for its GPU process count and mean power draw.

    Runs ``nvidia-smi`` on ``node_name`` twice through ``ssh``: once to list
    the PIDs of compute processes and once to read each GPU's power draw.

    Args:
        node_name: Host name handed to ``ssh``.
        timeout: Optional limit in seconds applied to each ssh command.
            ``None`` (the default) waits indefinitely.

    Returns:
        ``(node_name, num_processes, mean_power)`` on success, where
        ``num_processes`` is the number of PID rows reported and
        ``mean_power`` is the average of the per-GPU readings (``0.0`` when
        no reading is returned). If a command exits non-zero, times out,
        cannot be launched, or yields a non-numeric power reading, returns
        ``(node_name, "Failed", "Failed")`` instead of raising.
    """
    try:
        # Count the PID rows locally rather than piping into `wc -l` on the
        # remote side: with a pipeline the exit status is wc's, so a failing
        # nvidia-smi would be reported as "0 processes" instead of a failure.
        pid_rows = _ssh_output_lines(
            node_name,
            "nvidia-smi --query-compute-apps=pid --format=csv,noheader",
            timeout,
        )
        power_rows = _ssh_output_lines(
            node_name,
            "nvidia-smi --query-gpu=power.draw --format=csv,noheader,nounits",
            timeout,
        )
        # float() raises ValueError on placeholder readings such as "[N/A]".
        power_draws = [float(reading) for reading in power_rows]
    except (subprocess.SubprocessError, OSError, ValueError):
        # SubprocessError covers CalledProcessError and TimeoutExpired;
        # OSError covers an ssh binary that cannot be executed.
        return node_name, "Failed", "Failed"

    mean_power = sum(power_draws) / len(power_draws) if power_draws else 0.0
    return node_name, len(pid_rows), mean_power
def main(hostfile):
    """Print a per-node table of GPU process counts and mean power draw.

    Args:
        hostfile: Path to a text file with one node name per line; blank
            lines are ignored.

    Each node is queried concurrently with ``get_gpu_info`` and the results
    are printed as a pandas DataFrame, one row per node in hostfile order.
    The pandas display options are changed globally so the full table is
    shown without truncation.
    """
    with open(hostfile, 'r') as file:
        nodes = [line.strip() for line in file if line.strip()]

    with concurrent.futures.ThreadPoolExecutor() as executor:
        futures = [executor.submit(get_gpu_info, node) for node in nodes]
        # Read the futures in submission order (not completion order) so the
        # rows follow the hostfile and the output is stable between runs.
        results = [future.result() for future in futures]

    df = pd.DataFrame(results, columns=["Node", "GPU Processes", "Mean Power Consumption (W)"])

    # Set pandas options to display the entire DataFrame
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', None)
    pd.set_option('display.max_colwidth', None)

    print(df)
if __name__ == "__main__":
    # Exactly one command-line argument is accepted: the hostfile path.
    if len(sys.argv) == 2:
        hostfile = sys.argv[1]
        main(hostfile)
    else:
        print("Usage: python script.py <hostfile>")
        sys.exit(1)