Skip to content

Commit

Permalink
Fix index lookup.
Browse files Browse the repository at this point in the history
  • Loading branch information
bethune-bryant committed Aug 2, 2024
1 parent cc2d0f0 commit c2ea30e
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 9 deletions.
6 changes: 3 additions & 3 deletions gpustat/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -481,7 +481,7 @@ def _decode(b: Union[str, bytes]) -> str:
assert isinstance(b, str)
return b

def get_gpu_info(handle: NVMLHandle, index: int = None) -> NvidiaGPUInfo:
def get_gpu_info(handle: NVMLHandle) -> NvidiaGPUInfo:
"""Get one GPU information specified by nvml handle"""

def safepcall(fn: Callable[[], Any], error_value: Any):
Expand Down Expand Up @@ -538,7 +538,7 @@ def _wrapped(*args, **kwargs):
return _wrapped

gpu_info = NvidiaGPUInfo()
gpu_info['index'] = N.nvmlDeviceGetIndex(handle) if index is None else index
gpu_info['index'] = N.nvmlDeviceGetIndex(handle)

gpu_info['name'] = _decode(N.nvmlDeviceGetName(handle))
gpu_info['uuid'] = _decode(N.nvmlDeviceGetUUID(handle))
Expand Down Expand Up @@ -638,7 +638,7 @@ def _wrapped(*args, **kwargs):
for index in gpus_to_query:
try:
handle: NVMLHandle = N.nvmlDeviceGetHandleByIndex(index)
gpu_info = get_gpu_info(handle, index)
gpu_info = get_gpu_info(handle)
gpu_stat = GPUStat(gpu_info)
except N.NVMLError_Unknown as e:
gpu_stat = InvalidGPU(index, "((Unknown Error))", e)
Expand Down
21 changes: 16 additions & 5 deletions gpustat/rocml.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Imports pyrsmi and wraps it in a pynvml compatible interface."""
"""Imports amdsmi and wraps it in a pynvml compatible interface."""

# pylint: disable=protected-access

Expand All @@ -16,8 +16,13 @@

NVML_TEMPERATURE_GPU = 1

class NVMLError(Exception):
    """Generic error type exposed by this pynvml-compatible wrapper.

    The text is stored on ``message`` and is also handed to ``Exception``.
    """

    def __init__(self, message="ROCM Error"):
        super().__init__(message)
        self.message = message

class NVMLError_Unknown(Exception):
def __init__(self, message="An unknown ROCMLError has occurred"):
def __init__(self, message="An unknown ROCM Error has occurred"):
self.message = message
super().__init__(self.message)

Expand Down Expand Up @@ -46,6 +51,9 @@ def nvmlDeviceGetHandleByIndex(dev):
return amdsmi_get_processor_handles()[dev]

def nvmlDeviceGetIndex(dev):
    """Return the position of ``dev`` among the amdsmi processor handles.

    ``dev`` is matched by comparing the value ``amdsmi_get_gpu_device_bdf``
    reports for it with the value reported for each handle returned by
    ``amdsmi_get_processor_handles()``.

    Returns the index of the first matching handle, or -1 when no handle
    matches.
    """
    handles = amdsmi_get_processor_handles()
    # Loop-invariant: query the BDF of the requested handle once instead of
    # once per candidate handle.
    target_bdf = amdsmi_get_gpu_device_bdf(dev)
    for i, handle in enumerate(handles):
        if target_bdf == amdsmi_get_gpu_device_bdf(handle):
            return i
    return -1

def nvmlDeviceGetName(dev):
Expand Down Expand Up @@ -107,8 +115,11 @@ def nvmlDeviceGetEnforcedPowerLimit(dev):
ComputeProcess = namedtuple('ComputeProcess', ['pid', 'usedGpuMemory'])

def nvmlDeviceGetComputeRunningProcesses(dev):
results = amdsmi_get_gpu_process_list(dev)
return [ComputeProcess(pid=x.pid, usedGpuMemory=x.mem) for x in results]
try:
results = amdsmi_get_gpu_process_list(dev)
return [ComputeProcess(pid=x.pid, usedGpuMemory=x.mem) for x in results]
except Exception:
return []

def nvmlDeviceGetGraphicsRunningProcesses(dev):
    """Stub: always returns ``None``; ``dev`` is not used."""
    return None
Expand All @@ -124,7 +135,7 @@ def nvmlDeviceGetClkFreqMax(dev):
result = amdsmi_get_clock_info(dev, AmdSmiClkType.SYS)
return result["max_clk"]

# Upon importing this module, let rocml be initialized and remain active
# Upon importing this module, let amdsmi be initialized and remain active
# throughout the lifespan of the python process (until gpustat exits).
_initialized: bool
_init_error = None
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def run(self):

install_requires = [
'nvidia-ml-py>=12.535.108', # see #107, #143, #161
'pyrsmi', #137
'amdsmi', #137
'psutil>=5.6.0', # GH-1447
'blessed>=1.17.1', # GH-126
'typing_extensions',
Expand Down

0 comments on commit c2ea30e

Please sign in to comment.