Skip to content

Commit

Permalink
Support nvidia gpu metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
r4victor committed Oct 14, 2024
1 parent 6ea7d40 commit 8b3792a
Show file tree
Hide file tree
Showing 10 changed files with 127 additions and 8 deletions.
46 changes: 46 additions & 0 deletions runner/internal/metrics/metrics.go
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
package metrics

import (
"bytes"
"context"
"fmt"
"os"
"os/exec"
"strconv"
"strings"
"time"

"github.com/dstackai/dstack/runner/internal/log"
"github.com/dstackai/dstack/runner/internal/schemas"
)

Expand Down Expand Up @@ -39,11 +43,16 @@ func (s *MetricsCollector) GetSystemMetrics() (*schemas.SystemMetrics, error) {
return nil, err
}
memoryWorkingSet := memoryUsage - memoryCache
gpuMetrics, err := s.GetGPUMetrics()
if err != nil {
log.Debug(context.TODO(), "Failed to get gpu metrics", "err", err)
}
return &schemas.SystemMetrics{
Timestamp: timestamp.UnixMicro(),
CpuUsage: cpuUsage,
MemoryUsage: memoryUsage,
MemoryWorkingSet: memoryWorkingSet,
GPUMetrics: gpuMetrics,
}, nil
}

Expand Down Expand Up @@ -134,6 +143,43 @@ func (s *MetricsCollector) GetMemoryCacheBytes() (uint64, error) {
return 0, fmt.Errorf("inactive_file not found in cpu.stat")
}

// GetGPUMetrics queries nvidia-smi for GPU memory usage and utilization.
// On any failure (nvidia-smi missing, not runnable, or unparsable output)
// it returns a zero-valued GPUMetrics with GPUDetected=false alongside a
// non-nil error, so callers can treat the error as "no GPU" (see
// GetSystemMetrics, which only logs it at debug level).
//
// NOTE(review): only the first GPU line is reported; on multi-GPU hosts the
// remaining devices are ignored — confirm this is intended.
func (s *MetricsCollector) GetGPUMetrics() (schemas.GPUMetrics, error) {
	noMetrics := schemas.GPUMetrics{
		GPUDetected:    false,
		GPUMemoryUsage: 0,
		GPUUtil:        0,
	}
	cmd := exec.Command("nvidia-smi", "--query-gpu=memory.used,utilization.gpu", "--format=csv,noheader,nounits")
	var out bytes.Buffer
	cmd.Stdout = &out
	if err := cmd.Run(); err != nil {
		// %w so callers can unwrap e.g. exec.ErrNotFound when nvidia-smi is absent.
		return noMetrics, fmt.Errorf("executing nvidia-smi: %w", err)
	}
	metrics, err := parseNvidiaSMIMetrics(out.String())
	if err != nil {
		return noMetrics, err
	}
	return metrics, nil
}

// parseNvidiaSMIMetrics parses the output of
// `nvidia-smi --query-gpu=memory.used,utilization.gpu --format=csv,noheader,nounits`
// and returns metrics for the first well-formed line. memory.used is reported
// by nvidia-smi in MiB and is converted to bytes here.
func parseNvidiaSMIMetrics(output string) (schemas.GPUMetrics, error) {
	for _, line := range strings.Split(strings.TrimSpace(output), "\n") {
		parts := strings.Split(line, ", ")
		if len(parts) != 2 {
			// Skip malformed lines (e.g. driver warnings) rather than failing outright.
			continue
		}
		memUsed, err := strconv.ParseUint(strings.TrimSpace(parts[0]), 10, 64)
		if err != nil {
			return schemas.GPUMetrics{}, fmt.Errorf("parsing memory used: %w", err)
		}
		// "nounits" should suppress the % sign, but strip it defensively.
		utilization, err := strconv.ParseUint(strings.TrimSpace(strings.TrimSuffix(parts[1], "%")), 10, 64)
		if err != nil {
			return schemas.GPUMetrics{}, fmt.Errorf("parsing GPU utilization: %w", err)
		}
		return schemas.GPUMetrics{
			GPUDetected:    true,
			GPUMemoryUsage: memUsed * 1024 * 1024, // MiB -> bytes
			GPUUtil:        utilization,
		}, nil
	}
	return schemas.GPUMetrics{}, fmt.Errorf("could not parse gpu metrics")
}

func getCgroupVersion() (int, error) {
data, err := os.ReadFile("/proc/self/mountinfo")
if err != nil {
Expand Down
15 changes: 11 additions & 4 deletions runner/internal/schemas/schemas.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,11 +89,18 @@ type HealthcheckResponse struct {
Version string `json:"version"`
}

// GPUMetrics holds GPU usage reported by the runner. When GPUDetected is
// false, the usage fields are zero.
type GPUMetrics struct {
GPUDetected bool `json:"gpu_detected"` // whether nvidia-smi reported a GPU
GPUMemoryUsage uint64 `json:"gpu_memory_usage_bytes"` // GPU memory used, in bytes
GPUUtil uint64 `json:"gpu_util_percent"` // GPU utilization percentage
}

type SystemMetrics struct {
Timestamp int64 `json:"timestamp_micro"`
CpuUsage uint64 `json:"cpu_usage_micro"`
MemoryUsage uint64 `json:"memory_usage_bytes"`
MemoryWorkingSet uint64 `json:"memory_working_set_bytes"`
Timestamp int64 `json:"timestamp_micro"`
CpuUsage uint64 `json:"cpu_usage_micro"`
MemoryUsage uint64 `json:"memory_usage_bytes"`
MemoryWorkingSet uint64 `json:"memory_working_set_bytes"`
GPUMetrics GPUMetrics `json:"gpu"`
}

func (c *RepoCredentials) GetProtocol() string {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,9 @@ async def _collect_job_metrics(job_model: JobModel) -> Optional[JobMetricsPoint]
cpu_usage_micro=res.cpu_usage_micro,
memory_usage_bytes=res.memory_usage_bytes,
memory_working_set_bytes=res.memory_working_set_bytes,
gpu_detected=res.gpu.gpu_detected,
gpu_memory_usage_bytes=res.gpu.gpu_memory_usage_bytes,
gpu_util_percent=res.gpu.gpu_util_percent,
)


Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
"""Add JobMetricsPoint
Revision ID: d10a5325d556
Revision ID: 7f0c5281d374
Revises: a7b46c073fa1
Create Date: 2024-10-11 14:22:14.138208
Create Date: 2024-10-14 10:39:04.415000
"""

Expand All @@ -11,7 +11,7 @@
from alembic import op

# revision identifiers, used by Alembic.
revision = "d10a5325d556"
revision = "7f0c5281d374"
down_revision = "a7b46c073fa1"
branch_labels = None
depends_on = None
Expand All @@ -27,6 +27,9 @@ def upgrade() -> None:
sa.Column("cpu_usage_micro", sa.BigInteger(), nullable=False),
sa.Column("memory_usage_bytes", sa.BigInteger(), nullable=False),
sa.Column("memory_working_set_bytes", sa.BigInteger(), nullable=False),
sa.Column("gpu_detected", sa.Boolean(), nullable=False),
sa.Column("gpu_memory_usage_bytes", sa.BigInteger(), nullable=False),
sa.Column("gpu_util_percent", sa.BigInteger(), nullable=False),
sa.ForeignKeyConstraint(
["job_id"], ["jobs.id"], name=op.f("fk_job_metrics_points_job_id_jobs")
),
Expand Down
3 changes: 3 additions & 0 deletions src/dstack/_internal/server/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -603,3 +603,6 @@ class JobMetricsPoint(BaseModel):
cpu_usage_micro: Mapped[int] = mapped_column(BigInteger)
memory_usage_bytes: Mapped[int] = mapped_column(BigInteger)
memory_working_set_bytes: Mapped[int] = mapped_column(BigInteger)
gpu_detected: Mapped[bool] = mapped_column(Boolean)
gpu_memory_usage_bytes: Mapped[int] = mapped_column(BigInteger)
gpu_util_percent: Mapped[int] = mapped_column(BigInteger)
7 changes: 7 additions & 0 deletions src/dstack/_internal/server/schemas/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,11 +72,18 @@ class HealthcheckResponse(CoreModel):
version: str


class GPUMetrics(CoreModel):
    """GPU usage reported by the runner.

    Mirrors the runner's ``schemas.GPUMetrics`` Go struct. When
    ``gpu_detected`` is ``False``, the usage fields are zero.
    """

    gpu_detected: bool
    gpu_memory_usage_bytes: int
    gpu_util_percent: int


class MetricsResponse(CoreModel):
    """System metrics returned by the runner's metrics endpoint.

    Mirrors the runner's ``schemas.SystemMetrics`` Go struct.
    """

    timestamp_micro: int
    cpu_usage_micro: int
    memory_usage_bytes: int
    memory_working_set_bytes: int
    gpu: GPUMetrics  # GPU usage; zero-valued with gpu_detected=False when no GPU


class ShimVolumeInfo(CoreModel):
Expand Down
21 changes: 21 additions & 0 deletions src/dstack/_internal/server/services/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,27 @@ def _calculate_job_metrics(last_point: JobMetricsPoint, prev_point: JobMetricsPo
values=[last_point.memory_working_set_bytes],
)
)
metrics.append(
Metric(
name="gpu_detected",
timestamps=[timestamp],
values=[last_point.gpu_detected],
)
)
metrics.append(
Metric(
name="gpu_memory_usage_bytes",
timestamps=[timestamp],
values=[last_point.gpu_memory_usage_bytes],
)
)
metrics.append(
Metric(
name="gpu_util_percent",
timestamps=[timestamp],
values=[last_point.gpu_util_percent],
)
)
return JobMetrics(metrics=metrics)


Expand Down
6 changes: 6 additions & 0 deletions src/dstack/_internal/server/testing/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -626,6 +626,9 @@ async def create_job_metrics_point(
cpu_usage_micro: int = 1_000_000,
memory_usage_bytes: int = 1024,
memory_working_set_bytes: int = 1024,
gpu_detected: bool = False,
gpu_memory_usage_bytes: int = 0,
gpu_util_percent: int = 0,
) -> JobMetricsPoint:
timestamp_micro = int(timestamp.timestamp() * 1_000_000)
jmp = JobMetricsPoint(
Expand All @@ -634,6 +637,9 @@ async def create_job_metrics_point(
cpu_usage_micro=cpu_usage_micro,
memory_usage_bytes=memory_usage_bytes,
memory_working_set_bytes=memory_working_set_bytes,
gpu_detected=gpu_detected,
gpu_memory_usage_bytes=gpu_memory_usage_bytes,
gpu_util_percent=gpu_util_percent,
)
session.add(jmp)
await session.commit()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
delete_metrics,
)
from dstack._internal.server.models import JobMetricsPoint
from dstack._internal.server.schemas.runner import MetricsResponse
from dstack._internal.server.schemas.runner import GPUMetrics, MetricsResponse
from dstack._internal.server.services.projects import add_project_member
from dstack._internal.server.testing.common import (
create_job,
Expand Down Expand Up @@ -63,6 +63,11 @@ async def test_collects_metrics(self, test_db, session: AsyncSession):
cpu_usage_micro=2,
memory_usage_bytes=3,
memory_working_set_bytes=4,
gpu=GPUMetrics(
gpu_detected=False,
gpu_memory_usage_bytes=0,
gpu_util_percent=0,
),
)
await collect_metrics()
SSHTunnelMock.assert_called_once()
Expand Down
18 changes: 18 additions & 0 deletions src/tests/_internal/server/routers/test_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,9 @@ async def test_returns_metrics(self, test_db, session: AsyncSession, client: Asy
cpu_usage_micro=10 * 1_000_000,
memory_usage_bytes=1024,
memory_working_set_bytes=512,
gpu_detected=False,
gpu_memory_usage_bytes=0,
gpu_util_percent=0,
)
response = await client.get(
f"/api/project/{project.name}/metrics/job/{run.run_name}",
Expand All @@ -89,5 +92,20 @@ async def test_returns_metrics(self, test_db, session: AsyncSession, client: Asy
"timestamps": ["2023-01-02T03:04:25+00:00"],
"values": [512],
},
{
"name": "gpu_detected",
"timestamps": ["2023-01-02T03:04:25+00:00"],
"values": [False],
},
{
"name": "gpu_memory_usage_bytes",
"timestamps": ["2023-01-02T03:04:25+00:00"],
"values": [0],
},
{
"name": "gpu_util_percent",
"timestamps": ["2023-01-02T03:04:25+00:00"],
"values": [0],
},
]
}

0 comments on commit 8b3792a

Please sign in to comment.