Skip to content

Commit

Permalink
Issue-272: Added tests for K8S usecase, when MIG enabled in a mixed mode
Browse files Browse the repository at this point in the history
Signed-off-by: Vadym Fedorov <[email protected]>
  • Loading branch information
nvvfedorov committed Mar 14, 2024
1 parent 7ec7bb7 commit 93d896d
Show file tree
Hide file tree
Showing 5 changed files with 90 additions and 17 deletions.
2 changes: 1 addition & 1 deletion pkg/dcgmexporter/clock_events_collector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ func TestClockEventsCollector_Gather(t *testing.T) {
gpuIDsAsString[i] = fmt.Sprint(g)
}

podresourcesapi.RegisterPodResourcesListerServer(server, NewPodResourcesMockServer(gpuIDsAsString))
podresourcesapi.RegisterPodResourcesListerServer(server, NewPodResourcesMockServer(nvidiaResourceName, gpuIDsAsString))
// Tell that the app is running on K8S
config.Kubernetes = true

Expand Down
24 changes: 15 additions & 9 deletions pkg/dcgmexporter/kubernetes.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ func (p *PodMapper) Name() string {
func (p *PodMapper) Process(metrics MetricsByCounter, sysInfo SystemInfo) error {
_, err := os.Stat(socketPath)
if os.IsNotExist(err) {
logrus.Infof("No Kubelet socket, ignoring")
logrus.Info("No Kubelet socket, ignoring")
return nil
}

Expand All @@ -77,6 +77,8 @@ func (p *PodMapper) Process(metrics MetricsByCounter, sysInfo SystemInfo) error

deviceToPod := p.toDeviceToPod(pods, sysInfo)

logrus.Debugf("Device to pod mapping: %+v", deviceToPod)

// Note: for loop are copies the value, if we want to change the value
// and not the copy, we need to use the indexes
for counter := range metrics {
Expand All @@ -85,14 +87,18 @@ func (p *PodMapper) Process(metrics MetricsByCounter, sysInfo SystemInfo) error
if err != nil {
return err
}
if !p.Config.UseOldNamespace {
metrics[counter][j].Attributes[podAttribute] = deviceToPod[deviceID].Name
metrics[counter][j].Attributes[namespaceAttribute] = deviceToPod[deviceID].Namespace
metrics[counter][j].Attributes[containerAttribute] = deviceToPod[deviceID].Container
} else {
metrics[counter][j].Attributes[oldPodAttribute] = deviceToPod[deviceID].Name
metrics[counter][j].Attributes[oldNamespaceAttribute] = deviceToPod[deviceID].Namespace
metrics[counter][j].Attributes[oldContainerAttribute] = deviceToPod[deviceID].Container

podInfo, exists := deviceToPod[deviceID]
if exists {
if !p.Config.UseOldNamespace {
metrics[counter][j].Attributes[podAttribute] = podInfo.Name
metrics[counter][j].Attributes[namespaceAttribute] = podInfo.Namespace
metrics[counter][j].Attributes[containerAttribute] = podInfo.Container
} else {
metrics[counter][j].Attributes[oldPodAttribute] = podInfo.Name
metrics[counter][j].Attributes[oldNamespaceAttribute] = podInfo.Namespace
metrics[counter][j].Attributes[oldContainerAttribute] = podInfo.Container
}
}
}
}
Expand Down
32 changes: 26 additions & 6 deletions pkg/dcgmexporter/kubernetes_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ func TestProcessPodMapper(t *testing.T) {
socketPath = tmpDir + "/kubelet.sock"
server := grpc.NewServer()
gpus := GetGPUUUIDs(arbirtaryMetric)
podresourcesapi.RegisterPodResourcesListerServer(server, NewPodResourcesMockServer(gpus))
podresourcesapi.RegisterPodResourcesListerServer(server, NewPodResourcesMockServer(nvidiaResourceName, gpus))

cleanup = StartMockServer(t, server, socketPath)
defer cleanup()
Expand Down Expand Up @@ -125,12 +125,14 @@ func CreateTmpDir(t *testing.T) (string, func()) {

// Contains a list of UUIDs
type PodResourcesMockServer struct {
gpus []string
resourceName string
gpus []string
}

func NewPodResourcesMockServer(used []string) *PodResourcesMockServer {
func NewPodResourcesMockServer(resourceName string, gpus []string) *PodResourcesMockServer {
return &PodResourcesMockServer{
gpus: used,
resourceName: resourceName,
gpus: gpus,
}
}

Expand All @@ -148,7 +150,7 @@ func (s *PodResourcesMockServer) List(
Name: "default",
Devices: []*podresourcesapi.ContainerDevices{
{
ResourceName: nvidiaResourceName,
ResourceName: s.resourceName,
DeviceIds: []string{gpu},
},
},
Expand All @@ -169,6 +171,7 @@ func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) {
type TestCase struct {
KubernetesGPUIDType KubernetesGPUIDType
GPUInstanceID uint
ResourceName string
MetricGPUID string
MetricGPUDevice string
MetricMigProfile string
Expand All @@ -178,43 +181,59 @@ func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) {
testCases := []TestCase{
{
KubernetesGPUIDType: GPUUID,
ResourceName: nvidiaResourceName,
MetricGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5",
PODGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5",
},
{
KubernetesGPUIDType: GPUUID,
ResourceName: nvidiaResourceName,
MetricGPUID: "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5",
PODGPUID: "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5",
MetricMigProfile: "",
},
{
KubernetesGPUIDType: GPUUID,
ResourceName: nvidiaResourceName,
GPUInstanceID: 3,
MetricGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5",
MetricMigProfile: "",
PODGPUID: "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5",
},
{
KubernetesGPUIDType: DeviceName,
ResourceName: nvidiaResourceName,
GPUInstanceID: 3,
MetricMigProfile: "mig",
PODGPUID: "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5",
},
{
KubernetesGPUIDType: DeviceName,
ResourceName: nvidiaResourceName,
MetricMigProfile: "mig",
PODGPUID: "nvidia0/gi0",
},
{
KubernetesGPUIDType: DeviceName,
ResourceName: nvidiaResourceName,
MetricGPUDevice: "0",
PODGPUID: "0/vgpu",
},
{
KubernetesGPUIDType: GPUUID,
ResourceName: nvidiaResourceName,
MetricGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5",
PODGPUID: "b8ea3855-276c-c9cb-b366-c6fa655957c5::",
},
{
KubernetesGPUIDType: GPUUID,
ResourceName: "nvidia.com/mig-1g.10gb",
MetricMigProfile: "1g.10gb",
MetricGPUID: "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5",
PODGPUID: "MIG-b8ea3855-276c-c9cb-b366-c6fa655957c5",
MetricGPUDevice: "0",
GPUInstanceID: 3,
},
}

for _, tc := range testCases {
Expand All @@ -235,7 +254,7 @@ func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) {
defer cleanup()

gpus := []string{tc.PODGPUID}
podresourcesapi.RegisterPodResourcesListerServer(server, NewPodResourcesMockServer(gpus))
podresourcesapi.RegisterPodResourcesListerServer(server, NewPodResourcesMockServer(tc.ResourceName, gpus))

cleanup = StartMockServer(t, server, socketPath)
defer cleanup()
Expand All @@ -261,6 +280,7 @@ func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) {
FieldName: "DCGM_FI_DEV_POWER_USAGE",
PromType: "gauge",
}

metrics[counter] = append(metrics[counter], Metric{
GPU: "0",
GPUUUID: tc.MetricGPUID,
Expand Down
2 changes: 1 addition & 1 deletion pkg/dcgmexporter/pipeline.go
Original file line number Diff line number Diff line change
Expand Up @@ -361,7 +361,7 @@ var cpuCoreMetricsFormat = `
{{- end }}
{{ end }}`

// Template is passed here so that it isn't recompiled at each iteration
// FormatMetrics Template is passed here so that it isn't recompiled at each iteration
func FormatMetrics(t *template.Template, groupedMetrics MetricsByCounter) (string, error) {
// Format metrics
var res bytes.Buffer
Expand Down
47 changes: 47 additions & 0 deletions pkg/dcgmexporter/utils_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package dcgmexporter

import (
"sync"
"testing"
"time"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

func TestWaitWithTimeout(t *testing.T) {
t.Run("Returns error by timeout", func(t *testing.T) {
wg := &sync.WaitGroup{}
defer wg.Done()
wg.Add(1)
timeout := 500 * time.Millisecond
err := WaitWithTimeout(wg, timeout)
require.Error(t, err)
assert.ErrorContains(t, err, "timeout waiting for WaitGroup")
})

t.Run("Returns no error", func(t *testing.T) {
wg := &sync.WaitGroup{}
wg.Add(1)
timeout := 500 * time.Millisecond
wg.Done()
err := WaitWithTimeout(wg, timeout)
require.NoError(t, err)
})
}

0 comments on commit 93d896d

Please sign in to comment.