Skip to content

Commit

Permalink
Merge pull request #199 from zclyne/yifan/kubelet-socket-dir
Browse files: browse the repository at this point in the history
Make the kubelet pod-resources socket path configurable
  • Loading branch information
nvvfedorov authored Mar 19, 2024
2 parents 9cfb2a2 + 05eff12 commit 049c597
Show file tree
Hide file tree
Showing 6 changed files with 21 additions and 9 deletions.
3 changes: 2 additions & 1 deletion cmd/dcgm-exporter/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,9 @@ import (

"github.com/sirupsen/logrus"

"github.com/NVIDIA/dcgm-exporter/pkg/cmd"
_ "go.uber.org/automaxprocs"

"github.com/NVIDIA/dcgm-exporter/pkg/cmd"
)

var (
Expand Down
8 changes: 8 additions & 0 deletions pkg/cmd/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ const (
CLIClockEventsCountWindowSize = "clock-events-count-window-size"
CLIEnableDCGMLog = "enable-dcgm-log"
CLIDCGMLogLevel = "dcgm-log-level"
CLIPodResourcesKubeletSocket = "pod-resources-kubelet-socket"
)

func NewApp(buildVersion ...string) *cli.App {
Expand Down Expand Up @@ -223,6 +224,12 @@ func NewApp(buildVersion ...string) *cli.App {
Usage: "Specify the DCGM log verbosity level. This parameter is effective only when the '--enable-dcgm-log' option is set to 'true'. Possible values: NONE, FATAL, ERROR, WARN, INFO, DEBUG and VERB",
EnvVars: []string{"DCGM_EXPORTER_DCGM_LOG_LEVEL"},
},
&cli.StringFlag{
Name: CLIPodResourcesKubeletSocket,
Value: "/var/lib/kubelet/pod-resources/kubelet.sock",
Usage: "Path to the kubelet pod-resources socket file",
EnvVars: []string{"DCGM_POD_RESOURCES_KUBELET_SOCKET"},
},
}

if runtime.GOOS == "linux" {
Expand Down Expand Up @@ -586,5 +593,6 @@ func contextToConfig(c *cli.Context) (*dcgmexporter.Config, error) {
ClockEventsCountWindowSize: c.Int(CLIClockEventsCountWindowSize),
EnableDCGMLog: c.Bool(CLIEnableDCGMLog),
DCGMLogLevel: dcgmLogLevel,
PodResourcesKubeletSocket: c.String(CLIPodResourcesKubeletSocket),
}, nil
}
3 changes: 2 additions & 1 deletion pkg/dcgmexporter/clock_events_collector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ func TestClockEventsCollector_Gather(t *testing.T) {
// Create a fake K8S to emulate work on K8S environment
tmpDir, cleanup := CreateTmpDir(t)
defer cleanup()
socketPath = tmpDir + "/kubelet.sock"
socketPath := tmpDir + "/kubelet.sock"
server := grpc.NewServer()

gpuIDsAsString := make([]string, len(gpuIDs))
Expand All @@ -135,6 +135,7 @@ func TestClockEventsCollector_Gather(t *testing.T) {
podresourcesapi.RegisterPodResourcesListerServer(server, NewPodResourcesMockServer(nvidiaResourceName, gpuIDsAsString))
// Tell that the app is running on K8S
config.Kubernetes = true
config.PodResourcesKubeletSocket = socketPath

allCounters := []Counter{
{
Expand Down
2 changes: 2 additions & 0 deletions pkg/dcgmexporter/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package dcgmexporter

import "github.com/NVIDIA/go-dcgm/pkg/dcgm"
Expand Down Expand Up @@ -55,4 +56,5 @@ type Config struct {
ClockEventsCountWindowSize int
EnableDCGMLog bool
DCGMLogLevel string
PodResourcesKubeletSocket string
}
4 changes: 1 addition & 3 deletions pkg/dcgmexporter/kubernetes.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,6 @@ import (
)

var (
socketDir = "/var/lib/kubelet/pod-resources"
socketPath = socketDir + "/kubelet.sock"

connectionTimeout = 10 * time.Second

gkeMigDeviceIDRegex = regexp.MustCompile(`^nvidia([0-9]+)/gi([0-9]+)$`)
Expand All @@ -57,6 +54,7 @@ func (p *PodMapper) Name() string {
}

func (p *PodMapper) Process(metrics MetricsByCounter, sysInfo SystemInfo) error {
socketPath := p.Config.PodResourcesKubeletSocket
_, err := os.Stat(socketPath)
if os.IsNotExist(err) {
logrus.Info("No Kubelet socket, ignoring")
Expand Down
10 changes: 6 additions & 4 deletions pkg/dcgmexporter/kubernetes_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,15 +55,15 @@ func TestProcessPodMapper(t *testing.T) {

arbirtaryMetric := out[reflect.ValueOf(out).MapKeys()[0].Interface().(Counter)]

socketPath = tmpDir + "/kubelet.sock"
socketPath := tmpDir + "/kubelet.sock"
server := grpc.NewServer()
gpus := GetGPUUUIDs(arbirtaryMetric)
podresourcesapi.RegisterPodResourcesListerServer(server, NewPodResourcesMockServer(nvidiaResourceName, gpus))

cleanup = StartMockServer(t, server, socketPath)
defer cleanup()

podMapper, err := NewPodMapper(&Config{KubernetesGPUIdType: GPUUID})
podMapper, err := NewPodMapper(&Config{KubernetesGPUIdType: GPUUID, PodResourcesKubeletSocket: socketPath})
require.NoError(t, err)
var sysInfo SystemInfo
err = podMapper.Process(out, sysInfo)
Expand Down Expand Up @@ -246,7 +246,7 @@ func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) {
func(t *testing.T) {
tmpDir, cleanup := CreateTmpDir(t)
defer cleanup()
socketPath = tmpDir + "/kubelet.sock"
socketPath := tmpDir + "/kubelet.sock"
server := grpc.NewServer()

cleanup, err := dcgm.Init(dcgm.Embedded)
Expand All @@ -271,7 +271,9 @@ func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) {
nvmlGetMIGDeviceInfoByIDHook = nvmlprovider.GetMIGDeviceInfoByID
}()

podMapper, err := NewPodMapper(&Config{KubernetesGPUIdType: tc.KubernetesGPUIDType})
podMapper, err := NewPodMapper(&Config{
KubernetesGPUIdType: tc.KubernetesGPUIDType,
PodResourcesKubeletSocket: socketPath})
require.NoError(t, err)
require.NotNil(t, podMapper)
metrics := MetricsByCounter{}
Expand Down

0 comments on commit 049c597

Please sign in to comment.