Skip to content

Commit

Permalink
Merge pull request #247 from NVIDIA/code-improvements
Browse files Browse the repository at this point in the history
Resolved linter-identified issues for improved code quality
  • Loading branch information
nvvfedorov authored Feb 9, 2024
2 parents 3cc7a2d + 5de5efb commit 3250bfe
Show file tree
Hide file tree
Showing 19 changed files with 603 additions and 179 deletions.
1 change: 1 addition & 0 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
"args": [
"-f",
"./etc/default-counters.csv",
"--debug"
]
}
]
Expand Down
2 changes: 1 addition & 1 deletion docker/Dockerfile.ubi9
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ WORKDIR /go/src/github.com/NVIDIA/dcgm-exporter
RUN set -eux; \
dnf clean expire-cache; \
dnf install -y go-toolset make wget
RUN dnf clean all
RUN dnf clean all && rm -rf /usr/bin/go

# Install Go official release
RUN set -eux; \
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ require (
github.com/avast/retry-go/v4 v4.5.1
github.com/bits-and-blooms/bitset v1.13.0
github.com/gorilla/mux v1.8.1
github.com/prometheus/client_model v0.4.1-0.20230718164431-9a2bf3000d16
github.com/prometheus/common v0.45.0
github.com/prometheus/exporter-toolkit v0.11.0
github.com/sirupsen/logrus v1.9.3
Expand Down Expand Up @@ -71,7 +72,6 @@ require (
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/prometheus/client_golang v1.17.0 // indirect
github.com/prometheus/client_model v0.4.1-0.20230718164431-9a2bf3000d16 // indirect
github.com/prometheus/procfs v0.11.1 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 // indirect
Expand Down
18 changes: 18 additions & 0 deletions pkg/cmd/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ const (
CLIWebConfigFile = "web-config-file"
CLIXIDCountWindowSize = "xid-count-window-size"
CLIReplaceBlanksInModelName = "replace-blanks-in-model-name"
CLIDebugMode = "debug"
)

func NewApp(buildVersion ...string) *cli.App {
Expand Down Expand Up @@ -190,6 +191,12 @@ func NewApp(buildVersion ...string) *cli.App {
Usage: "Replaces every blank space in the GPU model name with a dash, ensuring a continuous, space-free identifier.",
EnvVars: []string{"DCGM_EXPORTER_REPLACE_BLANKS_IN_MODEL_NAME"},
},
&cli.BoolFlag{
Name: CLIDebugMode,
Value: false,
Usage: "Enable debug output",
EnvVars: []string{"DCGM_EXPORTER_DEBUG"},
},
}

if runtime.GOOS == "linux" {
Expand Down Expand Up @@ -228,6 +235,16 @@ restart:
return err
}

if config.Debug {
//enable debug logging
logrus.SetLevel(logrus.DebugLevel)
logrus.Debug("Debug output is enabled")
}

logrus.Debugf("Command line: %s", strings.Join(os.Args, " "))

logrus.WithField(dcgmexporter.LoggerDumpKey, fmt.Sprintf("%+v", config)).Debug("Loaded configuration")

if config.UseRemoteHE {
logrus.Info("Attemping to connect to remote hostengine at ", config.RemoteHEInfo)
cleanup, err := dcgm.Init(dcgm.Standalone, config.RemoteHEInfo, "0")
Expand Down Expand Up @@ -426,5 +443,6 @@ func contextToConfig(c *cli.Context) (*dcgmexporter.Config, error) {
WebConfigFile: c.String(CLIWebConfigFile),
XIDCountWindowSize: c.Int(CLIXIDCountWindowSize),
ReplaceBlanksInModelName: c.Bool(CLIReplaceBlanksInModelName),
Debug: c.Bool(CLIDebugMode),
}, nil
}
11 changes: 11 additions & 0 deletions pkg/dcgmexporter/const.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,14 @@ func mustParseDCGMExporterMetric(s string) DCGMExporterMetric {
}
return mv
}

// Constants for logging fields
const (
LoggerGroupIDKey = "groupID"
LoggerDumpKey = "dump"
)

const (
PARENT_ID_IGNORED = 0
DCGM_ST_NOT_CONFIGURED = "Setting not configured"
)
15 changes: 13 additions & 2 deletions pkg/dcgmexporter/dcgm.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"math/rand"

"github.com/NVIDIA/go-dcgm/pkg/dcgm"
"github.com/sirupsen/logrus"
)

func NewGroup() (dcgm.GroupHandle, func(), error) {
Expand All @@ -29,7 +30,12 @@ func NewGroup() (dcgm.GroupHandle, func(), error) {
return dcgm.GroupHandle{}, func() {}, err
}

return group, func() { dcgm.DestroyGroup(group) }, nil
return group, func() {
err := dcgm.DestroyGroup(group)
if err != nil {
logrus.WithError(err).Warn("Cannot destroy field group")
}
}, nil
}

func NewDeviceFields(counters []Counter, entityType dcgm.Field_Entity_Group) []dcgm.Short {
Expand All @@ -56,7 +62,12 @@ func NewFieldGroup(deviceFields []dcgm.Short) (dcgm.FieldHandle, func(), error)
return dcgm.FieldHandle{}, func() {}, err
}

return fieldGroup, func() { dcgm.FieldGroupDestroy(fieldGroup) }, nil
return fieldGroup, func() {
err := dcgm.FieldGroupDestroy(fieldGroup)
if err != nil {
logrus.WithError(err).Warn("Cannot destroy field group")
}
}, nil
}

func WatchFieldGroup(group dcgm.GroupHandle, field dcgm.FieldHandle, updateFreq int64, maxKeepAge float64, maxKeepSamples int32) error {
Expand Down
10 changes: 5 additions & 5 deletions pkg/dcgmexporter/gpu_collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,10 +88,10 @@ func (c *DCGMCollector) Cleanup() {
}
}

func (c *DCGMCollector) GetMetrics() (map[Counter][]Metric, error) {
func (c *DCGMCollector) GetMetrics() (MetricsByCounter, error) {
monitoringInfo := GetMonitoredEntities(c.SysInfo)

metrics := make(map[Counter][]Metric)
metrics := make(MetricsByCounter)

for _, mi := range monitoringInfo {
var vals []dcgm.FieldValue_v1
Expand Down Expand Up @@ -153,7 +153,7 @@ func FindCounterField(c []Counter, fieldId uint) (Counter, error) {
return c[0], fmt.Errorf("Could not find corresponding counter")
}

func ToSwitchMetric(metrics map[Counter][]Metric,
func ToSwitchMetric(metrics MetricsByCounter,
values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string) {
labels := map[string]string{}

Expand Down Expand Up @@ -196,7 +196,7 @@ func ToSwitchMetric(metrics map[Counter][]Metric,
}
}

func ToCPUMetric(metrics map[Counter][]Metric,
func ToCPUMetric(metrics MetricsByCounter,
values []dcgm.FieldValue_v1, c []Counter, mi MonitoringInfo, useOld bool, hostname string) {
var labels = map[string]string{}

Expand Down Expand Up @@ -240,7 +240,7 @@ func ToCPUMetric(metrics map[Counter][]Metric,
}

func ToMetric(
metrics map[Counter][]Metric,
metrics MetricsByCounter,
values []dcgm.FieldValue_v1,
c []Counter,
d dcgm.Device,
Expand Down
13 changes: 9 additions & 4 deletions pkg/dcgmexporter/kubernetes.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (
"github.com/NVIDIA/dcgm-exporter/internal/pkg/nvmlprovider"
"github.com/sirupsen/logrus"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure"
podresourcesapi "k8s.io/kubelet/pkg/apis/podresources/v1alpha1"
)

Expand All @@ -54,7 +55,7 @@ func (p *PodMapper) Name() string {
return "podMapper"
}

func (p *PodMapper) Process(metrics map[Counter][]Metric, sysInfo SystemInfo) error {
func (p *PodMapper) Process(metrics MetricsByCounter, sysInfo SystemInfo) error {
_, err := os.Stat(socketPath)
if os.IsNotExist(err) {
logrus.Infof("No Kubelet socket, ignoring")
Expand Down Expand Up @@ -102,9 +103,13 @@ func connectToServer(socket string) (*grpc.ClientConn, func(), error) {
ctx, cancel := context.WithTimeout(context.Background(), connectionTimeout)
defer cancel()

conn, err := grpc.DialContext(ctx, socket, grpc.WithInsecure(), grpc.WithBlock(),
grpc.WithDialer(func(addr string, timeout time.Duration) (net.Conn, error) {
return net.DialTimeout("unix", addr, timeout)
conn, err := grpc.DialContext(ctx,
socket,
grpc.WithTransportCredentials(insecure.NewCredentials()),
grpc.WithBlock(),
grpc.WithContextDialer(func(ctx context.Context, addr string) (net.Conn, error) {
d := net.Dialer{}
return d.DialContext(ctx, "unix", addr)
}),
)

Expand Down
5 changes: 3 additions & 2 deletions pkg/dcgmexporter/kubernetes_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,8 @@ func StartMockServer(t *testing.T, server *grpc.Server, socket string) func() {
stopped := make(chan interface{})

go func() {
server.Serve(l)
err := server.Serve(l)
assert.NoError(t, err)
close(stopped)
}()

Expand Down Expand Up @@ -256,7 +257,7 @@ func TestProcessPodMapper_WithD_Different_Format_Of_DeviceID(t *testing.T) {
podMapper, err := NewPodMapper(&Config{KubernetesGPUIdType: tc.KubernetesGPUIDType})
require.NoError(t, err)
require.NotNil(t, podMapper)
metrics := map[Counter][]Metric{}
metrics := MetricsByCounter{}
counter := Counter{
FieldID: 155,
FieldName: "DCGM_FI_DEV_POWER_USAGE",
Expand Down
5 changes: 4 additions & 1 deletion pkg/dcgmexporter/pipeline.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ import (
)

func NewMetricsPipeline(c *Config, counters []Counter, hostname string, newDCGMCollector DCGMCollectorConstructor) (*MetricsPipeline, func(), error) {

logrus.WithField(LoggerDumpKey, fmt.Sprintf("%+v", counters)).Debug("Counters are initialized")

cleanups := []func(){}
gpuCollector, cleanup, err := newDCGMCollector(counters, c, hostname, dcgm.FE_GPU)
if err != nil {
Expand Down Expand Up @@ -328,7 +331,7 @@ var cpuCoreMetricsFormat = `
{{ end }}`

// Template is passed here so that it isn't recompiled at each iteration
func FormatMetrics(t *template.Template, groupedMetrics map[Counter][]Metric) (string, error) {
func FormatMetrics(t *template.Template, groupedMetrics MetricsByCounter) (string, error) {
// Format metrics
var res bytes.Buffer
if err := t.Execute(&res, groupedMetrics); err != nil {
Expand Down
1 change: 1 addition & 0 deletions pkg/dcgmexporter/pipeline_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ func TestRun(t *testing.T) {
p, cleanup, err := NewMetricsPipelineWithGPUCollector(&Config{}, c)
require.NoError(t, err)
defer cleanup()
require.NoError(t, err)

out, err := p.run()
require.NoError(t, err)
Expand Down
43 changes: 34 additions & 9 deletions pkg/dcgmexporter/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,18 @@ func NewMetricsServer(c *Config, metrics chan string, registry *Registry) (*Metr
router.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("X-Content-Type-Options", "nosniff")
w.WriteHeader(http.StatusOK)
w.Write([]byte(`<html>
_, err := w.Write([]byte(`<html>
<head><title>GPU Exporter</title></head>
<body>
<h1>GPU Exporter</h1>
<p><a href="./metrics">Metrics</a></p>
</body>
</html>`))
if err != nil {
logrus.WithError(err).Error("Failed to write response")
http.Error(w, "Failed to write response", http.StatusInternalServerError)
return
}
})

router.HandleFunc("/health", serverv1.Health)
Expand All @@ -76,7 +81,7 @@ func (s *MetricsServer) Run(stop chan interface{}, wg *sync.WaitGroup) {
defer httpwg.Done()
logrus.Info("Starting webserver")
if err := web.ListenAndServe(s.server, s.webConfig, logger); err != nil && err != http.ErrServerClosed {
logrus.Fatalf("Failed to Listen and Server HTTP server with err: `%v`", err)
logrus.WithError(err).Fatal("Failed to Listen and Server HTTP server")
}
}()

Expand All @@ -95,33 +100,53 @@ func (s *MetricsServer) Run(stop chan interface{}, wg *sync.WaitGroup) {

<-stop
if err := s.server.Shutdown(context.Background()); err != nil {
logrus.Fatalf("Failed to shutdown HTTP server, with err: `%v`", err)
logrus.WithError(err).Fatal("Failed to shutdown HTTP server")
}

if err := WaitWithTimeout(&httpwg, 3*time.Second); err != nil {
logrus.Fatalf("Failed waiting for HTTP server to shutdown, with err: `%v`", err)
logrus.WithError(err).Fatal("Failed waiting for HTTP server to shutdown")
}
}

func (s *MetricsServer) Metrics(w http.ResponseWriter, r *http.Request) {
w.Header().Set("X-Content-Type-Options", "nosniff")
w.WriteHeader(http.StatusOK)
w.Write([]byte(s.getMetrics()))
_, err := w.Write([]byte(s.getMetrics()))
if err != nil {
logrus.WithError(err).Error("Failed to write response")
http.Error(w, "Failed to write response", http.StatusInternalServerError)
return
}
xidMetrics, err := s.registry.Gather()
if err == nil {
encodeXIDMetrics(w, xidMetrics)
if err != nil {
logrus.WithError(err).Error("Failed to write response")
http.Error(w, "Failed to write response", http.StatusInternalServerError)
return
}
err = encodeXIDMetrics(w, xidMetrics)
if err != nil {
http.Error(w, "Failed to write response", http.StatusInternalServerError)
return
}
}

func (s *MetricsServer) Health(w http.ResponseWriter, r *http.Request) {
if s.getMetrics() == "" {
w.Header().Set("X-Content-Type-Options", "nosniff")
w.WriteHeader(http.StatusServiceUnavailable)
w.Write([]byte("KO"))
_, err := w.Write([]byte("KO"))
if err != nil {
logrus.WithError(err).Error("Failed to write response")
http.Error(w, "Failed to write response", http.StatusInternalServerError)
}
} else {
w.Header().Set("X-Content-Type-Options", "nosniff")
w.WriteHeader(http.StatusOK)
w.Write([]byte("OK"))
_, err := w.Write([]byte("OK"))
if err != nil {
logrus.WithError(err).Error("Failed to write response")
http.Error(w, "Failed to write response", http.StatusInternalServerError)
}
}
}

Expand Down
Loading

0 comments on commit 3250bfe

Please sign in to comment.