Skip to content

Commit

Permalink
Resolved linter-identified issues for improved code quality
Browse files Browse the repository at this point in the history
Signed-off-by: Vadym Fedorov <[email protected]>
  • Loading branch information
nvvfedorov committed Feb 2, 2024
1 parent 7e8e4cb commit f453ec0
Show file tree
Hide file tree
Showing 15 changed files with 559 additions and 152 deletions.
3 changes: 2 additions & 1 deletion .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
"args": [
"-f",
"./etc/default-counters.csv",
"--web-config-file=./tests/integration/testdata/web-config.yml"
"--web-config-file=./tests/integration/testdata/web-config.yml",
"--debug"
]
}
]
Expand Down
22 changes: 22 additions & 0 deletions pkg/cmd/app.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import (

"github.com/NVIDIA/dcgm-exporter/pkg/dcgmexporter"
"github.com/NVIDIA/go-dcgm/pkg/dcgm"
"github.com/davecgh/go-spew/spew"
"github.com/sirupsen/logrus"
"github.com/urfave/cli/v2"
)
Expand Down Expand Up @@ -61,6 +62,7 @@ var (
CLIConfigMapData = "configmap-data"
CLIWebSystemdSocket = "web-systemd-socket"
CLIWebConfigFile = "web-config-file"
CLIDebugMode = "debug"
)

func NewApp(buildVersion ...string) *cli.App {
Expand Down Expand Up @@ -174,6 +176,12 @@ func NewApp(buildVersion ...string) *cli.App {
Usage: "TLS config file following webConfig spec.",
EnvVars: []string{"DCGM_EXPORTER_WEB_CONFIG_FILE"},
},
&cli.BoolFlag{
Name: CLIDebugMode,
Value: false,
Usage: "Enable debug output",
EnvVars: []string{"DCGM_EXPORTER_DEBUG"},
},
}

if runtime.GOOS == "linux" {
Expand Down Expand Up @@ -212,6 +220,19 @@ restart:
return err
}

if config.Debug {
spew.Config.Indent = " "
spew.Config.DisablePointerAddresses = true
spew.Config.DisableCapacities = true
//enable debug logging
logrus.SetLevel(logrus.DebugLevel)
logrus.Debug("Debug output is enabled")
}

logrus.Debugf("Command line: %s", strings.Join(os.Args, " "))

logrus.WithField(dcgmexporter.LoggerDumpField, spew.Sdump(config)).Debug("Loaded configuration")

if config.UseRemoteHE {
logrus.Info("Attemping to connect to remote hostengine at ", config.RemoteHEInfo)
cleanup, err := dcgm.Init(dcgm.Standalone, config.RemoteHEInfo, "0")
Expand Down Expand Up @@ -375,5 +396,6 @@ func contextToConfig(c *cli.Context) (*dcgmexporter.Config, error) {
ConfigMapData: c.String(CLIConfigMapData),
WebSystemdSocket: c.Bool(CLIWebSystemdSocket),
WebConfigFile: c.String(CLIWebConfigFile),
Debug: c.Bool(CLIDebugMode),
}, nil
}
22 changes: 20 additions & 2 deletions pkg/dcgmexporter/dcgm.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,13 @@ import (
"math/rand"

"github.com/NVIDIA/go-dcgm/pkg/dcgm"
"github.com/sirupsen/logrus"
)

const (
LoggerErrorField = "error"
LoggerGroupIDField = "group_id"
LoggerDumpField = "dump"
)

func NewGroup() (dcgm.GroupHandle, func(), error) {
Expand All @@ -29,7 +36,12 @@ func NewGroup() (dcgm.GroupHandle, func(), error) {
return dcgm.GroupHandle{}, func() {}, err
}

return group, func() { dcgm.DestroyGroup(group) }, nil
return group, func() {
err := dcgm.DestroyGroup(group)
logrus.WithFields(logrus.Fields{
LoggerErrorField: err,
}).Warn("can not destroy group")
}, nil
}

func NewDeviceFields(counters []Counter, entityType dcgm.Field_Entity_Group) []dcgm.Short {
Expand All @@ -56,7 +68,13 @@ func NewFieldGroup(deviceFields []dcgm.Short) (dcgm.FieldHandle, func(), error)
return dcgm.FieldHandle{}, func() {}, err
}

return fieldGroup, func() { dcgm.FieldGroupDestroy(fieldGroup) }, nil
return fieldGroup, func() {
err := dcgm.FieldGroupDestroy(fieldGroup)
logrus.WithFields(logrus.Fields{
LoggerErrorField: err,
}).Warn("can not destroy field group")

}, nil
}

func WatchFieldGroup(group dcgm.GroupHandle, field dcgm.FieldHandle, updateFreq int64, maxKeepAge float64, maxKeepSamples int32) error {
Expand Down
4 changes: 1 addition & 3 deletions pkg/dcgmexporter/gpu_collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ func NewDCGMCollector(c []Counter, config *Config, entityType dcgm.Field_Entity_
}

hostname := ""
if config.NoHostname == false {
if !config.NoHostname {
if nodeName := os.Getenv("NODE_NAME"); nodeName != "" {
hostname = nodeName
} else {
Expand Down Expand Up @@ -309,8 +309,6 @@ func ToString(value dcgm.FieldValue_v1) string {
default:
return v
}
default:
return FailedToConvert
}

return FailedToConvert
Expand Down
11 changes: 8 additions & 3 deletions pkg/dcgmexporter/kubernetes.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (
"github.com/NVIDIA/dcgm-exporter/internal/pkg/nvmlprovider"
"github.com/sirupsen/logrus"
"google.golang.org/grpc"
"google.golang.org/grpc/credentials/insecure"
podresourcesapi "k8s.io/kubelet/pkg/apis/podresources/v1alpha1"
)

Expand Down Expand Up @@ -102,9 +103,13 @@ func connectToServer(socket string) (*grpc.ClientConn, func(), error) {
ctx, cancel := context.WithTimeout(context.Background(), connectionTimeout)
defer cancel()

conn, err := grpc.DialContext(ctx, socket, grpc.WithInsecure(), grpc.WithBlock(),
grpc.WithDialer(func(addr string, timeout time.Duration) (net.Conn, error) {
return net.DialTimeout("unix", addr, timeout)
conn, err := grpc.DialContext(ctx,
socket,
grpc.WithTransportCredentials(insecure.NewCredentials()),
grpc.WithBlock(),
grpc.WithContextDialer(func(ctx context.Context, addr string) (net.Conn, error) {
d := net.Dialer{}
return d.DialContext(ctx, "unix", addr)
}),
)

Expand Down
3 changes: 2 additions & 1 deletion pkg/dcgmexporter/kubernetes_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,8 @@ func StartMockServer(t *testing.T, server *grpc.Server, socket string) func() {
stopped := make(chan interface{})

go func() {
server.Serve(l)
err := server.Serve(l)
assert.NoError(t, err)
close(stopped)
}()

Expand Down
3 changes: 1 addition & 2 deletions pkg/dcgmexporter/parser_test.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package dcgmexporter

import (
"io/ioutil"
"os"
"testing"

Expand Down Expand Up @@ -100,7 +99,7 @@ func TestInvalidConfigMapNamespace(t *testing.T) {
}

func TestExtractCounters(t *testing.T) {
tmpFile, err := ioutil.TempFile(os.TempDir(), "prefix-")
tmpFile, err := os.CreateTemp(os.TempDir(), "prefix-")
if err != nil {
t.Fatalf("Cannot create temporary file: %v", err)
}
Expand Down
3 changes: 3 additions & 0 deletions pkg/dcgmexporter/pipeline.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"time"

"github.com/NVIDIA/go-dcgm/pkg/dcgm"
"github.com/davecgh/go-spew/spew"
"github.com/sirupsen/logrus"
)

Expand All @@ -33,6 +34,8 @@ func NewMetricsPipeline(c *Config, newDCGMCollector DCGMCollectorConstructor) (*
return nil, func() {}, err
}

logrus.WithField(LoggerDumpField, spew.Sdump(counters)).Debug("Counters are initialized")

cleanups := []func(){}
gpuCollector, cleanup, err := newDCGMCollector(counters, c, dcgm.FE_GPU)
if err != nil {
Expand Down
1 change: 1 addition & 0 deletions pkg/dcgmexporter/pipeline_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ func TestRun(t *testing.T) {

p, cleanup, err := NewMetricsPipelineWithGPUCollector(&Config{}, c)
defer cleanup()
require.NoError(t, err)

out, err := p.run()
require.NoError(t, err)
Expand Down
22 changes: 18 additions & 4 deletions pkg/dcgmexporter/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,13 +49,17 @@ func NewMetricsServer(c *Config, metrics chan string) (*MetricsServer, func(), e
router.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
w.Header().Set("X-Content-Type-Options", "nosniff")
w.WriteHeader(http.StatusOK)
w.Write([]byte(`<html>
_, err := w.Write([]byte(`<html>
<head><title>GPU Exporter</title></head>
<body>
<h1>GPU Exporter</h1>
<p><a href="./metrics">Metrics</a></p>
</body>
</html>`))
if err != nil {
http.Error(w, "Failed to write response", http.StatusInternalServerError)
return
}
})

router.HandleFunc("/health", serverv1.Health)
Expand Down Expand Up @@ -105,18 +109,28 @@ func (s *MetricsServer) Run(stop chan interface{}, wg *sync.WaitGroup) {
func (s *MetricsServer) Metrics(w http.ResponseWriter, r *http.Request) {
w.Header().Set("X-Content-Type-Options", "nosniff")
w.WriteHeader(http.StatusOK)
w.Write([]byte(s.getMetrics()))
_, err := w.Write([]byte(s.getMetrics()))
if err != nil {
http.Error(w, "Failed to write response", http.StatusInternalServerError)
return
}
}

func (s *MetricsServer) Health(w http.ResponseWriter, r *http.Request) {
if s.getMetrics() == "" {
w.Header().Set("X-Content-Type-Options", "nosniff")
w.WriteHeader(http.StatusServiceUnavailable)
w.Write([]byte("KO"))
_, err := w.Write([]byte("KO"))
if err != nil {
http.Error(w, "Failed to write response", http.StatusInternalServerError)
}
} else {
w.Header().Set("X-Content-Type-Options", "nosniff")
w.WriteHeader(http.StatusOK)
w.Write([]byte("OK"))
_, err := w.Write([]byte("OK"))
if err != nil {
http.Error(w, "Failed to write response", http.StatusInternalServerError)
}
}
}

Expand Down
Loading

0 comments on commit f453ec0

Please sign in to comment.