Skip to content

Commit

Permalink
Accommodate metric and label renames.
Browse files (browse the repository at this point in the history)
  • Loading branch information
igorpeshansky committed Jun 25, 2024
1 parent: e384d80; commit: da2011d
Show file tree
Hide file tree
Showing 6 changed files with 228 additions and 27 deletions.
79 changes: 69 additions & 10 deletions apps/dcgm.go
Original file line number | Diff line number | Diff line change
Expand Up @@ -52,6 +52,36 @@ func (r MetricsReceiverDcgm) Pipelines(_ context.Context) ([]otel.ReceiverPipeli
},
},
Processors: map[string][]otel.Component{"metrics": {
otel.MetricsTransform(
otel.UpdateMetric(
"gpu.dcgm.pipe.utilization",
otel.RenameLabel("gpu.pipe", "pipe"),
),
otel.UpdateMetric(
"gpu.dcgm.memory.bytes_used",
otel.RenameLabel("gpu.memory.state", "state"),
),
otel.UpdateMetric(
"gpu.dcgm.nvlink.io",
otel.RenameLabel("network.io.direction", "direction"),
),
otel.UpdateMetric(
"gpu.dcgm.pcie.io",
otel.RenameLabel("network.io.direction", "direction"),
),
otel.UpdateMetric(
"gpu.dcgm.clock.throttle_duration.time",
otel.RenameLabel("gpu.clock.violation", "violation"),
),
otel.UpdateMetric(
"gpu.dcgm.ecc_errors",
otel.RenameLabel("gpu.error.type", "error_type"),
),
otel.UpdateMetric(
"gpu.dcgm.xid_errors",
otel.RenameLabel("gpu.error.xid", "xid"),
),
),
otel.MetricsTransform(
otel.AddPrefix("workload.googleapis.com"),
),
Expand All @@ -72,31 +102,49 @@ func (r MetricsReceiverDcgm) Pipelines(_ context.Context) ([]otel.ReceiverPipeli
"collection_interval": r.CollectionIntervalString(),
"endpoint": r.Endpoint,
"metrics": map[string]interface{}{
"gpu.dcgm.clock.frequency": map[string]bool{
"gpu.dcgm.utilization": map[string]bool{
"enabled": false,
},
"gpu.dcgm.clock.throttle_duration.time": map[string]bool{
"gpu.dcgm.sm.utilization": map[string]bool{
"enabled": true,
},
"gpu.dcgm.sm.occupancy": map[string]bool{
"enabled": true,
},
"gpu.dcgm.pipe.utilization": map[string]bool{
"enabled": true,
},
"gpu.dcgm.codec.encoder.utilization": map[string]bool{
"enabled": false,
},
"gpu.dcgm.codec.decoder.utilization": map[string]bool{
"enabled": false,
},
"gpu.dcgm.codec.encoder.utilization": map[string]bool{
"gpu.dcgm.memory.bytes_used": map[string]bool{
"enabled": false,
},
"gpu.dcgm.ecc_errors": map[string]bool{
"enabled": false,
"gpu.dcgm.memory.bandwidth_utilization": map[string]bool{
"enabled": true,
},
"gpu.dcgm.pcie.io": map[string]bool{
"enabled": true,
},
"gpu.dcgm.nvlink.io": map[string]bool{
"enabled": true,
},
"gpu.dcgm.energy_consumption": map[string]bool{
"enabled": false,
},
"gpu.dcgm.memory.bytes_used": map[string]bool{
"gpu.dcgm.temperature": map[string]bool{
"enabled": false,
},
"gpu.dcgm.temperature": map[string]bool{
"gpu.dcgm.clock.frequency": map[string]bool{
"enabled": false,
},
"gpu.dcgm.utilization": map[string]bool{
"gpu.dcgm.clock.throttle_duration.time": map[string]bool{
"enabled": false,
},
"gpu.dcgm.ecc_errors": map[string]bool{
"enabled": false,
},
"gpu.dcgm.xid_errors": map[string]bool{
Expand All @@ -112,16 +160,27 @@ func (r MetricsReceiverDcgm) Pipelines(_ context.Context) ([]otel.ReceiverPipeli
"dcgm.gpu.profiling.dram_utilization",
),
otel.RenameMetric(
"gpu.dcgm.nvlink.traffic",
"gpu.dcgm.nvlink.io",
"dcgm.gpu.profiling.nvlink_traffic_rate",
otel.RenameLabel("network.io.direction", "direction"),
otel.RenameLabelValues("direction", map[string]string{
"receive": "rx",
"transmit": "tx",
}),
),
otel.RenameMetric(
"gpu.dcgm.pcie.traffic",
"gpu.dcgm.pcie.io",
"dcgm.gpu.profiling.pcie_traffic_rate",
otel.RenameLabel("network.io.direction", "direction"),
otel.RenameLabelValues("direction", map[string]string{
"receive": "rx",
"transmit": "tx",
}),
),
otel.RenameMetric(
"gpu.dcgm.pipe.utilization",
"dcgm.gpu.profiling.pipe_utilization",
otel.RenameLabel("gpu.pipe", "pipe"),
),
otel.RenameMetric(
"gpu.dcgm.sm.occupancy",
Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -67,14 +67,40 @@ processors:
include: gpu.dcgm.memory.bandwidth_utilization
new_name: dcgm.gpu.profiling.dram_utilization
- action: update
include: gpu.dcgm.nvlink.traffic
include: gpu.dcgm.nvlink.io
new_name: dcgm.gpu.profiling.nvlink_traffic_rate
operations:
- action: update_label
label: network.io.direction
new_label: direction
- action: update_label
label: direction
value_actions:
- new_value: rx
value: receive
- new_value: tx
value: transmit
- action: update
include: gpu.dcgm.pcie.traffic
include: gpu.dcgm.pcie.io
new_name: dcgm.gpu.profiling.pcie_traffic_rate
operations:
- action: update_label
label: network.io.direction
new_label: direction
- action: update_label
label: direction
value_actions:
- new_value: rx
value: receive
- new_value: tx
value: transmit
- action: update
include: gpu.dcgm.pipe.utilization
new_name: dcgm.gpu.profiling.pipe_utilization
operations:
- action: update_label
label: gpu.pipe
new_label: pipe
- action: update
include: gpu.dcgm.sm.occupancy
new_name: dcgm.gpu.profiling.sm_occupancy
Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -62,14 +62,40 @@ processors:
include: gpu.dcgm.memory.bandwidth_utilization
new_name: dcgm.gpu.profiling.dram_utilization
- action: update
include: gpu.dcgm.nvlink.traffic
include: gpu.dcgm.nvlink.io
new_name: dcgm.gpu.profiling.nvlink_traffic_rate
operations:
- action: update_label
label: network.io.direction
new_label: direction
- action: update_label
label: direction
value_actions:
- new_value: rx
value: receive
- new_value: tx
value: transmit
- action: update
include: gpu.dcgm.pcie.traffic
include: gpu.dcgm.pcie.io
new_name: dcgm.gpu.profiling.pcie_traffic_rate
operations:
- action: update_label
label: network.io.direction
new_label: direction
- action: update_label
label: direction
value_actions:
- new_value: rx
value: receive
- new_value: tx
value: transmit
- action: update
include: gpu.dcgm.pipe.utilization
new_name: dcgm.gpu.profiling.pipe_utilization
operations:
- action: update_label
label: gpu.pipe
new_label: pipe
- action: update
include: gpu.dcgm.sm.occupancy
new_name: dcgm.gpu.profiling.sm_occupancy
Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -58,6 +58,50 @@ processors:
- otelcol_grpc_io_client_completed_rpcs
- otelcol_googlecloudmonitoring_point_count
metricstransform/dcgm_0:
transforms:
- action: update
include: gpu.dcgm.pipe.utilization
operations:
- action: update_label
label: gpu.pipe
new_label: pipe
- action: update
include: gpu.dcgm.memory.bytes_used
operations:
- action: update_label
label: gpu.memory.state
new_label: state
- action: update
include: gpu.dcgm.nvlink.io
operations:
- action: update_label
label: network.io.direction
new_label: direction
- action: update
include: gpu.dcgm.pcie.io
operations:
- action: update_label
label: network.io.direction
new_label: direction
- action: update
include: gpu.dcgm.clock.throttle_duration.time
operations:
- action: update_label
label: gpu.clock.violation
new_label: violation
- action: update
include: gpu.dcgm.ecc_errors
operations:
- action: update_label
label: gpu.error.type
new_label: error_type
- action: update
include: gpu.dcgm.xid_errors
operations:
- action: update_label
label: gpu.error.xid
new_label: xid
metricstransform/dcgm_1:
transforms:
- action: update
include: ^(.*)$$
Expand Down Expand Up @@ -461,13 +505,13 @@ processors:
include: ^(.*)$$
match_type: regexp
new_name: agent.googleapis.com/$${1}
modifyscope/dcgm_2:
modifyscope/dcgm_3:
override_scope_name: agent.googleapis.com/dcgm
override_scope_version: "2.0"
resourcedetection/_global_0:
detectors:
- gcp
transform/dcgm_1:
transform/dcgm_2:
metric_statements:
context: datapoint
statements:
Expand Down Expand Up @@ -517,8 +561,9 @@ service:
- googlecloud/otel
processors:
- metricstransform/dcgm_0
- transform/dcgm_1
- modifyscope/dcgm_2
- metricstransform/dcgm_1
- transform/dcgm_2
- modifyscope/dcgm_3
- resourcedetection/_global_0
receivers:
- dcgm/dcgm
Expand Down
Original file line number | Diff line number | Diff line change
Expand Up @@ -53,6 +53,50 @@ processors:
- otelcol_grpc_io_client_completed_rpcs
- otelcol_googlecloudmonitoring_point_count
metricstransform/dcgm_0:
transforms:
- action: update
include: gpu.dcgm.pipe.utilization
operations:
- action: update_label
label: gpu.pipe
new_label: pipe
- action: update
include: gpu.dcgm.memory.bytes_used
operations:
- action: update_label
label: gpu.memory.state
new_label: state
- action: update
include: gpu.dcgm.nvlink.io
operations:
- action: update_label
label: network.io.direction
new_label: direction
- action: update
include: gpu.dcgm.pcie.io
operations:
- action: update_label
label: network.io.direction
new_label: direction
- action: update
include: gpu.dcgm.clock.throttle_duration.time
operations:
- action: update_label
label: gpu.clock.violation
new_label: violation
- action: update
include: gpu.dcgm.ecc_errors
operations:
- action: update_label
label: gpu.error.type
new_label: error_type
- action: update
include: gpu.dcgm.xid_errors
operations:
- action: update_label
label: gpu.error.xid
new_label: xid
metricstransform/dcgm_1:
transforms:
- action: update
include: ^(.*)$$
Expand Down Expand Up @@ -432,13 +476,13 @@ processors:
include: ^(.*)$$
match_type: regexp
new_name: agent.googleapis.com/$${1}
modifyscope/dcgm_2:
modifyscope/dcgm_3:
override_scope_name: agent.googleapis.com/dcgm
override_scope_version: "2.0"
resourcedetection/_global_0:
detectors:
- gcp
transform/dcgm_1:
transform/dcgm_2:
metric_statements:
context: datapoint
statements:
Expand Down Expand Up @@ -486,8 +530,9 @@ service:
- googlecloud/otel
processors:
- metricstransform/dcgm_0
- transform/dcgm_1
- modifyscope/dcgm_2
- metricstransform/dcgm_1
- transform/dcgm_2
- modifyscope/dcgm_3
- resourcedetection/_global_0
receivers:
- dcgm/dcgm
Expand Down
Loading

0 comments on commit da2011d

Please sign in to comment.