backup: Export tenant backup schedule metric
Export the Cloud tenant backup schedule completion metric with a tenant id tag.

Issue: #141167

Epic: None

Release note: None
edwardguo-crl committed Feb 26, 2025
1 parent c693357 commit e30ec71
Showing 4 changed files with 122 additions and 1 deletion.
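
In outline, the change registers a second RPO gauge keyed by a tenant_id label and stamps one child series per backed-up tenant whenever a scheduled backup succeeds. A minimal sketch of that pattern, reusing the same pkg/util/metric calls that appear in the schedule_exec.go diff below (the package and helper names here are illustrative, not part of the change):

package example

import (
	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
	"github.com/cockroachdb/cockroach/pkg/util/metric"
)

// newRPOByTenantGauge mirrors the GaugeVec registered in schedule_exec.go below.
func newRPOByTenantGauge() *metric.GaugeVec {
	return metric.NewExportedGaugeVec(metric.Metadata{
		Name:        "schedules.BACKUP.last-completed-time-by-virtual_cluster",
		Measurement: "Jobs",
		Unit:        metric.Unit_TIMESTAMP_SEC,
	}, []string{"tenant_id"})
}

// recordTenantRPO stamps one child series per backed-up tenant, as backupSucceeded does.
func recordTenantRPO(g *metric.GaugeVec, details jobspb.BackupDetails) {
	for _, tenantID := range details.SpecificTenantIds {
		g.Update(map[string]string{"tenant_id": tenantID.String()},
			details.EndTime.GoTime().Unix())
	}
}
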
1 change: 1 addition & 0 deletions docs/generated/metrics/metrics.html
@@ -1623,6 +1623,7 @@
<tr><td>APPLICATION</td><td>rpc.connection.unhealthy_nanos</td><td>Gauge of nanoseconds of unhealthy connection time.<br/><br/>On the prometheus endpoint scraped with the cluster setting &#39;server.child_metrics.enabled&#39; set,<br/>the constituent parts of this metric are available on a per-peer basis and one can read off<br/>for how long a given peer has been unreachable</td><td>Nanoseconds</td><td>GAUGE</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>schedules.BACKUP.failed</td><td>Number of BACKUP jobs failed</td><td>Jobs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>schedules.BACKUP.last-completed-time</td><td>The unix timestamp of the most recently completed backup by a schedule specified as maintaining this metric</td><td>Jobs</td><td>GAUGE</td><td>TIMESTAMP_SEC</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>schedules.BACKUP.last-completed-time-by-virtual_cluster</td><td>The unix timestamp of the most recently completed host scheduled backup by virtual cluster specified as maintaining this metric</td><td>Jobs</td><td>GAUGE</td><td>TIMESTAMP_SEC</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>schedules.BACKUP.protected_age_sec</td><td>The age of the oldest PTS record protected by BACKUP schedules</td><td>Seconds</td><td>GAUGE</td><td>SECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>schedules.BACKUP.protected_record_count</td><td>Number of PTS records held by BACKUP schedules</td><td>Records</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>schedules.BACKUP.started</td><td>Number of BACKUP jobs started</td><td>Jobs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
2 changes: 2 additions & 0 deletions pkg/backup/BUILD.bazel
@@ -195,6 +195,7 @@ go_test(
"restore_span_covering_test.go",
"restore_test.go",
"revision_reader_test.go",
"schedule_exec_test.go",
"schedule_pts_chaining_test.go",
"show_test.go",
"system_schema_test.go",
@@ -314,6 +315,7 @@ go_test(
"//pkg/util/log",
"//pkg/util/log/eventpb",
"//pkg/util/log/logpb",
"//pkg/util/metric",
"//pkg/util/mon",
"//pkg/util/protoutil",
"//pkg/util/randutil",
15 changes: 14 additions & 1 deletion pkg/backup/schedule_exec.go
@@ -37,7 +37,8 @@ type backupMetrics struct {
	*jobs.ExecutorPTSMetrics
	// TODO(rui): move this to the backup job so it can be controlled by the
	// updates_cluster_monitoring_metrics option.
	RpoMetric *metric.Gauge
	RpoMetric       *metric.Gauge
	RpoTenantMetric *metric.GaugeVec
}

var _ metric.Struct = &backupMetrics{}
@@ -360,6 +361,12 @@ func (e *scheduledBackupExecutor) backupSucceeded(
	// for monitoring an RPO SLA, update that metric.
	if args.UpdatesLastBackupMetric {
		e.metrics.RpoMetric.Update(details.(jobspb.BackupDetails).EndTime.GoTime().Unix())
		if details.(jobspb.BackupDetails).SpecificTenantIds != nil {
			for _, tenantID := range details.(jobspb.BackupDetails).SpecificTenantIds {
				e.metrics.RpoTenantMetric.Update(map[string]string{"tenant_id": tenantID.String()},
					details.(jobspb.BackupDetails).EndTime.GoTime().Unix())
			}
		}
	}

	if args.UnpauseOnSuccess == jobspb.InvalidScheduleID {
@@ -578,6 +585,12 @@ func init() {
Measurement: "Jobs",
Unit: metric.Unit_TIMESTAMP_SEC,
}),
RpoTenantMetric: metric.NewExportedGaugeVec(metric.Metadata{
Name: "schedules.BACKUP.last-completed-time-by-virtual_cluster",
Help: "The unix timestamp of the most recently completed host scheduled backup by virtual cluster specified as maintaining this metric",
Measurement: "Jobs",
Unit: metric.Unit_TIMESTAMP_SEC,
}, []string{"tenant_id"}),
},
}, nil
})
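For reference, each tenant's child series can be read back off the GaugeVec through ToPrometheusMetrics, which is what the new test below asserts on. A short sketch under the same assumptions as the snippet above (same pkg/util/metric import; the helper name is illustrative):

// lastCompletedByTenant collects the exported per-tenant values, mirroring the label
// and gauge accessors used in schedule_exec_test.go below.
func lastCompletedByTenant(g *metric.GaugeVec) map[string]float64 {
	out := make(map[string]float64)
	for _, m := range g.ToPrometheusMetrics() {
		for _, label := range m.GetLabel() {
			if label.GetName() == "tenant_id" {
				out[label.GetValue()] = m.Gauge.GetValue()
			}
		}
	}
	return out
}
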
105 changes: 105 additions & 0 deletions pkg/backup/schedule_exec_test.go
@@ -0,0 +1,105 @@
// Copyright 2025 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.

package backup

import (
	"context"
	"testing"

	"github.com/cockroachdb/cockroach/pkg/backup/backuppb"
	"github.com/cockroachdb/cockroach/pkg/jobs"
	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
	"github.com/cockroachdb/cockroach/pkg/roachpb"
	"github.com/cockroachdb/cockroach/pkg/util/hlc"
	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
	"github.com/cockroachdb/cockroach/pkg/util/metric"
	pbtypes "github.com/gogo/protobuf/types"
	"github.com/stretchr/testify/require"
)

func TestBackupSucceededUpdatesMetrics(t *testing.T) {
	defer leaktest.AfterTest(t)()
	ctx := context.Background()
	executor := &scheduledBackupExecutor{
		metrics: backupMetrics{
			RpoMetric:       metric.NewGauge(metric.Metadata{}),
			RpoTenantMetric: metric.NewExportedGaugeVec(metric.Metadata{}, []string{"tenant_id"}),
		},
	}

	t.Run("updates RPO metric", func(t *testing.T) {
		schedule := createSchedule(t, true)
		endTime := hlc.Timestamp{WallTime: hlc.UnixNano()}
		details := jobspb.BackupDetails{EndTime: endTime}

		err := executor.backupSucceeded(ctx, nil, schedule, details, nil)
		require.NoError(t, err)
		require.Equal(t, endTime.GoTime().Unix(), executor.metrics.RpoMetric.Value())
	})

	t.Run("updates RPO tenant metric", func(t *testing.T) {
		schedule := createSchedule(t, true)
		tenantIDs := mustMakeTenantIDs(t, 1, 2)
		endTime := hlc.Timestamp{WallTime: hlc.UnixNano()}
		details := jobspb.BackupDetails{
			EndTime:           endTime,
			SpecificTenantIds: tenantIDs,
		}

		err := executor.backupSucceeded(ctx, nil, schedule, details, nil)
		require.NoError(t, err)

		expectedTenantIDs := []string{"system", "2"}
		verifyRPOTenantMetricLabels(t, executor.metrics.RpoTenantMetric, expectedTenantIDs)
		verifyRPOTenantMetricGaugeValue(t, executor.metrics.RpoTenantMetric, details.EndTime)
	})
}

func createSchedule(t *testing.T, updatesLastBackupMetric bool) *jobs.ScheduledJob {
	schedule := jobs.NewScheduledJob(nil)

	args := &backuppb.ScheduledBackupExecutionArgs{
		UpdatesLastBackupMetric: updatesLastBackupMetric,
	}
	any, err := pbtypes.MarshalAny(args)
	require.NoError(t, err)
	schedule.SetExecutionDetails(schedule.ExecutorType(), jobspb.ExecutionArguments{Args: any})
	return schedule
}

func mustMakeTenantIDs(t *testing.T, ids ...int) []roachpb.TenantID {
	var tenantIDs []roachpb.TenantID
	for _, id := range ids {
		tid, err := roachpb.MakeTenantID(uint64(id))
		require.NoError(t, err)
		tenantIDs = append(tenantIDs, tid)
	}
	return tenantIDs
}

func verifyRPOTenantMetricLabels(
	t *testing.T, metric *metric.GaugeVec, expectedTenantIDs []string,
) {
	prometheusMetrics := metric.ToPrometheusMetrics()
	var actualTenantIDs []string
	for _, promMetric := range prometheusMetrics {
		labels := promMetric.GetLabel()
		for _, label := range labels {
			if label.GetName() == "tenant_id" {
				actualTenantIDs = append(actualTenantIDs, label.GetValue())
			}
		}
	}
	require.ElementsMatch(t, expectedTenantIDs, actualTenantIDs)
}

func verifyRPOTenantMetricGaugeValue(t *testing.T, metric *metric.GaugeVec, endTime hlc.Timestamp) {
	prometheusMetrics := metric.ToPrometheusMetrics()
	for _, promMetric := range prometheusMetrics {
		value := promMetric.Gauge.GetValue()
		require.Equal(t, float64(endTime.GoTime().Unix()), value)
	}
}
