diff --git a/docs/generated/metrics/metrics.html b/docs/generated/metrics/metrics.html index a344c7ceac0..a918b13c95e 100644 --- a/docs/generated/metrics/metrics.html +++ b/docs/generated/metrics/metrics.html @@ -1623,6 +1623,7 @@ APPLICATIONrpc.connection.unhealthy_nanosGauge of nanoseconds of unhealthy connection time.

On the prometheus endpoint scraped with the cluster setting 'server.child_metrics.enabled' set,
the constituent parts of this metric are available on a per-peer basis and one can read off
for how long a given peer has been unreachableNanosecondsGAUGENANOSECONDSAVGNONE APPLICATIONschedules.BACKUP.failedNumber of BACKUP jobs failedJobsCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE APPLICATIONschedules.BACKUP.last-completed-timeThe unix timestamp of the most recently completed backup by a schedule specified as maintaining this metricJobsGAUGETIMESTAMP_SECAVGNONE +APPLICATIONschedules.BACKUP.last-completed-time-by-virtual_clusterThe unix timestamp of the most recently completed host scheduled backup by virtual cluster specified as maintaining this metricJobsGAUGETIMESTAMP_SECAVGNONE APPLICATIONschedules.BACKUP.protected_age_secThe age of the oldest PTS record protected by BACKUP schedulesSecondsGAUGESECONDSAVGNONE APPLICATIONschedules.BACKUP.protected_record_countNumber of PTS records held by BACKUP schedulesRecordsGAUGECOUNTAVGNONE APPLICATIONschedules.BACKUP.startedNumber of BACKUP jobs startedJobsCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE diff --git a/pkg/backup/BUILD.bazel b/pkg/backup/BUILD.bazel index 74ecc017e42..ebd243e860a 100644 --- a/pkg/backup/BUILD.bazel +++ b/pkg/backup/BUILD.bazel @@ -195,6 +195,7 @@ go_test( "restore_span_covering_test.go", "restore_test.go", "revision_reader_test.go", + "schedule_exec_test.go", "schedule_pts_chaining_test.go", "show_test.go", "system_schema_test.go", @@ -314,6 +315,7 @@ go_test( "//pkg/util/log", "//pkg/util/log/eventpb", "//pkg/util/log/logpb", + "//pkg/util/metric", "//pkg/util/mon", "//pkg/util/protoutil", "//pkg/util/randutil", diff --git a/pkg/backup/schedule_exec.go b/pkg/backup/schedule_exec.go index 72ca4995983..6a15412ae29 100644 --- a/pkg/backup/schedule_exec.go +++ b/pkg/backup/schedule_exec.go @@ -37,7 +37,8 @@ type backupMetrics struct { *jobs.ExecutorPTSMetrics // TODO(rui): move this to the backup job so it can be controlled by the // updates_cluster_monitoring_metrics option. 
- RpoMetric *metric.Gauge + RpoMetric *metric.Gauge + RpoTenantMetric *metric.GaugeVec } var _ metric.Struct = &backupMetrics{} @@ -360,6 +361,12 @@ func (e *scheduledBackupExecutor) backupSucceeded( // for monitoring an RPO SLA, update that metric. if args.UpdatesLastBackupMetric { e.metrics.RpoMetric.Update(details.(jobspb.BackupDetails).EndTime.GoTime().Unix()) + if details.(jobspb.BackupDetails).SpecificTenantIds != nil { + for _, tenantID := range details.(jobspb.BackupDetails).SpecificTenantIds { + e.metrics.RpoTenantMetric.Update(map[string]string{"tenant_id": tenantID.String()}, + details.(jobspb.BackupDetails).EndTime.GoTime().Unix()) + } + } } if args.UnpauseOnSuccess == jobspb.InvalidScheduleID { @@ -578,6 +585,12 @@ func init() { Measurement: "Jobs", Unit: metric.Unit_TIMESTAMP_SEC, }), + RpoTenantMetric: metric.NewExportedGaugeVec(metric.Metadata{ + Name: "schedules.BACKUP.last-completed-time-by-virtual_cluster", + Help: "The unix timestamp of the most recently completed host scheduled backup by virtual cluster specified as maintaining this metric", + Measurement: "Jobs", + Unit: metric.Unit_TIMESTAMP_SEC, + }, []string{"tenant_id"}), }, }, nil }) diff --git a/pkg/backup/schedule_exec_test.go b/pkg/backup/schedule_exec_test.go new file mode 100644 index 00000000000..91ec69188b6 --- /dev/null +++ b/pkg/backup/schedule_exec_test.go @@ -0,0 +1,105 @@ +// Copyright 2025 The Cockroach Authors. +// +// Use of this software is governed by the CockroachDB Software License +// included in the /LICENSE file. 
+
+package backup
+
+import (
+	"context"
+	"testing"
+
+	"github.com/cockroachdb/cockroach/pkg/backup/backuppb"
+	"github.com/cockroachdb/cockroach/pkg/jobs"
+	"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
+	"github.com/cockroachdb/cockroach/pkg/roachpb"
+	"github.com/cockroachdb/cockroach/pkg/util/hlc"
+	"github.com/cockroachdb/cockroach/pkg/util/leaktest"
+	"github.com/cockroachdb/cockroach/pkg/util/metric"
+	pbtypes "github.com/gogo/protobuf/types"
+	"github.com/stretchr/testify/require"
+)
+
+// TestBackupSucceededUpdatesMetrics checks that backupSucceeded records the
+// backup's end time (in unix seconds) in RpoMetric and, when the backup
+// details carry SpecificTenantIds, in one RpoTenantMetric child per tenant.
+func TestBackupSucceededUpdatesMetrics(t *testing.T) {
+	defer leaktest.AfterTest(t)()
+	ctx := context.Background()
+	executor := &scheduledBackupExecutor{
+		metrics: backupMetrics{
+			RpoMetric:       metric.NewGauge(metric.Metadata{}),
+			RpoTenantMetric: metric.NewExportedGaugeVec(metric.Metadata{}, []string{"tenant_id"}),
+		},
+	}
+
+	t.Run("updates RPO metric", func(t *testing.T) {
+		schedule := createSchedule(t, true)
+		endTime := hlc.Timestamp{WallTime: hlc.UnixNano()}
+		details := jobspb.BackupDetails{EndTime: endTime}
+
+		err := executor.backupSucceeded(ctx, nil, schedule, details, nil)
+		require.NoError(t, err)
+		require.Equal(t, endTime.GoTime().Unix(), executor.metrics.RpoMetric.Value())
+	})
+
+	t.Run("updates RPO tenant metric", func(t *testing.T) {
+		schedule := createSchedule(t, true)
+		tenantIDs := mustMakeTenantIDs(t, 1, 2)
+		endTime := hlc.Timestamp{WallTime: hlc.UnixNano()}
+		details := jobspb.BackupDetails{
+			EndTime:           endTime,
+			SpecificTenantIds: tenantIDs,
+		}
+
+		err := executor.backupSucceeded(ctx, nil, schedule, details, nil)
+		require.NoError(t, err)
+
+		// backupSucceeded labels each child with tenantID.String(); this test
+		// expects tenant 1 to render as "system" and tenant 2 as "2".
+		expectedTenantIDs := []string{"system", "2"}
+		verifyRPOTenantMetricLabels(t, executor.metrics.RpoTenantMetric, expectedTenantIDs)
+		verifyRPOTenantMetricGaugeValue(t, executor.metrics.RpoTenantMetric, details.EndTime)
+	})
+}
+
+// createSchedule returns a new scheduled job whose execution arguments are a
+// marshaled ScheduledBackupExecutionArgs with UpdatesLastBackupMetric set to
+// updatesLastBackupMetric.
+func createSchedule(t *testing.T, updatesLastBackupMetric bool) *jobs.ScheduledJob {
+	t.Helper()
+	schedule := jobs.NewScheduledJob(nil)
+
+	args := &backuppb.ScheduledBackupExecutionArgs{
+		UpdatesLastBackupMetric: updatesLastBackupMetric,
+	}
+	// Named packedArgs rather than any so the predeclared any type is not
+	// shadowed inside this function.
+	packedArgs, err := pbtypes.MarshalAny(args)
+	require.NoError(t, err)
+	schedule.SetExecutionDetails(
+		schedule.ExecutorType(), jobspb.ExecutionArguments{Args: packedArgs},
+	)
+	return schedule
+}
+
+// mustMakeTenantIDs converts ids into tenant IDs via roachpb.MakeTenantID,
+// failing the test immediately if any conversion returns an error.
+func mustMakeTenantIDs(t *testing.T, ids ...int) []roachpb.TenantID {
+	t.Helper()
+	var tenantIDs []roachpb.TenantID
+	for _, id := range ids {
+		tid, err := roachpb.MakeTenantID(uint64(id))
+		require.NoError(t, err)
+		tenantIDs = append(tenantIDs, tid)
+	}
+	return tenantIDs
+}
+
+// verifyRPOTenantMetricLabels asserts that the "tenant_id" label values
+// exported by gaugeVec are exactly expectedTenantIDs, in any order.
+func verifyRPOTenantMetricLabels(
+	t *testing.T, gaugeVec *metric.GaugeVec, expectedTenantIDs []string,
+) {
+	t.Helper()
+	var actualTenantIDs []string
+	for _, promMetric := range gaugeVec.ToPrometheusMetrics() {
+		for _, label := range promMetric.GetLabel() {
+			if label.GetName() == "tenant_id" {
+				actualTenantIDs = append(actualTenantIDs, label.GetValue())
+			}
+		}
+	}
+	require.ElementsMatch(t, expectedTenantIDs, actualTenantIDs)
+}
+
+// verifyRPOTenantMetricGaugeValue asserts that gaugeVec exports at least one
+// child gauge and that every exported child holds endTime expressed in unix
+// seconds.
+func verifyRPOTenantMetricGaugeValue(
+	t *testing.T, gaugeVec *metric.GaugeVec, endTime hlc.Timestamp,
+) {
+	t.Helper()
+	prometheusMetrics := gaugeVec.ToPrometheusMetrics()
+	// Guard against a vacuous pass: with no exported children the loop below
+	// would not assert anything.
+	require.NotEmpty(t, prometheusMetrics)
+	expected := float64(endTime.GoTime().Unix())
+	for _, promMetric := range prometheusMetrics {
+		require.Equal(t, expected, promMetric.Gauge.GetValue())
+	}
+}