Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

backup: Export tenant backup schedule metric #141156

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/generated/metrics/metrics.html
Original file line number Diff line number Diff line change
Expand Up @@ -1623,6 +1623,7 @@
<tr><td>APPLICATION</td><td>rpc.connection.unhealthy_nanos</td><td>Gauge of nanoseconds of unhealthy connection time.<br/><br/>On the prometheus endpoint scraped with the cluster setting &#39;server.child_metrics.enabled&#39; set,<br/>the constituent parts of this metric are available on a per-peer basis and one can read off<br/>for how long a given peer has been unreachable</td><td>Nanoseconds</td><td>GAUGE</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>schedules.BACKUP.failed</td><td>Number of BACKUP jobs failed</td><td>Jobs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>schedules.BACKUP.last-completed-time</td><td>The unix timestamp of the most recently completed backup by a schedule specified as maintaining this metric</td><td>Jobs</td><td>GAUGE</td><td>TIMESTAMP_SEC</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>schedules.BACKUP.last-completed-time-by-virtual_cluster</td><td>The unix timestamp of the most recently completed host scheduled backup by virtual cluster specified as maintaining this metric</td><td>Jobs</td><td>GAUGE</td><td>TIMESTAMP_SEC</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>schedules.BACKUP.protected_age_sec</td><td>The age of the oldest PTS record protected by BACKUP schedules</td><td>Seconds</td><td>GAUGE</td><td>SECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>schedules.BACKUP.protected_record_count</td><td>Number of PTS records held by BACKUP schedules</td><td>Records</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>schedules.BACKUP.started</td><td>Number of BACKUP jobs started</td><td>Jobs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
Expand Down
2 changes: 2 additions & 0 deletions pkg/backup/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,7 @@ go_test(
"restore_span_covering_test.go",
"restore_test.go",
"revision_reader_test.go",
"schedule_exec_test.go",
"schedule_pts_chaining_test.go",
"show_test.go",
"system_schema_test.go",
Expand Down Expand Up @@ -314,6 +315,7 @@ go_test(
"//pkg/util/log",
"//pkg/util/log/eventpb",
"//pkg/util/log/logpb",
"//pkg/util/metric",
"//pkg/util/mon",
"//pkg/util/protoutil",
"//pkg/util/randutil",
Expand Down
15 changes: 14 additions & 1 deletion pkg/backup/schedule_exec.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@ type backupMetrics struct {
*jobs.ExecutorPTSMetrics
// TODO(rui): move this to the backup job so it can be controlled by the
// updates_cluster_monitoring_metrics option.
RpoMetric *metric.Gauge
RpoMetric *metric.Gauge
RpoTenantMetric *metric.GaugeVec
}

var _ metric.Struct = &backupMetrics{}
Expand Down Expand Up @@ -360,6 +361,12 @@ func (e *scheduledBackupExecutor) backupSucceeded(
// for monitoring an RPO SLA, update that metric.
if args.UpdatesLastBackupMetric {
e.metrics.RpoMetric.Update(details.(jobspb.BackupDetails).EndTime.GoTime().Unix())
if details.(jobspb.BackupDetails).SpecificTenantIds != nil {
for _, tenantID := range details.(jobspb.BackupDetails).SpecificTenantIds {
e.metrics.RpoTenantMetric.Update(map[string]string{"tenant_id": tenantID.String()},
details.(jobspb.BackupDetails).EndTime.GoTime().Unix())
}
}
}

if args.UnpauseOnSuccess == jobspb.InvalidScheduleID {
Expand Down Expand Up @@ -578,6 +585,12 @@ func init() {
Measurement: "Jobs",
Unit: metric.Unit_TIMESTAMP_SEC,
}),
RpoTenantMetric: metric.NewExportedGaugeVec(metric.Metadata{
Name: "schedules.BACKUP.last-completed-time-by-virtual_cluster",
Help: "The unix timestamp of the most recently completed host scheduled backup by virtual cluster specified as maintaining this metric",
Measurement: "Jobs",
Unit: metric.Unit_TIMESTAMP_SEC,
}, []string{"tenant_id"}),
},
}, nil
})
Expand Down
105 changes: 105 additions & 0 deletions pkg/backup/schedule_exec_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
// Copyright 2025 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.

package backup

import (
"context"
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
"testing"

"github.com/cockroachdb/cockroach/pkg/backup/backuppb"
"github.com/cockroachdb/cockroach/pkg/jobs"
"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/metric"
pbtypes "github.com/gogo/protobuf/types"
"github.com/stretchr/testify/require"
)

// TestBackupSucceededUpdatesMetrics checks that backupSucceeded records the
// backup end time in the executor RPO metrics when the schedule is flagged as
// updating the last-backup metric. The executor is built directly with fresh
// gauges and shared by both subtests.
func TestBackupSucceededUpdatesMetrics(t *testing.T) {
defer leaktest.AfterTest(t)()
ctx := context.Background()
executor := &scheduledBackupExecutor{
metrics: backupMetrics{
RpoMetric: metric.NewGauge(metric.Metadata{}),
RpoTenantMetric: metric.NewExportedGaugeVec(metric.Metadata{}, []string{"tenant_id"}),
},
}

// Without specific tenant IDs in the details, the plain gauge is expected to
// hold the end time expressed in unix seconds.
t.Run("updates RPO metric", func(t *testing.T) {
schedule := createSchedule(t, true)
endTime := hlc.Timestamp{WallTime: hlc.UnixNano()}
details := jobspb.BackupDetails{EndTime: endTime}

err := executor.backupSucceeded(ctx, nil, schedule, details, nil)
require.NoError(t, err)
require.Equal(t, endTime.GoTime().Unix(), executor.metrics.RpoMetric.Value())
})

// With specific tenant IDs in the details, the gauge vector is expected to
// export one tenant_id label per tenant, each carrying the end time.
t.Run("updates RPO tenant metric", func(t *testing.T) {
schedule := createSchedule(t, true)
tenantIDs := mustMakeTenantIDs(t, 1, 2)
endTime := hlc.Timestamp{WallTime: hlc.UnixNano()}
details := jobspb.BackupDetails{
EndTime: endTime,
SpecificTenantIds: tenantIDs,
}

err := executor.backupSucceeded(ctx, nil, schedule, details, nil)
require.NoError(t, err)

// Tenant ID 1 is expected under the label value system rather than 1.
expectedTenantIDs := []string{"system", "2"}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: also verify that executor.metrics.RpoMetric.Value() was also updated in this case?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think there is a Value() method in RpoMetric. I verified the Gauge metric value in a separate function.

verifyRPOTenantMetricLabels(t, executor.metrics.RpoTenantMetric, expectedTenantIDs)
verifyRPOTenantMetricGaugeValue(t, executor.metrics.RpoTenantMetric, details.EndTime)
})
}

// createSchedule builds a ScheduledJob whose execution arguments are a
// marshaled ScheduledBackupExecutionArgs carrying the given
// updatesLastBackupMetric flag. The test fails immediately if the arguments
// cannot be marshaled.
func createSchedule(t *testing.T, updatesLastBackupMetric bool) *jobs.ScheduledJob {
	// Attribute assertion failures to the calling test, not to this helper.
	t.Helper()

	schedule := jobs.NewScheduledJob(nil)

	args := &backuppb.ScheduledBackupExecutionArgs{
		UpdatesLastBackupMetric: updatesLastBackupMetric,
	}
	// Named packedArgs so the predeclared identifier `any` is not shadowed.
	packedArgs, err := pbtypes.MarshalAny(args)
	require.NoError(t, err)
	schedule.SetExecutionDetails(schedule.ExecutorType(), jobspb.ExecutionArguments{Args: packedArgs})
	return schedule
}

// mustMakeTenantIDs converts each of ids into a roachpb.TenantID, failing the
// test on the first value that roachpb.MakeTenantID rejects. When called with
// no ids the result is a nil slice.
func mustMakeTenantIDs(t *testing.T, ids ...int) []roachpb.TenantID {
	// Deliberately left nil (not pre-sized) so an empty input yields nil.
	var out []roachpb.TenantID
	for i := range ids {
		tenantID, err := roachpb.MakeTenantID(uint64(ids[i]))
		require.NoError(t, err)
		out = append(out, tenantID)
	}
	return out
}

// verifyRPOTenantMetricLabels asserts that the tenant_id label values exported
// by metric are exactly expectedTenantIDs, compared with require.ElementsMatch.
func verifyRPOTenantMetricLabels(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could you also pass in and verify the expected end time?

t *testing.T, metric *metric.GaugeVec, expectedTenantIDs []string,
) {
prometheusMetrics := metric.ToPrometheusMetrics()
// Gather the value of every tenant_id label across all exported series;
// labels with any other name are ignored.
var actualTenantIDs []string
for _, promMetric := range prometheusMetrics {
labels := promMetric.GetLabel()
for _, label := range labels {
if label.GetName() == "tenant_id" {
actualTenantIDs = append(actualTenantIDs, label.GetValue())
}
}
}
require.ElementsMatch(t, expectedTenantIDs, actualTenantIDs)
}

// verifyRPOTenantMetricGaugeValue asserts that every series exported by metric
// reports endTime, converted to unix seconds, as its gauge value. It also
// fails when metric exports no series at all, so the check cannot pass
// vacuously.
func verifyRPOTenantMetricGaugeValue(t *testing.T, metric *metric.GaugeVec, endTime hlc.Timestamp) {
	// Attribute assertion failures to the calling test, not to this helper.
	t.Helper()

	prometheusMetrics := metric.ToPrometheusMetrics()
	// Without this guard an empty export would skip the loop and succeed.
	require.NotEmpty(t, prometheusMetrics)

	// Prometheus gauge values are float64; compute the expectation once.
	expected := float64(endTime.GoTime().Unix())
	for _, promMetric := range prometheusMetrics {
		require.Equal(t, expected, promMetric.Gauge.GetValue())
	}
}
Loading