Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add --cleanup-on-exit #64

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .changes/unreleased/Added-20250303-125045.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
kind: Added
body: New --cleanup-on-exit flag for `ydbops restart`. If specified, will intercept SIGTERM\SIGINT, safely finish current CMS batch and exit, cleaning up request
time: 2025-03-03T12:50:45.370454584+01:00
3 changes: 3 additions & 0 deletions .changes/unreleased/Removed-20250303-132624.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
kind: Removed
body: Removed an unimplemented --continue flag. Turned out it's not necessary, filtering by uptime etc is expressive and much simpler
time: 2025-03-03T13:26:24.60035851+01:00
29 changes: 4 additions & 25 deletions .golangci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ run:
# output configuration options
output:
# colored-line-number|line-number|json|tab|checkstyle, default is "colored-line-number"
formats: colored-line-number
formats:
- format: colored-line-number

# print lines of code with issue, default is true
print-issued-lines: true
Expand All @@ -43,12 +44,8 @@ linters-settings:
# default is false: such cases aren't reported by default.
check-blank: false
govet:
# report about shadowed variables
check-shadowing: false
fieldalignment: true
golint:
# minimal confidence for issues, default is 0.8
min-confidence: 0.8
enable:
- shadow # report about shadowed variables
gofmt:
# simplify code: gofmt with `-s` option, true by default
simplify: true
Expand All @@ -63,9 +60,6 @@ linters-settings:
min-len: 2
# minimal occurrences count to trigger, 3 by default
min-occurrences: 2
fieldalignment:
# print struct with more effective memory layout or not, false by default
suggest-new: true
misspell:
# Correct spellings using locale preferences for US or UK.
# Default is to use a neutral variety of English.
Expand Down Expand Up @@ -93,17 +87,7 @@ linters-settings:
- name: empty-block
- name: superfluous-else
- name: unreachable-code
unused:
# treat code as a program (not a library) and report unused exported identifiers; default is false.
# XXX: if you enable this setting, unused will report a lot of false-positives in text editors:
# if it's called for subdir of a project it can't find funcs usages. All text editor integrations
# with golangci-lint call it on a directory with the changed file.
check-exported: false
unparam:
# call graph construction algorithm (cha, rta). In general, use cha for libraries,
# and rta for programs with main packages. Default is cha.
algo: cha

# Inspect exported functions, default is false. Set to true if no external program/library imports your code.
# XXX: if you enable this setting, unparam will report a lot of false-positives in text editors:
# if it's called for subdir of a project it can't find external interfaces. All text editor integrations
Expand Down Expand Up @@ -173,9 +157,6 @@ linters-settings:
# The flag is passed to the ruleguard 'debug-group' argument.
# Default: ""
debug: 'emptyDecl'
# Deprecated, use 'failOn' param.
# If set to true, identical to failOn='all', otherwise failOn=''
failOnError: false
# Determines the behavior when an error occurs while parsing ruleguard files.
# If flag is not set, log error and skip rule files that contain an error.
# If flag is set, the value must be a comma-separated list of error conditions.
Expand Down Expand Up @@ -290,8 +271,6 @@ issues:
# Default value for this option is true.
exclude-use-default: true

# Maximum issues count per one linter. Set to 0 to disable. Default is 50.
max-per-linter: 0

# Maximum count of issues with the same text. Set to 0 to disable. Default is 3.
max-same-issues: 0
Expand Down
11 changes: 4 additions & 7 deletions pkg/rolling/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,10 @@ type RestartOptions struct {
NodesInflight int
DelayBetweenRestarts time.Duration
SuppressCompatibilityCheck bool
CleanupOnExit bool

RestartDuration int

Continue bool

SSHArgs []string

CustomSystemdUnitName string
Expand Down Expand Up @@ -81,11 +80,6 @@ Examples:
fs.IntVar(&o.CMSQueryInterval, "cms-query-interval", DefaultCMSQueryIntervalSeconds,
fmt.Sprintf("How often to query CMS while waiting for new permissions %v", DefaultCMSQueryIntervalSeconds))

fs.BoolVar(&o.Continue, "continue", false,
`Attempt to continue previous rolling restart, if there was one. The set of selected nodes
for this invocation must be the same as for the previous invocation, and this can not be verified at runtime since
the ydbops utility is stateless. Use at your own risk.`)

fs.IntVar(&o.RestartDuration, "duration", DefaultRestartDurationSeconds,
`CMS will release the node for maintenance for duration * restart-retry-number seconds. Any maintenance
after that would be considered a regular cluster failure`)
Expand All @@ -99,6 +93,9 @@ ydbops will try to figure out if you broke this rule by comparing before\after o

fs.DurationVar(&o.DelayBetweenRestarts, "delay-between-restarts", DefaultDelayBetweenRestarts,
`Delay between two consecutive restarts. E.g. '60s', '2m'. The number of simultaneous restarts is limited by 'nodes-inflight'.`)

fs.BoolVar(&o.CleanupOnExit, "cleanup-on-exit", true,
`When enabled, attempt to drop the maintenance task if the utility is killed by SIGTERM.`)
}

func (o *RestartOptions) GetRestartDuration() *durationpb.Duration {
Expand Down
73 changes: 54 additions & 19 deletions pkg/rolling/rolling.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,15 @@ package rolling

import (
"bytes"
"context"
"errors"
"fmt"
"math"
"os"
"os/signal"
"strings"
"sync"
"syscall"
"time"

"github.com/google/uuid"
Expand Down Expand Up @@ -91,16 +95,35 @@ func (e *executer) Execute() error {
restarter: e.restarter,
}

var err error
if e.opts.Continue {
e.logger.Info("Continue previous rolling restart")
err = r.DoRestartPrevious()
} else {
e.logger.Info("Start rolling restart")
err = r.DoRestart()
ctx, cancel := context.WithCancel(context.Background())
defer cancel()

if e.opts.CleanupOnExit {
sigCh := make(chan os.Signal, 1)
signal.Notify(sigCh, syscall.SIGINT, syscall.SIGTERM)

e.logger.Infof("Set up signal interception for SIGINT and SIGTERM")

go func() {
sig := <-sigCh
e.logger.Infof("Received signal: %v, canceling operations", sig)
cancel()
}()
}

if err != nil {
e.logger.Info("Start rolling restart")
err := r.DoRestart(ctx)

if errors.Is(err, context.Canceled) && e.opts.CleanupOnExit {
e.logger.Info("Operation was cancelled, cleaning up maintenance tasks")

if cleanupErr := r.cleanupRollingRestart(); cleanupErr != nil {
e.logger.Errorf("Failed to cleanup maintenance tasks on signal: %v", cleanupErr)
} else {
e.logger.Info("Successfully cleaned up maintenance tasks")
}
return err
} else if err != nil {
e.logger.Errorf("Failed to complete restart: %+v", err)
return err
}
Expand All @@ -109,14 +132,14 @@ func (e *executer) Execute() error {
return nil
}

func (r *Rolling) DoRestart() error {
func (r *Rolling) DoRestart(ctx context.Context) error {
state, err := r.prepareState()
if err != nil {
return err
}
r.state = state

if err = r.cleanupOldRollingRestarts(); err != nil {
if err = r.cleanupRollingRestart(); err != nil {
return err
}

Expand Down Expand Up @@ -177,14 +200,10 @@ func (r *Rolling) DoRestart() error {
return fmt.Errorf("failed to create maintenance task: %w", err)
}

return r.cmsWaitingLoop(task, len(nodesToRestart))
return r.cmsWaitingLoop(ctx, task, len(nodesToRestart))
}

func (r *Rolling) DoRestartPrevious() error {
return fmt.Errorf("--continue behavior not implemented yet")
}

func (r *Rolling) cmsWaitingLoop(task cms.MaintenanceTask, totalNodes int) error {
func (r *Rolling) cmsWaitingLoop(ctx context.Context, task cms.MaintenanceTask, totalNodes int) error {
var (
err error
delay time.Duration
Expand Down Expand Up @@ -216,7 +235,10 @@ func (r *Rolling) cmsWaitingLoop(task cms.MaintenanceTask, totalNodes int) error
}

r.logger.Infof("Wait next %s delay. Total node progress: %v out of %v", delay, restartedNodes, totalNodes)
time.Sleep(delay)

if err = waitOrCancel(ctx, delay); err != nil {
return err
}

r.logger.Infof("Refresh maintenance task with id: %s", taskID)
task, err = r.cms.RefreshMaintenanceTask(taskID)
Expand Down Expand Up @@ -438,8 +460,8 @@ func (r *Rolling) prepareState() (*state, error) {
}, nil
}

func (r *Rolling) cleanupOldRollingRestarts() error {
r.logger.Debugf("Will cleanup all previous maintenance tasks...")
func (r *Rolling) cleanupRollingRestart() error {
r.logger.Debugf("Will cleanup all maintenance tasks...")

previousTasks, err := r.cms.MaintenanceTasks(r.state.userSID)
if err != nil {
Expand Down Expand Up @@ -549,3 +571,16 @@ func (r *Rolling) logCompleteResult(result *Ydb_Maintenance.ManageActionResult)

r.logger.Debugf("Manage action result:\n%s", prettyprint.ResultToString(result))
}

func waitOrCancel(ctx context.Context, delay time.Duration) error {
timer := time.NewTimer(delay)
select {
case <-ctx.Done():
timer.Stop()
return ctx.Err()
case <-timer.C:
// Continue with operation, this was a regular wait
}

return nil
}
13 changes: 7 additions & 6 deletions tests/mock/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,9 @@ type fakeMaintenanceTask struct {
actionGroupStates []*ActionGroupStates
}

type AdditionalMockBehaviour struct {
type AdditionalTestBehaviour struct {
RestartNodesOnNewVersion string
SignalDelayMs int // Send SIGTERM after this delay in milliseconds
}

type YdbMock struct {
Expand All @@ -51,7 +52,7 @@ type YdbMock struct {
grpcServer *grpc.Server
caFile string
keyFile string
additionalMockBehaviour AdditionalMockBehaviour
additionalTestBehaviour AdditionalTestBehaviour

// This field contains the list of Nodes that is suitable to return
// to ListClusterNodes request from rolling restart.
Expand Down Expand Up @@ -174,15 +175,15 @@ func (s *YdbMock) CompleteAction(ctx context.Context, req *CompleteActionRequest
for _, completedActionUid := range req.ActionUids {
task := s.tasks[completedActionUid.TaskUid]

if s.additionalMockBehaviour.RestartNodesOnNewVersion != "" {
if s.additionalTestBehaviour.RestartNodesOnNewVersion != "" {
for _, actionGroup := range task.actionGroups {
lock := actionGroup.Actions[0].GetLockAction()
nodeHost := lock.Scope.GetHost()
nodeId := lock.Scope.GetNodeId()

for _, node := range s.nodes {
if node.NodeId == nodeId || node.Host == nodeHost {
node.Version = s.additionalMockBehaviour.RestartNodesOnNewVersion
node.Version = s.additionalTestBehaviour.RestartNodesOnNewVersion
}
}
}
Expand Down Expand Up @@ -302,6 +303,6 @@ func (s *YdbMock) Teardown() {
s.grpcServer.GracefulStop()
}

func (s *YdbMock) SetMockBehaviour(additionalMockBehaviour AdditionalMockBehaviour) {
s.additionalMockBehaviour = additionalMockBehaviour
func (s *YdbMock) SetMockBehaviour(additionalMockBehaviour AdditionalTestBehaviour) {
s.additionalTestBehaviour = additionalMockBehaviour
}
71 changes: 70 additions & 1 deletion tests/rolling_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -825,7 +825,7 @@ var _ = Describe("Test Rolling", func() {
Version: "23.3.1",
},
},
additionalMockBehaviour: &mock.AdditionalMockBehaviour{
additionalTestBehaviour: &mock.AdditionalTestBehaviour{
RestartNodesOnNewVersion: "24.1.1",
},
steps: []StepData{
Expand Down Expand Up @@ -881,5 +881,74 @@ var _ = Describe("Test Rolling", func() {
},
},
),
Entry("Interrupt rolling restart with SIGTERM", TestCase{
nodeConfiguration: [][]uint32{
{1, 2, 3},
},
nodeInfoMap: map[uint32]mock.TestNodeInfo{
1: {IsDynnode: false},
2: {IsDynnode: false},
3: {IsDynnode: false},
},
additionalTestBehaviour: &mock.AdditionalTestBehaviour{
SignalDelayMs: 500, // this is carefully crafted to fire just when `ydbops` waits for second node
},
steps: []StepData{
{
ydbopsInvocation: []string{
"--endpoint", "grpcs://localhost:2135",
"--verbose",
"--availability-mode", "strong",
"--user", mock.TestUser,
"--cms-query-interval", "1",
"run",
"--hosts", "1,2",
"--payload", filepath.Join(".", "mock", "noop-payload.sh"),
"--ca-file", filepath.Join(".", "test-data", "ssl-data", "ca.crt"),
"--cleanup-on-exit",
},
expectedRequests: []proto.Message{
&Ydb_Auth.LoginRequest{
User: mock.TestUser,
Password: mock.TestPassword,
},
&Ydb_Maintenance.ListClusterNodesRequest{},
&Ydb_Cms.ListDatabasesRequest{},
&Ydb_Discovery.WhoAmIRequest{},
&Ydb_Maintenance.ListMaintenanceTasksRequest{
User: &mock.TestUser,
},
&Ydb_Maintenance.CreateMaintenanceTaskRequest{
TaskOptions: &Ydb_Maintenance.MaintenanceTaskOptions{
TaskUid: "task-UUID-1",
Description: "Rolling restart maintenance task",
AvailabilityMode: Ydb_Maintenance.AvailabilityMode_AVAILABILITY_MODE_STRONG,
},
ActionGroups: mock.MakeActionGroupsFromNodeIds(1, 2),
},
&Ydb_Maintenance.CompleteActionRequest{
ActionUids: []*Ydb_Maintenance.ActionUid{
{
TaskUid: "task-UUID-1",
GroupId: "group-UUID-1",
ActionId: "action-UUID-1",
},
},
},
&Ydb_Maintenance.ListMaintenanceTasksRequest{
User: &mock.TestUser,
},
&Ydb_Maintenance.GetMaintenanceTaskRequest{
TaskUid: "task-UUID-1",
},
&Ydb_Maintenance.DropMaintenanceTaskRequest{
TaskUid: "task-UUID-1",
},
},
expectedOutputRegexps: []string{},
},
},
},
),
)
})
Loading
Loading