From 95e2cda527efcaa8d42fef3a990bac668548cb5d Mon Sep 17 00:00:00 2001 From: Shiming Zhang Date: Thu, 30 Nov 2023 16:35:58 +0800 Subject: [PATCH] Fix --- pkg/kwok/controllers/node_lease_controller.go | 27 ++++++++++++++++ .../runtime/binary/cluster_snapshot.go | 16 ++++++++++ .../runtime/compose/cluster_snapshot.go | 32 +++++++++++++++++++ pkg/kwokctl/runtime/kind/cluster_snapshot.go | 3 ++ test/e2e/snapshot.go | 6 ---- .../testdata/binary/snapshot_restore_etcd.txt | 16 +++++----- .../testdata/docker/snapshot_restore_etcd.txt | 10 +++--- .../nerdctl/snapshot_restore_etcd.txt | 9 +++--- .../testdata/podman/snapshot_restore_etcd.txt | 10 +++--- 9 files changed, 100 insertions(+), 29 deletions(-) diff --git a/pkg/kwok/controllers/node_lease_controller.go b/pkg/kwok/controllers/node_lease_controller.go index a7a3619cc5..9e63e18429 100644 --- a/pkg/kwok/controllers/node_lease_controller.go +++ b/pkg/kwok/controllers/node_lease_controller.go @@ -217,6 +217,24 @@ func (c *NodeLeaseController) sync(ctx context.Context, nodeName string) { logger.Info("Creating lease") latestLease, err := c.ensureLease(ctx, nodeName) if err != nil { + if apierrors.IsAlreadyExists(err) { + logger.Error("failed to create lease, lease already exists", err) + + _, err = c.syncLease(ctx, nodeName) + if err != nil { + logger.Error("failed to sync lease", err) + return + } + if c.onNodeManagedFunc != nil { + if c.Held(nodeName) { + c.onNodeManagedFunc(nodeName) + } else { + logger.Warn("Lease not held") + } + } + return + } + if !apierrors.IsNotFound(err) || !c.latestLease.IsEmpty() { logger.Error("failed to create lease", err) return @@ -243,6 +261,15 @@ func (c *NodeLeaseController) sync(ctx context.Context, nodeName string) { } } +func (c *NodeLeaseController) syncLease(ctx context.Context, leaseName string) (*coordinationv1.Lease, error) { + lease, err := c.typedClient.CoordinationV1().Leases(corev1.NamespaceNodeLease).Get(ctx, leaseName, metav1.GetOptions{}) + if err != nil { + return nil, err + } + c.latestLease.Store(leaseName, lease) + return lease, nil +} + // ensureLease creates a lease if it does not exist func (c *NodeLeaseController) ensureLease(ctx context.Context, leaseName string) (*coordinationv1.Lease, error) { lease := &coordinationv1.Lease{ diff --git a/pkg/kwokctl/runtime/binary/cluster_snapshot.go b/pkg/kwokctl/runtime/binary/cluster_snapshot.go index 4cce3469d5..42864d1715 100644 --- a/pkg/kwokctl/runtime/binary/cluster_snapshot.go +++ b/pkg/kwokctl/runtime/binary/cluster_snapshot.go @@ -57,6 +57,22 @@ func (c *Cluster) SnapshotRestore(ctx context.Context, path string) error { logger.Error("Failed to start", err, "component", component) } } + + components := []string{ + consts.ComponentKwokController, + consts.ComponentKubeControllerManager, + consts.ComponentKubeScheduler, + } + for _, component := range components { + err := c.StopComponent(ctx, component) + if err != nil { + logger.Error("Failed to stop", err, "component", component) + } + err = c.StartComponent(ctx, component) + if err != nil { + logger.Error("Failed to start", err, "component", component) + } + } }() etcdDataTmp := c.GetWorkdirPath("etcd-data") diff --git a/pkg/kwokctl/runtime/compose/cluster_snapshot.go b/pkg/kwokctl/runtime/compose/cluster_snapshot.go index 8a0fa625f3..eb4e429739 100644 --- a/pkg/kwokctl/runtime/compose/cluster_snapshot.go +++ b/pkg/kwokctl/runtime/compose/cluster_snapshot.go @@ -89,6 +89,22 @@ func (c *Cluster) SnapshotRestore(ctx context.Context, path string) error { logger.Error("Failed to start", err, "component", component) } } + + components := []string{ + consts.ComponentKwokController, + consts.ComponentKubeControllerManager, + consts.ComponentKubeScheduler, + } + for _, component := range components { + err := c.StopComponent(ctx, component) + if err != nil { + logger.Error("Failed to stop", err, "component", component) + } + err = c.StartComponent(ctx, component) + if err != nil { + logger.Error("Failed to start", err, "component", component) + } + } }() // Copy to container from host temporary directory @@ -139,6 +155,22 @@ func (c *Cluster) SnapshotRestore(ctx context.Context, path string) error { logger.Error("Failed to start", err, "component", component) } } + + components = []string{ + consts.ComponentKwokController, + consts.ComponentKubeControllerManager, + consts.ComponentKubeScheduler, + } + for _, component := range components { + err := c.StopComponent(ctx, component) + if err != nil { + logger.Error("Failed to stop", err, "component", component) + } + err = c.StartComponent(ctx, component) + if err != nil { + logger.Error("Failed to start", err, "component", component) + } + } }() } diff --git a/pkg/kwokctl/runtime/kind/cluster_snapshot.go b/pkg/kwokctl/runtime/kind/cluster_snapshot.go index 8349343c1d..c91d040f4e 100644 --- a/pkg/kwokctl/runtime/kind/cluster_snapshot.go +++ b/pkg/kwokctl/runtime/kind/cluster_snapshot.go @@ -78,6 +78,9 @@ func (c *Cluster) SnapshotRestore(ctx context.Context, path string) error { components := []string{ consts.ComponentKubeApiserver, + consts.ComponentKwokController, + consts.ComponentKubeControllerManager, + consts.ComponentKubeScheduler, } for _, component := range components { err := c.Exec(ctx, c.runtime, "exec", clusterName, "pkill", "-9", component) diff --git a/test/e2e/snapshot.go b/test/e2e/snapshot.go index efe48df6be..5651bfe1a9 100644 --- a/test/e2e/snapshot.go +++ b/test/e2e/snapshot.go @@ -20,7 +20,6 @@ import ( "context" "os" "testing" - "time" "github.com/google/go-cmp/cmp" corev1 "k8s.io/api/core/v1" @@ -91,11 +90,6 @@ func CaseSnapshot(kwokctlPath, clusterName string, tmpDir string) *features.Feat } return ctx }). - Assess("sleep", func(ctx context.Context, t *testing.T, config *envconf.Config) context.Context { - // TODO: remove this sleep - time.Sleep(60 * time.Second) - return ctx - }). Assess("delete pod0", helper.DeletePod(pod0)). Assess("delete node0", helper.DeleteNode(node)) } diff --git a/test/kwokctl/testdata/binary/snapshot_restore_etcd.txt b/test/kwokctl/testdata/binary/snapshot_restore_etcd.txt index 1363513488..835a421814 100644 --- a/test/kwokctl/testdata/binary/snapshot_restore_etcd.txt +++ b/test/kwokctl/testdata/binary/snapshot_restore_etcd.txt @@ -1,15 +1,15 @@ -kill $(cat /workdir/clusters//pids/kube-controller-manager.pid) -rm /workdir/clusters//pids/kube-controller-manager.pid -kill $(cat /workdir/clusters//pids/kube-scheduler.pid) -rm /workdir/clusters//pids/kube-scheduler.pid -kill $(cat /workdir/clusters//pids/kwok-controller.pid) -rm /workdir/clusters//pids/kwok-controller.pid -kill $(cat /workdir/clusters//pids/kube-apiserver.pid) -rm /workdir/clusters//pids/kube-apiserver.pid kill $(cat /workdir/clusters//pids/etcd.pid) rm /workdir/clusters//pids/etcd.pid +kill $(cat /workdir/clusters//pids/kube-apiserver.pid) +rm /workdir/clusters//pids/kube-apiserver.pid rm -rf /workdir/clusters//etcd-data # Download https://github.com/etcd-io/etcd/releases/download/v3.5.9/etcd-v3.5.9--. and extract etcdctl to /workdir/clusters//bin/etcdctl ETCDCTL_API=3 etcdctl --endpoints 127.0.0.1:32765 snapshot restore ./snapshot-empty- --data-dir /workdir/clusters//etcd-data rm -rf /workdir/clusters//etcd mv /workdir/clusters//etcd-data /workdir/clusters//etcd +kill $(cat /workdir/clusters//pids/kwok-controller.pid) +rm /workdir/clusters//pids/kwok-controller.pid +kill $(cat /workdir/clusters//pids/kube-controller-manager.pid) +rm /workdir/clusters//pids/kube-controller-manager.pid +kill $(cat /workdir/clusters//pids/kube-scheduler.pid) +rm /workdir/clusters//pids/kube-scheduler.pid diff --git a/test/kwokctl/testdata/docker/snapshot_restore_etcd.txt b/test/kwokctl/testdata/docker/snapshot_restore_etcd.txt index f07eb5faf0..eb44ab7da4 100644 --- a/test/kwokctl/testdata/docker/snapshot_restore_etcd.txt +++ b/test/kwokctl/testdata/docker/snapshot_restore_etcd.txt @@ -1,14 +1,14 @@ # Download https://github.com/etcd-io/etcd/releases/download/v3.5.9/etcd-v3.5.9--. and extract etcdctl to /workdir/clusters//bin/etcdctl ETCDCTL_API=3 etcdctl snapshot restore ./snapshot-empty- --data-dir /workdir/clusters//etcd-data -docker stop kwok--kube-controller-manager --time=0 -docker stop kwok--kube-scheduler --time=0 -docker stop kwok--kwok-controller --time=0 -docker stop kwok--kube-apiserver --time=0 docker stop kwok--etcd --time=0 +docker stop kwok--kube-apiserver --time=0 docker cp /workdir/clusters//etcd-data kwok--etcd:/ docker start kwok--etcd docker start kwok--kube-apiserver +docker stop kwok--kwok-controller --time=0 +docker start kwok--kwok-controller +docker stop kwok--kube-controller-manager --time=0 docker start kwok--kube-controller-manager +docker stop kwok--kube-scheduler --time=0 docker start kwok--kube-scheduler -docker start kwok--kwok-controller rm -rf /workdir/clusters//etcd-data diff --git a/test/kwokctl/testdata/nerdctl/snapshot_restore_etcd.txt b/test/kwokctl/testdata/nerdctl/snapshot_restore_etcd.txt index 60950838fa..12b72cc41d 100644 --- a/test/kwokctl/testdata/nerdctl/snapshot_restore_etcd.txt +++ b/test/kwokctl/testdata/nerdctl/snapshot_restore_etcd.txt @@ -2,15 +2,14 @@ ETCDCTL_API=3 etcdctl snapshot restore ./snapshot-empty- --data-dir /workdir/clusters//etcd-data nerdctl stop kwok--kube-apiserver --time=0 nerdctl cp /workdir/clusters//etcd-data kwok--etcd:/ -nerdctl stop kwok--kube-controller-manager --time=0 -nerdctl stop kwok--kube-scheduler --time=0 -nerdctl stop kwok--kwok-controller --time=0 -nerdctl stop kwok--kube-apiserver --time=0 nerdctl stop kwok--etcd --time=0 nerdctl start kwok--etcd nerdctl start kwok--kube-apiserver +nerdctl stop kwok--kwok-controller --time=0 +nerdctl start kwok--kwok-controller +nerdctl stop kwok--kube-controller-manager --time=0 nerdctl start kwok--kube-controller-manager +nerdctl stop kwok--kube-scheduler --time=0 nerdctl start kwok--kube-scheduler -nerdctl start kwok--kwok-controller nerdctl start kwok--kube-apiserver rm -rf /workdir/clusters//etcd-data diff --git a/test/kwokctl/testdata/podman/snapshot_restore_etcd.txt b/test/kwokctl/testdata/podman/snapshot_restore_etcd.txt index 5154a2513b..5cc902213a 100644 --- a/test/kwokctl/testdata/podman/snapshot_restore_etcd.txt +++ b/test/kwokctl/testdata/podman/snapshot_restore_etcd.txt @@ -1,14 +1,14 @@ # Download https://github.com/etcd-io/etcd/releases/download/v3.5.9/etcd-v3.5.9--. and extract etcdctl to /workdir/clusters//bin/etcdctl ETCDCTL_API=3 etcdctl snapshot restore ./snapshot-empty- --data-dir /workdir/clusters//etcd-data -podman stop kwok--kube-controller-manager --time=0 -podman stop kwok--kube-scheduler --time=0 -podman stop kwok--kwok-controller --time=0 -podman stop kwok--kube-apiserver --time=0 podman stop kwok--etcd --time=0 +podman stop kwok--kube-apiserver --time=0 podman cp /workdir/clusters//etcd-data kwok--etcd:/ podman start kwok--etcd podman start kwok--kube-apiserver +podman stop kwok--kwok-controller --time=0 +podman start kwok--kwok-controller +podman stop kwok--kube-controller-manager --time=0 podman start kwok--kube-controller-manager +podman stop kwok--kube-scheduler --time=0 podman start kwok--kube-scheduler -podman start kwok--kwok-controller rm -rf /workdir/clusters//etcd-data