Skip to content

Commit

Permalink
Fix
Browse files Browse the repository at this point in the history
  • Loading branch information
wzshiming committed Nov 30, 2023
1 parent d1902d2 commit 95e2cda
Show file tree
Hide file tree
Showing 9 changed files with 100 additions and 29 deletions.
27 changes: 27 additions & 0 deletions pkg/kwok/controllers/node_lease_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,24 @@ func (c *NodeLeaseController) sync(ctx context.Context, nodeName string) {
logger.Info("Creating lease")
latestLease, err := c.ensureLease(ctx, nodeName)
if err != nil {
if apierrors.IsAlreadyExists(err) {
logger.Error("failed to create lease, lease already exists", err)

_, err = c.syncLease(ctx, nodeName)
if err != nil {
logger.Error("failed to sync lease", err)
return
}
if c.onNodeManagedFunc != nil {
if c.Held(nodeName) {
c.onNodeManagedFunc(nodeName)
} else {
logger.Warn("Lease not held")
}
}
return
}

if !apierrors.IsNotFound(err) || !c.latestLease.IsEmpty() {
logger.Error("failed to create lease", err)
return
Expand All @@ -243,6 +261,15 @@ func (c *NodeLeaseController) sync(ctx context.Context, nodeName string) {
}
}

// syncLease fetches the lease named leaseName from the
// corev1.NamespaceNodeLease namespace via the typed client and stores the
// result in c.latestLease under that same name.
//
// It returns the fetched lease on success. If the Get call fails, it returns
// a nil lease together with the unmodified error and does not touch
// c.latestLease.
func (c *NodeLeaseController) syncLease(ctx context.Context, leaseName string) (*coordinationv1.Lease, error) {
	leaseClient := c.typedClient.CoordinationV1().Leases(corev1.NamespaceNodeLease)

	current, getErr := leaseClient.Get(ctx, leaseName, metav1.GetOptions{})
	if getErr != nil {
		return nil, getErr
	}

	// Remember the fetched lease under its name before handing it back.
	c.latestLease.Store(leaseName, current)
	return current, nil
}

// ensureLease creates a lease if it does not exist
func (c *NodeLeaseController) ensureLease(ctx context.Context, leaseName string) (*coordinationv1.Lease, error) {
lease := &coordinationv1.Lease{
Expand Down
16 changes: 16 additions & 0 deletions pkg/kwokctl/runtime/binary/cluster_snapshot.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,22 @@ func (c *Cluster) SnapshotRestore(ctx context.Context, path string) error {
logger.Error("Failed to start", err, "component", component)
}
}

components := []string{
consts.ComponentKwokController,
consts.ComponentKubeControllerManager,
consts.ComponentKubeScheduler,
}
for _, component := range components {
err := c.StopComponent(ctx, component)
if err != nil {
logger.Error("Failed to stop", err, "component", component)
}
err = c.StartComponent(ctx, component)
if err != nil {
logger.Error("Failed to start", err, "component", component)
}
}
}()

etcdDataTmp := c.GetWorkdirPath("etcd-data")
Expand Down
32 changes: 32 additions & 0 deletions pkg/kwokctl/runtime/compose/cluster_snapshot.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,22 @@ func (c *Cluster) SnapshotRestore(ctx context.Context, path string) error {
logger.Error("Failed to start", err, "component", component)
}
}

components := []string{
consts.ComponentKwokController,
consts.ComponentKubeControllerManager,
consts.ComponentKubeScheduler,
}
for _, component := range components {
err := c.StopComponent(ctx, component)
if err != nil {
logger.Error("Failed to stop", err, "component", component)
}
err = c.StartComponent(ctx, component)
if err != nil {
logger.Error("Failed to start", err, "component", component)
}
}
}()

// Copy to container from host temporary directory
Expand Down Expand Up @@ -139,6 +155,22 @@ func (c *Cluster) SnapshotRestore(ctx context.Context, path string) error {
logger.Error("Failed to start", err, "component", component)
}
}

components = []string{
consts.ComponentKwokController,
consts.ComponentKubeControllerManager,
consts.ComponentKubeScheduler,
}
for _, component := range components {
err := c.StopComponent(ctx, component)
if err != nil {
logger.Error("Failed to stop", err, "component", component)
}
err = c.StartComponent(ctx, component)
if err != nil {
logger.Error("Failed to start", err, "component", component)
}
}
}()
}

Expand Down
3 changes: 3 additions & 0 deletions pkg/kwokctl/runtime/kind/cluster_snapshot.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,9 @@ func (c *Cluster) SnapshotRestore(ctx context.Context, path string) error {

components := []string{
consts.ComponentKubeApiserver,
consts.ComponentKwokController,
consts.ComponentKubeControllerManager,
consts.ComponentKubeScheduler,
}
for _, component := range components {
err := c.Exec(ctx, c.runtime, "exec", clusterName, "pkill", "-9", component)
Expand Down
6 changes: 0 additions & 6 deletions test/e2e/snapshot.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ import (
"context"
"os"
"testing"
"time"

"github.com/google/go-cmp/cmp"
corev1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -91,11 +90,6 @@ func CaseSnapshot(kwokctlPath, clusterName string, tmpDir string) *features.Feat
}
return ctx
}).
Assess("sleep", func(ctx context.Context, t *testing.T, config *envconf.Config) context.Context {
// TODO: remove this sleep
time.Sleep(60 * time.Second)
return ctx
}).
Assess("delete pod0", helper.DeletePod(pod0)).
Assess("delete node0", helper.DeleteNode(node))
}
16 changes: 8 additions & 8 deletions test/kwokctl/testdata/binary/snapshot_restore_etcd.txt
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
kill $(cat <ROOT_DIR>/workdir/clusters/<CLUSTER_NAME>/pids/kube-controller-manager.pid)
rm <ROOT_DIR>/workdir/clusters/<CLUSTER_NAME>/pids/kube-controller-manager.pid
kill $(cat <ROOT_DIR>/workdir/clusters/<CLUSTER_NAME>/pids/kube-scheduler.pid)
rm <ROOT_DIR>/workdir/clusters/<CLUSTER_NAME>/pids/kube-scheduler.pid
kill $(cat <ROOT_DIR>/workdir/clusters/<CLUSTER_NAME>/pids/kwok-controller.pid)
rm <ROOT_DIR>/workdir/clusters/<CLUSTER_NAME>/pids/kwok-controller.pid
kill $(cat <ROOT_DIR>/workdir/clusters/<CLUSTER_NAME>/pids/kube-apiserver.pid)
rm <ROOT_DIR>/workdir/clusters/<CLUSTER_NAME>/pids/kube-apiserver.pid
kill $(cat <ROOT_DIR>/workdir/clusters/<CLUSTER_NAME>/pids/etcd.pid)
rm <ROOT_DIR>/workdir/clusters/<CLUSTER_NAME>/pids/etcd.pid
kill $(cat <ROOT_DIR>/workdir/clusters/<CLUSTER_NAME>/pids/kube-apiserver.pid)
rm <ROOT_DIR>/workdir/clusters/<CLUSTER_NAME>/pids/kube-apiserver.pid
rm -rf <ROOT_DIR>/workdir/clusters/<CLUSTER_NAME>/etcd-data
# Download https://github.com/etcd-io/etcd/releases/download/v3.5.9/etcd-v3.5.9-<OS>-<ARCH>.<TAR> and extract etcdctl to <ROOT_DIR>/workdir/clusters/<CLUSTER_NAME>/bin/etcdctl
ETCDCTL_API=3 etcdctl --endpoints 127.0.0.1:32765 snapshot restore ./snapshot-empty-<CLUSTER_NAME> --data-dir <ROOT_DIR>/workdir/clusters/<CLUSTER_NAME>/etcd-data
rm -rf <ROOT_DIR>/workdir/clusters/<CLUSTER_NAME>/etcd
mv <ROOT_DIR>/workdir/clusters/<CLUSTER_NAME>/etcd-data <ROOT_DIR>/workdir/clusters/<CLUSTER_NAME>/etcd
kill $(cat <ROOT_DIR>/workdir/clusters/<CLUSTER_NAME>/pids/kwok-controller.pid)
rm <ROOT_DIR>/workdir/clusters/<CLUSTER_NAME>/pids/kwok-controller.pid
kill $(cat <ROOT_DIR>/workdir/clusters/<CLUSTER_NAME>/pids/kube-controller-manager.pid)
rm <ROOT_DIR>/workdir/clusters/<CLUSTER_NAME>/pids/kube-controller-manager.pid
kill $(cat <ROOT_DIR>/workdir/clusters/<CLUSTER_NAME>/pids/kube-scheduler.pid)
rm <ROOT_DIR>/workdir/clusters/<CLUSTER_NAME>/pids/kube-scheduler.pid
10 changes: 5 additions & 5 deletions test/kwokctl/testdata/docker/snapshot_restore_etcd.txt
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
# Download https://github.com/etcd-io/etcd/releases/download/v3.5.9/etcd-v3.5.9-<OS>-<ARCH>.<TAR> and extract etcdctl to <ROOT_DIR>/workdir/clusters/<CLUSTER_NAME>/bin/etcdctl
ETCDCTL_API=3 etcdctl snapshot restore ./snapshot-empty-<CLUSTER_NAME> --data-dir <ROOT_DIR>/workdir/clusters/<CLUSTER_NAME>/etcd-data
docker stop kwok-<CLUSTER_NAME>-kube-controller-manager --time=0
docker stop kwok-<CLUSTER_NAME>-kube-scheduler --time=0
docker stop kwok-<CLUSTER_NAME>-kwok-controller --time=0
docker stop kwok-<CLUSTER_NAME>-kube-apiserver --time=0
docker stop kwok-<CLUSTER_NAME>-etcd --time=0
docker stop kwok-<CLUSTER_NAME>-kube-apiserver --time=0
docker cp <ROOT_DIR>/workdir/clusters/<CLUSTER_NAME>/etcd-data kwok-<CLUSTER_NAME>-etcd:/
docker start kwok-<CLUSTER_NAME>-etcd
docker start kwok-<CLUSTER_NAME>-kube-apiserver
docker stop kwok-<CLUSTER_NAME>-kwok-controller --time=0
docker start kwok-<CLUSTER_NAME>-kwok-controller
docker stop kwok-<CLUSTER_NAME>-kube-controller-manager --time=0
docker start kwok-<CLUSTER_NAME>-kube-controller-manager
docker stop kwok-<CLUSTER_NAME>-kube-scheduler --time=0
docker start kwok-<CLUSTER_NAME>-kube-scheduler
docker start kwok-<CLUSTER_NAME>-kwok-controller
rm -rf <ROOT_DIR>/workdir/clusters/<CLUSTER_NAME>/etcd-data
9 changes: 4 additions & 5 deletions test/kwokctl/testdata/nerdctl/snapshot_restore_etcd.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,14 @@
ETCDCTL_API=3 etcdctl snapshot restore ./snapshot-empty-<CLUSTER_NAME> --data-dir <ROOT_DIR>/workdir/clusters/<CLUSTER_NAME>/etcd-data
nerdctl stop kwok-<CLUSTER_NAME>-kube-apiserver --time=0
nerdctl cp <ROOT_DIR>/workdir/clusters/<CLUSTER_NAME>/etcd-data kwok-<CLUSTER_NAME>-etcd:/
nerdctl stop kwok-<CLUSTER_NAME>-kube-controller-manager --time=0
nerdctl stop kwok-<CLUSTER_NAME>-kube-scheduler --time=0
nerdctl stop kwok-<CLUSTER_NAME>-kwok-controller --time=0
nerdctl stop kwok-<CLUSTER_NAME>-kube-apiserver --time=0
nerdctl stop kwok-<CLUSTER_NAME>-etcd --time=0
nerdctl start kwok-<CLUSTER_NAME>-etcd
nerdctl start kwok-<CLUSTER_NAME>-kube-apiserver
nerdctl stop kwok-<CLUSTER_NAME>-kwok-controller --time=0
nerdctl start kwok-<CLUSTER_NAME>-kwok-controller
nerdctl stop kwok-<CLUSTER_NAME>-kube-controller-manager --time=0
nerdctl start kwok-<CLUSTER_NAME>-kube-controller-manager
nerdctl stop kwok-<CLUSTER_NAME>-kube-scheduler --time=0
nerdctl start kwok-<CLUSTER_NAME>-kube-scheduler
nerdctl start kwok-<CLUSTER_NAME>-kwok-controller
nerdctl start kwok-<CLUSTER_NAME>-kube-apiserver
rm -rf <ROOT_DIR>/workdir/clusters/<CLUSTER_NAME>/etcd-data
10 changes: 5 additions & 5 deletions test/kwokctl/testdata/podman/snapshot_restore_etcd.txt
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
# Download https://github.com/etcd-io/etcd/releases/download/v3.5.9/etcd-v3.5.9-<OS>-<ARCH>.<TAR> and extract etcdctl to <ROOT_DIR>/workdir/clusters/<CLUSTER_NAME>/bin/etcdctl
ETCDCTL_API=3 etcdctl snapshot restore ./snapshot-empty-<CLUSTER_NAME> --data-dir <ROOT_DIR>/workdir/clusters/<CLUSTER_NAME>/etcd-data
podman stop kwok-<CLUSTER_NAME>-kube-controller-manager --time=0
podman stop kwok-<CLUSTER_NAME>-kube-scheduler --time=0
podman stop kwok-<CLUSTER_NAME>-kwok-controller --time=0
podman stop kwok-<CLUSTER_NAME>-kube-apiserver --time=0
podman stop kwok-<CLUSTER_NAME>-etcd --time=0
podman stop kwok-<CLUSTER_NAME>-kube-apiserver --time=0
podman cp <ROOT_DIR>/workdir/clusters/<CLUSTER_NAME>/etcd-data kwok-<CLUSTER_NAME>-etcd:/
podman start kwok-<CLUSTER_NAME>-etcd
podman start kwok-<CLUSTER_NAME>-kube-apiserver
podman stop kwok-<CLUSTER_NAME>-kwok-controller --time=0
podman start kwok-<CLUSTER_NAME>-kwok-controller
podman stop kwok-<CLUSTER_NAME>-kube-controller-manager --time=0
podman start kwok-<CLUSTER_NAME>-kube-controller-manager
podman stop kwok-<CLUSTER_NAME>-kube-scheduler --time=0
podman start kwok-<CLUSTER_NAME>-kube-scheduler
podman start kwok-<CLUSTER_NAME>-kwok-controller
rm -rf <ROOT_DIR>/workdir/clusters/<CLUSTER_NAME>/etcd-data

0 comments on commit 95e2cda

Please sign in to comment.