diff --git a/internal/controller/drplacementcontrol.go b/internal/controller/drplacementcontrol.go
index cc3ac787c..4ce206d5a 100644
--- a/internal/controller/drplacementcontrol.go
+++ b/internal/controller/drplacementcontrol.go
@@ -825,6 +825,14 @@ func (d *DRPCInstance) RunRelocate() (bool, error) {
 	const done = true
 
+	if d.reconciler.numClustersQueriedSuccessfully != len(d.drPolicy.Spec.DRClusters) {
+		d.log.Info("Can't progress with relocation -- Not all clusters are reachable",
+			"numClustersQueriedSuccessfully", d.reconciler.numClustersQueriedSuccessfully,
+			"NumOfClusters", len(d.drPolicy.Spec.DRClusters))
+
+		return !done, nil
+	}
+
 	preferredCluster := d.instance.Spec.PreferredCluster
 	preferredClusterNamespace := d.instance.Spec.PreferredCluster
diff --git a/internal/controller/drplacementcontrol_controller.go b/internal/controller/drplacementcontrol_controller.go
index 59b0f9a15..227b2af7c 100644
--- a/internal/controller/drplacementcontrol_controller.go
+++ b/internal/controller/drplacementcontrol_controller.go
@@ -68,15 +68,16 @@ type ProgressCallback func(string, string)
 // DRPlacementControlReconciler reconciles a DRPlacementControl object
 type DRPlacementControlReconciler struct {
 	client.Client
-	APIReader           client.Reader
-	Log                 logr.Logger
-	MCVGetter           rmnutil.ManagedClusterViewGetter
-	Scheme              *runtime.Scheme
-	Callback            ProgressCallback
-	eventRecorder       *rmnutil.EventReporter
-	savedInstanceStatus rmn.DRPlacementControlStatus
-	ObjStoreGetter      ObjectStoreGetter
-	RateLimiter         *workqueue.TypedRateLimiter[reconcile.Request]
+	APIReader                      client.Reader
+	Log                            logr.Logger
+	MCVGetter                      rmnutil.ManagedClusterViewGetter
+	Scheme                         *runtime.Scheme
+	Callback                       ProgressCallback
+	eventRecorder                  *rmnutil.EventReporter
+	savedInstanceStatus            rmn.DRPlacementControlStatus
+	ObjStoreGetter                 ObjectStoreGetter
+	RateLimiter                    *workqueue.TypedRateLimiter[reconcile.Request]
+	numClustersQueriedSuccessfully int
 }
 
 // SetupWithManager sets up the controller with the Manager.
@@ -373,11 +374,13 @@ func (r *DRPlacementControlReconciler) createDRPCInstance(
 		return nil, err
 	}
 
-	vrgs, _, _, err := getVRGsFromManagedClusters(r.MCVGetter, drpc, drClusters, vrgNamespace, log)
+	vrgs, cqs, _, err := getVRGsFromManagedClusters(r.MCVGetter, drpc, drClusters, vrgNamespace, log)
 	if err != nil {
 		return nil, err
 	}
 
+	r.numClustersQueriedSuccessfully = cqs
+
 	d := &DRPCInstance{
 		reconciler: r,
 		ctx:        ctx,
@@ -1097,7 +1100,7 @@ func getVRGsFromManagedClusters(
 	annotations[DRPCNameAnnotation] = drpc.Name
 	annotations[DRPCNamespaceAnnotation] = drpc.Namespace
 
-	var clustersQueriedSuccessfully int
+	var numClustersQueriedSuccessfully int
 
 	var failedCluster string
 
@@ -1109,7 +1112,7 @@ func getVRGsFromManagedClusters(
 			// Only NotFound error is accepted
 			if errors.IsNotFound(err) {
 				log.Info(fmt.Sprintf("VRG not found on %q", drCluster.Name))
-				clustersQueriedSuccessfully++
+				numClustersQueriedSuccessfully++
 
 				continue
 			}
@@ -1121,7 +1124,7 @@ func getVRGsFromManagedClusters(
 			continue
 		}
 
-		clustersQueriedSuccessfully++
+		numClustersQueriedSuccessfully++
 
 		if rmnutil.ResourceIsDeleted(drCluster) {
 			log.Info("Skipping VRG on deleted drcluster", "drcluster", drCluster.Name, "vrg", vrg.Name)
@@ -1135,15 +1138,15 @@ func getVRGsFromManagedClusters(
 	}
 
 	// We are done if we successfully queried all drClusters
-	if clustersQueriedSuccessfully == len(drClusters) {
-		return vrgs, clustersQueriedSuccessfully, "", nil
+	if numClustersQueriedSuccessfully == len(drClusters) {
+		return vrgs, numClustersQueriedSuccessfully, "", nil
 	}
 
-	if clustersQueriedSuccessfully == 0 {
+	if numClustersQueriedSuccessfully == 0 {
 		return vrgs, 0, "", fmt.Errorf("failed to retrieve VRGs from clusters")
 	}
 
-	return vrgs, clustersQueriedSuccessfully, failedCluster, nil
+	return vrgs, numClustersQueriedSuccessfully, failedCluster, nil
 }
 
 func (r *DRPlacementControlReconciler) deleteClonedPlacementRule(ctx context.Context,
diff --git a/internal/controller/drplacementcontrol_controller_test.go b/internal/controller/drplacementcontrol_controller_test.go
index af1eca6bf..d65596972 100644
--- a/internal/controller/drplacementcontrol_controller_test.go
+++ b/internal/controller/drplacementcontrol_controller_test.go
@@ -2479,8 +2479,8 @@ var _ = Describe("DRPlacementControl Reconciler", func() {
 			clearFakeUserPlacementRuleStatus(UserPlacementRuleName, DefaultDRPCNamespace)
 			clearDRPCStatus()
 			expectedAction := rmn.ActionRelocate
-			expectedPhase := rmn.Relocated
-			exptectedPorgression := rmn.ProgressionCleaningUp
+			expectedPhase := rmn.DRState("")
+			exptectedPorgression := rmn.ProgressionStatus("")
 			verifyDRPCStateAndProgression(expectedAction, expectedPhase, exptectedPorgression)
 
 			// User intervention is required (simulate user intervention)
diff --git a/internal/controller/volsync/vshandler_test.go b/internal/controller/volsync/vshandler_test.go
index e9995845a..8de0105cd 100644
--- a/internal/controller/volsync/vshandler_test.go
+++ b/internal/controller/volsync/vshandler_test.go
@@ -1216,14 +1216,12 @@ var _ = Describe("VolSync_Handler", func() {
 			pvc := &corev1.PersistentVolumeClaim{}
 			JustBeforeEach(func() {
 				// Common checks for everything in this context - pvc should be created with correct spec
-				Expect(ensurePVCErr).NotTo(HaveOccurred())
-
 				Eventually(func() error {
 					return k8sClient.Get(ctx, types.NamespacedName{
 						Name:      pvcName,
 						Namespace: testNamespace.GetName(),
 					}, pvc)
-				}, maxWait, interval).Should(Succeed())
+				}, maxWait, interval).Should(Succeed(), fmt.Sprintf("Original error %v", ensurePVCErr))
 
 				Expect(pvc.GetName()).To(Equal(pvcName))
 				Expect(pvc.Spec.AccessModes).To(Equal([]corev1.PersistentVolumeAccessMode{corev1.ReadWriteOnce}))
diff --git a/internal/controller/vrg_volrep_test.go b/internal/controller/vrg_volrep_test.go
index 73e4e3e20..74e9027e9 100644
--- a/internal/controller/vrg_volrep_test.go
+++ b/internal/controller/vrg_volrep_test.go
@@ -614,6 +614,150 @@ var _ = Describe("VolumeReplicationGroupVolRepController", func() {
 		})
 	})
 
+	// Test VRG deletion when VR failed validation
+	var vrgDeleteFailedVR *vrgTest
+
+	//nolint:dupl
+	Context("VR failed validation in primary state", func() {
+		createTestTemplate := &template{
+			ClaimBindInfo:          corev1.ClaimBound,
+			VolumeBindInfo:         corev1.VolumeBound,
+			schedulingInterval:     "1h",
+			storageClassName:       "manual",
+			replicationClassName:   "test-replicationclass",
+			vrcProvisioner:         "manual.storage.com",
+			scProvisioner:          "manual.storage.com",
+			replicationClassLabels: map[string]string{"protection": "ramen"},
+		}
+		It("sets up PVCs, PVs and VRGs (with s3 stores that fail uploads)", func() {
+			createTestTemplate.s3Profiles = []string{s3Profiles[vrgS3ProfileNumber].S3ProfileName}
+			vrgDeleteFailedVR = newVRGTestCaseCreateAndStart(1, createTestTemplate, true, false)
+		})
+		It("waits for VRG to create a VR for each PVC", func() {
+			expectedVRCount := len(vrgDeleteFailedVR.pvcNames)
+			vrgDeleteFailedVR.waitForVRCountToMatch(expectedVRCount)
+		})
+		It("simulate VR with failed validation", func() {
+			vrgDeleteFailedVR.promoteVolRepsWithOptions(promoteOptions{ValidatedFailed: true})
+		})
+		It("VRG can be deleted", func() {
+			By("deleting the VRG")
+			vrg := vrgDeleteFailedVR.getVRG()
+			Expect(k8sClient.Delete(context.TODO(), vrg)).To(Succeed())
+
+			By("ensuring VRG is deleted")
+			Eventually(func() error {
+				return apiReader.Get(context.TODO(), vrgDeleteFailedVR.vrgNamespacedName(), vrg)
+			}, vrgtimeout, vrginterval).
+				Should(MatchError(errors.NewNotFound(schema.GroupResource{
+					Group:    ramendrv1alpha1.GroupVersion.Group,
+					Resource: "volumereplicationgroups",
+				}, vrgDeleteFailedVR.vrgName)))
+
+			vrgDeleteFailedVR.cleanupNamespace()
+			vrgDeleteFailedVR.cleanupSC()
+			vrgDeleteFailedVR.cleanupVRC()
+		})
+	})
+
+	// Test VRG deletion when VR failed validation and Validated condition is missing (csi-addons < 0.10.0)
+	var vrgDeleteIncompleteVR *vrgTest
+
+	//nolint:dupl
+	Context("VR failed validation in primary state and Validated condition is missing", func() {
+		createTestTemplate := &template{
+			ClaimBindInfo:          corev1.ClaimBound,
+			VolumeBindInfo:         corev1.VolumeBound,
+			schedulingInterval:     "1h",
+			storageClassName:       "manual",
+			replicationClassName:   "test-replicationclass",
+			vrcProvisioner:         "manual.storage.com",
+			scProvisioner:          "manual.storage.com",
+			replicationClassLabels: map[string]string{"protection": "ramen"},
+		}
+		It("sets up PVCs, PVs and VRGs (with s3 stores that fail uploads)", func() {
+			createTestTemplate.s3Profiles = []string{s3Profiles[vrgS3ProfileNumber].S3ProfileName}
+			vrgDeleteIncompleteVR = newVRGTestCaseCreateAndStart(1, createTestTemplate, true, false)
+		})
+		It("waits for VRG to create a VR for each PVC", func() {
+			expectedVRCount := len(vrgDeleteIncompleteVR.pvcNames)
+			vrgDeleteIncompleteVR.waitForVRCountToMatch(expectedVRCount)
+		})
+		It("simulate incomplete VR", func() {
+			vrgDeleteIncompleteVR.promoteVolRepsWithOptions(promoteOptions{ValidatedFailed: true, ValidatedMissing: true})
+		})
+		It("VRG can not be deleted", func() {
+			By("deleting the VRG")
+			vrg := vrgDeleteIncompleteVR.getVRG()
+			Expect(k8sClient.Delete(context.TODO(), vrg)).To(Succeed())
+
+			By("ensuring VRG cannot be deleted")
+			Consistently(func() error {
+				return apiReader.Get(context.TODO(), vrgDeleteIncompleteVR.vrgNamespacedName(), vrg)
+			}, vrgtimeout, vrginterval).
+				Should(Succeed(), "VRG %s was deleted when VR is incomplete", vrgDeleteIncompleteVR.vrgName)
+
+			By("deleting the VRs")
+			vrgDeleteIncompleteVR.deleteVolReps()
+
+			By("ensuring the VRG is deleted")
+			Eventually(func() error {
+				return apiReader.Get(context.TODO(), vrgDeleteIncompleteVR.vrgNamespacedName(), vrg)
+			}, vrgtimeout, vrginterval).
+				Should(MatchError(errors.NewNotFound(schema.GroupResource{
+					Group:    ramendrv1alpha1.GroupVersion.Group,
+					Resource: "volumereplicationgroups",
+				}, vrgDeleteIncompleteVR.vrgName)))
+
+			vrgDeleteIncompleteVR.cleanupNamespace()
+			vrgDeleteIncompleteVR.cleanupSC()
+			vrgDeleteIncompleteVR.cleanupVRC()
+		})
+	})
+
+	// Test VRG deletion when VR completed and Validated condition is missing (csi-addons < 0.10.0)
+	var vrgDeleteCompletedVR *vrgTest
+
+	//nolint:dupl
+	Context("VR completed in primary state and Validated condition is missing", func() {
+		createTestTemplate := &template{
+			ClaimBindInfo:          corev1.ClaimBound,
+			VolumeBindInfo:         corev1.VolumeBound,
+			schedulingInterval:     "1h",
+			storageClassName:       "manual",
+			replicationClassName:   "test-replicationclass",
+			vrcProvisioner:         "manual.storage.com",
+			scProvisioner:          "manual.storage.com",
+			replicationClassLabels: map[string]string{"protection": "ramen"},
+		}
+		It("sets up PVCs, PVs and VRGs (with s3 stores that fail uploads)", func() {
+			createTestTemplate.s3Profiles = []string{s3Profiles[vrgS3ProfileNumber].S3ProfileName}
+			vrgDeleteCompletedVR = newVRGTestCaseCreateAndStart(1, createTestTemplate, true, false)
+		})
+		It("waits for VRG to create a VR for each PVC", func() {
+			expectedVRCount := len(vrgDeleteCompletedVR.pvcNames)
+			vrgDeleteCompletedVR.waitForVRCountToMatch(expectedVRCount)
+		})
+		It("simulate completed VR", func() {
+			vrgDeleteCompletedVR.promoteVolRepsWithOptions(promoteOptions{ValidatedMissing: true})
+		})
+		It("VRG can be deleted", func() {
+			By("deleting the VRG")
+			vrg := vrgDeleteCompletedVR.getVRG()
+			Expect(k8sClient.Delete(context.TODO(), vrg)).To(Succeed())
+
+			By("ensuring the VRG is deleted")
+			Eventually(func() error {
+				return apiReader.Get(context.TODO(), vrgDeleteCompletedVR.vrgNamespacedName(), vrg)
+			}, vrgtimeout, vrginterval).
+				Should(MatchError(errors.NewNotFound(schema.GroupResource{
+					Group:    ramendrv1alpha1.GroupVersion.Group,
+					Resource: "volumereplicationgroups",
+				}, vrgDeleteCompletedVR.vrgName)))
+
+			vrgDeleteCompletedVR.cleanupNamespace()
+			vrgDeleteCompletedVR.cleanupSC()
+			vrgDeleteCompletedVR.cleanupVRC()
+		})
+	})
+
 	// Try the simple case of creating VRG, PVC, PV and
 	// check whether VolRep resources are created or not
 	var vrgTestCases []*vrgTest
@@ -2164,17 +2308,26 @@ func (v *vrgTest) waitForVRCountToMatch(vrCount int) {
 }
 
 func (v *vrgTest) promoteVolReps() {
-	v.promoteVolRepsAndDo(func(index, count int) {
+	v.promoteVolRepsAndDo(promoteOptions{}, func(index, count int) {
 		// VRG should not be ready until last VolRep is ready.
 		v.verifyVRGStatusExpectation(index == count-1, vrgController.VRGConditionReasonReady)
 	})
 }
 
 func (v *vrgTest) promoteVolRepsWithoutVrgStatusCheck() {
-	v.promoteVolRepsAndDo(func(index, count int) {})
+	v.promoteVolRepsAndDo(promoteOptions{}, func(index, count int) {})
+}
+
+func (v *vrgTest) promoteVolRepsWithOptions(options promoteOptions) {
+	v.promoteVolRepsAndDo(options, func(index, count int) {})
+}
+
+type promoteOptions struct {
+	ValidatedMissing bool
+	ValidatedFailed  bool
 }
 
-func (v *vrgTest) promoteVolRepsAndDo(do func(int, int)) {
+func (v *vrgTest) promoteVolRepsAndDo(options promoteOptions, do func(int, int)) {
 	By("Promoting VolumeReplication resources " + v.namespace)
 
 	volRepList := &volrep.VolumeReplicationList{}
@@ -2188,33 +2341,17 @@ func (v *vrgTest) promoteVolRepsAndDo(do func(int, int)) {
 		volRep := volRepList.Items[index]
 
 		volRepStatus := volrep.VolumeReplicationStatus{
-			Conditions: []metav1.Condition{
-				{
-					Type:               volrep.ConditionCompleted,
-					Reason:             volrep.Promoted,
-					ObservedGeneration: volRep.Generation,
-					Status:             metav1.ConditionTrue,
-					LastTransitionTime: metav1.NewTime(time.Now()),
-				},
-				{
-					Type:               volrep.ConditionDegraded,
-					Reason:             volrep.Healthy,
-					ObservedGeneration: volRep.Generation,
-					Status:             metav1.ConditionFalse,
-					LastTransitionTime: metav1.NewTime(time.Now()),
-				},
-				{
-					Type:               volrep.ConditionResyncing,
-					Reason:             volrep.NotResyncing,
-					ObservedGeneration: volRep.Generation,
-					Status:             metav1.ConditionFalse,
-					LastTransitionTime: metav1.NewTime(time.Now()),
-				},
-			},
+			Conditions:         v.generateVRConditions(volRep.Generation, options),
+			ObservedGeneration: volRep.Generation,
+			State:              volrep.PrimaryState,
+			Message:            "volume is marked primary",
+		}
+
+		if options.ValidatedFailed {
+			volRepStatus.State = volrep.UnknownState
+			volRepStatus.Message = "precondition failed ..."
 		}
-		volRepStatus.ObservedGeneration = volRep.Generation
-		volRepStatus.State = volrep.PrimaryState
-		volRepStatus.Message = "volume is marked primary"
+
 		volRep.Status = volRepStatus
 
 		err = k8sClient.Status().Update(context.TODO(), &volRep)
@@ -2224,12 +2361,88 @@ func (v *vrgTest) promoteVolRepsAndDo(do func(int, int)) {
 			Name:      volRep.Name,
 			Namespace: volRep.Namespace,
 		}
-		v.waitForVolRepPromotion(volrepKey)
+
+		if options.ValidatedFailed {
+			if options.ValidatedMissing {
+				v.waitForVolRepCondition(volrepKey, volrep.ConditionCompleted, metav1.ConditionFalse)
+			} else {
+				v.waitForVolRepCondition(volrepKey, volrep.ConditionValidated, metav1.ConditionFalse)
+			}
+		} else {
+			v.waitForVolRepCondition(volrepKey, volrep.ConditionCompleted, metav1.ConditionTrue)
+			v.waitForProtectedPVCs(volrepKey)
+		}
 
 		do(index, len(volRepList.Items))
 	}
 }
 
+func (v *vrgTest) generateVRConditions(generation int64, options promoteOptions) []metav1.Condition {
+	var conditions []metav1.Condition
+
+	lastTransitionTime := metav1.NewTime(time.Now())
+
+	if !options.ValidatedMissing {
+		validated := metav1.Condition{
+			Type:               volrep.ConditionValidated,
+			Reason:             volrep.PrerequisiteMet,
+			ObservedGeneration: generation,
+			Status:             metav1.ConditionTrue,
+			LastTransitionTime: lastTransitionTime,
+		}
+
+		if options.ValidatedFailed {
+			validated.Status = metav1.ConditionFalse
+			validated.Reason = volrep.PrerequisiteNotMet
+		}
+
+		conditions = append(conditions, validated)
+	}
+
+	completed := metav1.Condition{
+		Type:               volrep.ConditionCompleted,
+		Reason:             volrep.Promoted,
+		ObservedGeneration: generation,
+		Status:             metav1.ConditionTrue,
+		LastTransitionTime: lastTransitionTime,
+	}
+
+	if options.ValidatedFailed {
+		completed.Status = metav1.ConditionFalse
+		completed.Reason = volrep.FailedToPromote
+	}
+
+	degraded := metav1.Condition{
+		Type:               volrep.ConditionDegraded,
+		Reason:             volrep.Healthy,
+		ObservedGeneration: generation,
+		Status:             metav1.ConditionFalse,
+		LastTransitionTime: lastTransitionTime,
+	}
+	resyncing := metav1.Condition{
+		Type:               volrep.ConditionResyncing,
+		Reason:             volrep.NotResyncing,
+		ObservedGeneration: generation,
+		Status:             metav1.ConditionFalse,
+		LastTransitionTime: lastTransitionTime,
+	}
+
+	return append(conditions, completed, degraded, resyncing)
+}
+
+func (v *vrgTest) deleteVolReps() {
+	vrList := &volrep.VolumeReplicationList{}
+	err := k8sClient.List(context.TODO(), vrList, &client.ListOptions{Namespace: v.namespace})
+	Expect(err).NotTo(HaveOccurred(), "failed to get a list of VRs in namespace %s", v.namespace)
+
+	for i := range vrList.Items {
+		vr := vrList.Items[i]
+
+		err := k8sClient.Delete(context.TODO(), &vr)
+		Expect(err).NotTo(HaveOccurred(), "failed to delete volRep %v/%s", vr.Namespace, vr.Name)
+	}
+}
+
 func (v *vrgTest) protectDeletionOfVolReps() {
 	By("Adding a finalizer to protect VolumeReplication resources being deleted " + v.namespace)
 
@@ -2268,23 +2481,36 @@ func (v *vrgTest) unprotectDeletionOfVolReps() {
 	}
 }
 
-func (v *vrgTest) waitForVolRepPromotion(vrNamespacedName types.NamespacedName) {
+func (v *vrgTest) waitForVolRepCondition(
+	vrNamespacedName types.NamespacedName,
+	conditionType string,
+	conditionStatus metav1.ConditionStatus,
+) {
 	updatedVolRep := volrep.VolumeReplication{}
 
 	Eventually(func() bool {
 		err := k8sClient.Get(context.TODO(), vrNamespacedName, &updatedVolRep)
+		if err != nil {
+			return false
+		}
 
-		return err == nil && len(updatedVolRep.Status.Conditions) == 3
+		condition := meta.FindStatusCondition(updatedVolRep.Status.Conditions, conditionType)
+		if condition == nil {
+			return false
+		}
+
+		return condition.Status == conditionStatus
 	}, vrgtimeout, vrginterval).Should(BeTrue(),
-		"failed to wait for volRep condition type to change to 'ConditionCompleted' (%d)",
-		len(updatedVolRep.Status.Conditions))
+		"failed to wait for volRep condition %q to become %q", conditionType, conditionStatus)
+}
 
+func (v *vrgTest) waitForProtectedPVCs(vrNamespacedName types.NamespacedName) {
 	Eventually(func() bool {
 		vrg := v.getVRG()
 		// as of now name of VolumeReplication resource created by the VolumeReplicationGroup
 		// is same as the pvc that it replicates. When that changes this has to be changed to
 		// use the right name to get the appropriate protected PVC condition from VRG status.
-		protectedPVC := vrgController.FindProtectedPVC(vrg, updatedVolRep.Namespace, updatedVolRep.Name)
+		protectedPVC := vrgController.FindProtectedPVC(vrg, vrNamespacedName.Namespace, vrNamespacedName.Name)
 
 		// failed to get the protectedPVC. Returning false
 		if protectedPVC == nil {
@@ -2293,7 +2519,7 @@ func (v *vrgTest) waitForVolRepPromotion(vrNamespacedName types.NamespacedName)
 
 		return v.checkProtectedPVCSuccess(vrg, protectedPVC)
 	}, vrgtimeout, vrginterval).Should(BeTrue(),
-		"while waiting for protected pvc condition %s/%s", updatedVolRep.Namespace, updatedVolRep.Name)
+		"while waiting for protected pvc condition %s/%s", vrNamespacedName.Namespace, vrNamespacedName.Name)
 }
 
 func (v *vrgTest) checkProtectedPVCSuccess(vrg *ramendrv1alpha1.VolumeReplicationGroup,
diff --git a/test/Makefile b/test/Makefile
index 7008c364c..7b58f13e5 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -5,6 +5,9 @@
 # hardware acceleration for VMs.
 DRIVER ?= vm
 
+# drenv start timeout in seconds
+TIMEOUT ?= 600
+
 env := envs/$(DRIVER).yaml
 prefix := drenv-test-
 
@@ -50,7 +53,7 @@ coverage-html:
 	xdg-open htmlcov/index.html
 
 cluster:
-	drenv start --name-prefix $(prefix) $(env) -v
+	drenv start --name-prefix $(prefix) $(env) --verbose --timeout $(TIMEOUT)
 
 clean:
 	drenv delete --name-prefix $(prefix) $(env)
diff --git a/test/addons/rook-pool/start b/test/addons/rook-pool/start
index d41502433..b06b5ca2e 100755
--- a/test/addons/rook-pool/start
+++ b/test/addons/rook-pool/start
@@ -15,9 +15,13 @@ from drenv import kubectl
 
 def deploy(cluster):
     print("Creating RBD pool and storage/snapshot classes")
+
+    template = drenv.template("storage-class.yaml")
+    yaml = template.substitute(cluster=cluster)
+
+    kubectl.apply("--filename=-", input=yaml, context=cluster)
     kubectl.apply(
         "--filename=replica-pool.yaml",
-        "--filename=storage-class.yaml",
         "--filename=snapshot-class.yaml",
         context=cluster,
     )
diff --git a/test/addons/rook-pool/storage-class.yaml b/test/addons/rook-pool/storage-class.yaml
index 389fe2378..bb4786d80 100644
--- a/test/addons/rook-pool/storage-class.yaml
+++ b/test/addons/rook-pool/storage-class.yaml
@@ -7,7 +7,7 @@ kind: StorageClass
 metadata:
   name: rook-ceph-block
   labels:
-    ramendr.openshift.io/storageid: rook-ceph-storage-id
+    ramendr.openshift.io/storageid: rook-ceph-$cluster-1
 provisioner: rook-ceph.rbd.csi.ceph.com
 parameters:
   clusterID: rook-ceph
diff --git a/test/drenv/__main__.py b/test/drenv/__main__.py
index d81107262..7fd709483 100644
--- a/test/drenv/__main__.py
+++ b/test/drenv/__main__.py
@@ -76,6 +76,11 @@ def parse_args():
         metavar="N",
         help="maximum number of workers per profile",
     )
+    p.add_argument(
+        "--timeout",
+        type=int,
+        help="time in seconds to wait until cluster is started",
+    )
 
     p = add_command(sp, "stop", do_stop, help="stop an environment")
     p.add_argument(
@@ -379,7 +384,7 @@ def start_cluster(profile, hooks=(), args=None, **options):
     provider = providers.get(profile["provider"])
 
     existing = provider.exists(profile)
-    provider.start(profile, verbose=args.verbose)
+    provider.start(profile, verbose=args.verbose, timeout=args.timeout)
     provider.configure(profile, existing=existing)
 
     if existing:
diff --git a/test/drenv/envfile.py b/test/drenv/envfile.py
index 2c8c19218..d166a7325 100644
--- a/test/drenv/envfile.py
+++ b/test/drenv/envfile.py
@@ -46,11 +46,11 @@
     },
     "darwin": {
         PROVIDER: {
-            "x86_64": "minikube",
+            "x86_64": "lima",
            "arm64": "lima",
        },
        VM: {
-            "x86_64": "hyperkit",
+            "x86_64": "",
            "arm64": "",
        },
        CONTAINER: "podman",
@@ -136,7 +136,7 @@ def _validate_profile(profile, addons_root):
     # If True, this is an external cluster and we don't have to start it.
     profile.setdefault("external", False)
 
-    # Properties for drenv managed cluster.
+    # Common properties.
     profile.setdefault("provider", PROVIDER)
     profile.setdefault("driver", VM)
     profile.setdefault("container_runtime", "")
@@ -155,6 +155,9 @@ def _validate_profile(profile, addons_root):
     profile.setdefault("containerd", None)
     profile.setdefault("workers", [])
 
+    # Lima provider properties.
+    profile.setdefault("rosetta", True)
+
     _validate_platform_defaults(profile)
 
     for i, worker in enumerate(profile["workers"]):
diff --git a/test/drenv/kubeconfig.py b/test/drenv/kubeconfig.py
index 06c3bc8f3..cf2c78447 100644
--- a/test/drenv/kubeconfig.py
+++ b/test/drenv/kubeconfig.py
@@ -51,7 +51,7 @@ def remove(profile, target=DEFAULT_CONFIG):
         return
 
     for k in ("contexts", "clusters", "users"):
-        old = config.get(k, [])
+        old = config.get(k) or []
         new = [v for v in old if v["name"] != profile["name"]]
         if len(new) < len(old):
             config[k] = new
diff --git a/test/drenv/providers/external.py b/test/drenv/providers/external.py
index 2ed2a0cd9..d1f0ae835 100644
--- a/test/drenv/providers/external.py
+++ b/test/drenv/providers/external.py
@@ -25,7 +25,7 @@ def exists(profile):
     return True
 
 
-def start(profile, verbose=False):
+def start(profile, verbose=False, timeout=None):
     start = time.monotonic()
     logging.info("[%s] Checking external cluster status", profile["name"])
diff --git a/test/drenv/providers/lima/__init__.py b/test/drenv/providers/lima/__init__.py
index 7e6570f89..d2db1c75c 100644
--- a/test/drenv/providers/lima/__init__.py
+++ b/test/drenv/providers/lima/__init__.py
@@ -61,7 +61,7 @@ def exists(profile):
         return False
 
 
-def start(profile, verbose=False):
+def start(profile, verbose=False, timeout=None):
     start = time.monotonic()
     logging.info("[%s] Starting lima cluster", profile["name"])
@@ -76,7 +76,7 @@ def start(profile, verbose=False):
     # Get vm before starting to detect a stopped vm.
     vm = _get_vm(profile)
 
-    _start_vm(profile)
+    _start_vm(profile, timeout=timeout)
     _add_kubeconfig(profile, vm)
 
     debug = partial(logging.debug, f"[{profile['name']}] %s")
@@ -181,7 +181,9 @@ def _write_config(profile, path):
     # The "vz" type is required to support amd64 images on arm64, needed for
     # OCM, and also provide the best performance.
     config["vmType"] = "vz"
-    config["rosetta"] = {"enabled": True, "binfmt": True}
+
+    if profile["rosetta"]:
+        config["rosetta"] = {"enabled": True, "binfmt": True}
 
     # We always use socket_vmnet to get shared network.
     config["networks"] = [{"socket": "/var/run/socket_vmnet"}]
 
@@ -270,8 +272,12 @@ def _create_vm(profile, config):
     _watch("create", "--name", profile["name"], config, context=profile["name"])
 
 
-def _start_vm(profile):
-    _watch("start", profile["name"], context=profile["name"])
+def _start_vm(profile, timeout=None):
+    args = ["start"]
+    if timeout:
+        args.append(f"--timeout={timeout}s")
+    args.append(profile["name"])
+    _watch(*args, context=profile["name"])
 
 
 def _stop_vm(profile):
diff --git a/test/drenv/providers/lima/k8s.yaml b/test/drenv/providers/lima/k8s.yaml
index d578755ec..3d016bc5c 100644
--- a/test/drenv/providers/lima/k8s.yaml
+++ b/test/drenv/providers/lima/k8s.yaml
@@ -13,6 +13,8 @@ images:
 
   - location: "https://cloud-images.ubuntu.com/releases/24.04/release/ubuntu-24.04-server-cloudimg-arm64.img"
     arch: "aarch64"
+  - location: "https://cloud-images.ubuntu.com/releases/24.04/release/ubuntu-24.04-server-cloudimg-amd64.img"
+    arch: "x86_64"
 
 mounts: []
 
@@ -24,23 +26,11 @@ containerd:
 # forwarding cannot work for multiple clusters since same port from multiple
 # clusters is mapped to the same host port.
 portForwards:
-  - guestPortRange: [1, 65535]
-    guestIP: "0.0.0.0"
-    ignore: true
+  - ignore: true
+    proto: any
 
 provision:
-  - mode: system
-    script: |
-      #!/bin/bash
-      set -eux -o pipefail
-      echo "Promoting default route on shared network"
-      route=$(ip route show default dev lima0)
-      if [ -n "$route" ] && ! echo $route | grep -E -q "metric 1\b"; then
-        ip route add $route metric 1
-        ip route del $route
-      fi
-
   # See
   - mode: system
     script: |
@@ -48,20 +38,20 @@ provision:
       set -eux -o pipefail
      command -v kubeadm >/dev/null 2>&1 && exit 0
       # Install and configure prerequisites
-      cat </dev/null 2>&1; do sleep 3; done"; then
-        echo >&2 "kubernetes cluster is not up and running yet"
+      if ! timeout 300s bash -c "until kubectl get --raw /readyz >/dev/null 2>&1; do sleep 3; done"; then
+        echo >&2 "kubernetes cluster is not ready yet"
         exit 1
       fi
-  - description: "coredns deployment to be running"
-    script: |
-      #!/bin/bash
-      set -eux -o pipefail
-      kubectl wait -n kube-system --timeout=180s --for=condition=available deploy coredns
 
 copyToHost:
   - guest: "/etc/kubernetes/admin.conf"
     host: "{{.Dir}}/copied-from-guest/kubeconfig.yaml"
diff --git a/test/drenv/providers/minikube.py b/test/drenv/providers/minikube.py
index 480ceb3af..f03bd39f5 100644
--- a/test/drenv/providers/minikube.py
+++ b/test/drenv/providers/minikube.py
@@ -61,7 +61,7 @@ def exists(profile):
         return False
 
 
-def start(profile, verbose=False):
+def start(profile, verbose=False, timeout=None):
     start = time.monotonic()
     logging.info("[%s] Starting minikube cluster", profile["name"])
 
@@ -119,7 +119,7 @@ def start(profile, verbose=False):
 
     # TODO: Use --interactive=false when the bug is fixed.
     # https://github.com/kubernetes/minikube/issues/19518
-    _watch("start", *args, profile=profile["name"])
+    _watch("start", *args, profile=profile["name"], timeout=timeout)
 
     logging.info(
         "[%s] Cluster started in %.2f seconds",
@@ -364,11 +364,11 @@ def _run(command, *args, profile=None, output=None):
     return commands.run(*cmd)
 
 
-def _watch(command, *args, profile=None):
+def _watch(command, *args, profile=None, timeout=None):
     cmd = ["minikube", command, "--profile", profile]
     cmd.extend(args)
 
     logging.debug("[%s] Running %s", profile, cmd)
-    for line in commands.watch(*cmd):
+    for line in commands.watch(*cmd, timeout=timeout):
         logging.debug("[%s] %s", profile, line)
 
 
diff --git a/test/envs/vm.yaml b/test/envs/vm.yaml
index 85da2db03..d947ba806 100644
--- a/test/envs/vm.yaml
+++ b/test/envs/vm.yaml
@@ -8,7 +8,9 @@ profiles:
   - name: cluster
     driver: $vm
     container_runtime: containerd
-    memory: "3g"
+    cpus: 1
+    memory: "2g"
+    rosetta: false
     workers:
       - addons:
           - name: example