From 1f7805e079cea6c74067abb66055bf0d1c22cc9b Mon Sep 17 00:00:00 2001 From: Lukas Piwowarski Date: Wed, 30 Oct 2024 10:18:15 -0400 Subject: [PATCH] Limit pod resources This patch introduces back limits for the pods spawned by the test-operator after they were increased and later removed with these two PRs [1][2]. The problem with the previous two patches was that they only set the Resources.Limits field and not the Resources.Requests field. When Resources.Limits is set and Resources.Requests is empty then it inherrits the value from Resources.Limits. Therefore, we first hit the OOM killed issue when we set the Resources.Limits too low and later when we increased the value we hit the "Insufficient memory" error (due to high value in Resources.Requests field) This patch addresses the above mentioned issue by: - setting sane default values for Resource.Limits - setting sane default values for Resource.Requests and - introduces new parameter called .Spec.Resources which can be used to change the default values. [1] https://github.com/openstack-k8s-operators/test-operator/pull/222 [2] https://github.com/openstack-k8s-operators/test-operator/pull/224 --- api/v1beta1/common.go | 10 ++++++++++ api/v1beta1/zz_generated.deepcopy.go | 6 ++++++ controllers/ansibletest_controller.go | 5 +++++ controllers/horizontest_controller.go | 1 + controllers/tempest_controller.go | 5 +++++ controllers/tobiko_controller.go | 5 +++++ pkg/ansibletest/job.go | 15 +++++++++++++++ pkg/horizontest/job.go | 15 +++++++++++++++ pkg/tempest/job.go | 15 +++++++++++++++ pkg/tobiko/job.go | 15 +++++++++++++++ pkg/util/common.go | 19 +++++++++++++++++++ 11 files changed, 111 insertions(+) diff --git a/api/v1beta1/common.go b/api/v1beta1/common.go index 6222b1c..4ceb1d7 100644 --- a/api/v1beta1/common.go +++ b/api/v1beta1/common.go @@ -95,6 +95,11 @@ type CommonOptions struct { // This value contains a toleration that is applied to pods spawned by the // test pods that are spawned by the test-operator. Tolerations []corev1.Toleration `json:"tolerations,omitempty"` + + // +kubebuilder:validation:Optional + // +operator-sdk:csv:customresourcedefinitions:type=spec + // Specify Resource Requirements for a each test pod spawned + Resources corev1.ResourceRequirements `json:"resources,omitempty"` } type CommonOpenstackConfig struct { @@ -179,4 +184,9 @@ type WorkflowCommonParameters struct { // This value contains a toleration that is applied to pods spawned by the // test pods that are spawned by the test-operator. Tolerations *[]corev1.Toleration `json:"tolerations,omitempty"` + + // +kubebuilder:validation:Optional + // +operator-sdk:csv:customresourcedefinitions:type=spec + // Specify Resource Requirements for a each test pod spawned + Resources *corev1.ResourceRequirements `json:"resources,omitempty"` } diff --git a/api/v1beta1/zz_generated.deepcopy.go b/api/v1beta1/zz_generated.deepcopy.go index 211c2d7..208295b 100644 --- a/api/v1beta1/zz_generated.deepcopy.go +++ b/api/v1beta1/zz_generated.deepcopy.go @@ -169,6 +169,7 @@ func (in *CommonOptions) DeepCopyInto(out *CommonOptions) { (*in)[i].DeepCopyInto(&(*out)[i]) } } + in.ResourceRequirements.DeepCopyInto(&out.ResourceRequirements) } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CommonOptions. @@ -690,6 +691,11 @@ func (in *WorkflowCommonParameters) DeepCopyInto(out *WorkflowCommonParameters) } } } + if in.ResourceRequirements != nil { + in, out := &in.ResourceRequirements, &out.ResourceRequirements + *out = new(v1.ResourceRequirements) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WorkflowCommonParameters. diff --git a/controllers/ansibletest_controller.go b/controllers/ansibletest_controller.go index b32ea14..e9cdcf4 100644 --- a/controllers/ansibletest_controller.go +++ b/controllers/ansibletest_controller.go @@ -240,6 +240,10 @@ func (r *AnsibleTestReconciler) Reconcile(ctx context.Context, req ctrl.Request) if instance.Spec.Workflow[externalWorkflowCounter].SELinuxLevel != nil { instance.Spec.SELinuxLevel = *instance.Spec.Workflow[externalWorkflowCounter].SELinuxLevel } + + if instance.Spec.Workflow[externalWorkflowCounter].Resources != nil { + instance.Spec.Resources = *instance.Spec.Workflow[externalWorkflowCounter].Resources + } } // Service account, role, binding @@ -263,6 +267,7 @@ func (r *AnsibleTestReconciler) Reconcile(ctx context.Context, req ctrl.Request) externalWorkflowCounter, containerImage, privileged, + instance.Spec.Resources, ) ansibleTestsJob := job.NewJob( jobDef, diff --git a/controllers/horizontest_controller.go b/controllers/horizontest_controller.go index 7f8b7ab..9a15c1e 100644 --- a/controllers/horizontest_controller.go +++ b/controllers/horizontest_controller.go @@ -230,6 +230,7 @@ func (r *HorizonTestReconciler) Reconcile(ctx context.Context, req ctrl.Request) mountKubeconfig, envVars, containerImage, + instance.Spec.Resources, ) horizontestJob := job.NewJob( jobDef, diff --git a/controllers/tempest_controller.go b/controllers/tempest_controller.go index 9975730..2a33320 100644 --- a/controllers/tempest_controller.go +++ b/controllers/tempest_controller.go @@ -331,6 +331,10 @@ func (r *TempestReconciler) Reconcile(ctx context.Context, req ctrl.Request) (re if instance.Spec.Workflow[externalWorkflowCounter].SELinuxLevel != nil { instance.Spec.SELinuxLevel = *instance.Spec.Workflow[externalWorkflowCounter].SELinuxLevel } + + if instance.Spec.Workflow[externalWorkflowCounter].Resources != nil { + instance.Spec.Resources = *instance.Spec.Workflow[externalWorkflowCounter].Resources + } } jobDef := tempest.Job( @@ -344,6 +348,7 @@ func (r *TempestReconciler) Reconcile(ctx context.Context, req ctrl.Request) (re mountCerts, mountSSHKey, containerImage, + instance.Spec.Resources, ) tempestJob := job.NewJob( jobDef, diff --git a/controllers/tobiko_controller.go b/controllers/tobiko_controller.go index a9b8e02..9aeb2ed 100644 --- a/controllers/tobiko_controller.go +++ b/controllers/tobiko_controller.go @@ -317,6 +317,7 @@ func (r *TobikoReconciler) Reconcile(ctx context.Context, req ctrl.Request) (res envVars, containerImage, privileged, + instance.Spec.Resources, ) tobikoJob := job.NewJob( jobDef, @@ -426,6 +427,10 @@ func (r *TobikoReconciler) PrepareTobikoEnvVars( if instance.Spec.Workflow[step].SELinuxLevel != nil { instance.Spec.SELinuxLevel = *instance.Spec.Workflow[step].SELinuxLevel } + + if instance.Spec.Workflow[step].Resources != nil { + instance.Spec.Resources = *instance.Spec.Workflow[step].Resources + } } // Prepare env vars diff --git a/pkg/ansibletest/job.go b/pkg/ansibletest/job.go index b88e9aa..390b374 100644 --- a/pkg/ansibletest/job.go +++ b/pkg/ansibletest/job.go @@ -7,6 +7,7 @@ import ( util "github.com/openstack-k8s-operators/test-operator/pkg/util" batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -22,6 +23,7 @@ func Job( externalWorkflowCounter int, containerImage string, privileged bool, + resources corev1.ResourceRequirements, ) *batchv1.Job { runAsUser := int64(227) @@ -32,6 +34,18 @@ func Job( capabilities := []corev1.Capability{"NET_ADMIN", "NET_RAW"} securityContext := util.GetSecurityContext(runAsUser, capabilities, privileged) + defaultResources := corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("4000Mi"), + corev1.ResourceMemory: resource.MustParse("10Gi"), + }, + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("2000Mi"), + corev1.ResourceMemory: resource.MustParse("5Gi"), + }, + } + resources = util.GetResourceRequirements(resources, defaultResources) + job := &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{ Name: jobName, @@ -64,6 +78,7 @@ func Job( Env: env.MergeEnvs([]corev1.EnvVar{}, envVars), VolumeMounts: GetVolumeMounts(mountCerts, instance, externalWorkflowCounter), SecurityContext: &securityContext, + Resources: resources, }, }, Volumes: GetVolumes( diff --git a/pkg/horizontest/job.go b/pkg/horizontest/job.go index 5a9123d..1ae8c72 100644 --- a/pkg/horizontest/job.go +++ b/pkg/horizontest/job.go @@ -7,6 +7,7 @@ import ( util "github.com/openstack-k8s-operators/test-operator/pkg/util" batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -21,6 +22,7 @@ func Job( mountKubeconfig bool, envVars map[string]env.Setter, containerImage string, + resources corev1.ResourceRequirements, ) *batchv1.Job { runAsUser := int64(42455) @@ -31,6 +33,18 @@ func Job( capabilities := []corev1.Capability{"NET_ADMIN", "NET_RAW"} securityContext := util.GetSecurityContext(runAsUser, capabilities, instance.Spec.Privileged) + defaultResources := corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("4000Mi"), + corev1.ResourceMemory: resource.MustParse("10Gi"), + }, + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("2000Mi"), + corev1.ResourceMemory: resource.MustParse("5Gi"), + }, + } + resources = util.GetResourceRequirements(resources, defaultResources) + job := &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{ Name: jobName, @@ -63,6 +77,7 @@ func Job( Env: env.MergeEnvs([]corev1.EnvVar{}, envVars), VolumeMounts: GetVolumeMounts(mountCerts, mountKeys, mountKubeconfig, instance), SecurityContext: &securityContext, + Resources: resources, }, }, Volumes: GetVolumes( diff --git a/pkg/tempest/job.go b/pkg/tempest/job.go index 6e73a11..31cae4c 100644 --- a/pkg/tempest/job.go +++ b/pkg/tempest/job.go @@ -7,6 +7,7 @@ import ( util "github.com/openstack-k8s-operators/test-operator/pkg/util" batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -22,6 +23,7 @@ func Job( mountCerts bool, mountSSHKey bool, containerImage string, + resources corev1.ResourceRequirements, ) *batchv1.Job { envVars := map[string]env.Setter{} @@ -29,6 +31,18 @@ func Job( runAsGroup := int64(42480) securityContext := util.GetSecurityContext(runAsUser, []corev1.Capability{}, instance.Spec.Privileged) + defaultResources := corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("4000Mi"), + corev1.ResourceMemory: resource.MustParse("10Gi"), + }, + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("2000Mi"), + corev1.ResourceMemory: resource.MustParse("5Gi"), + }, + } + resources = util.GetResourceRequirements(resources, defaultResources) + job := &batchv1.Job{ ObjectMeta: metav1.ObjectMeta{ Name: jobName, @@ -60,6 +74,7 @@ func Job( Env: env.MergeEnvs([]corev1.EnvVar{}, envVars), VolumeMounts: GetVolumeMounts(mountCerts, mountSSHKey, instance), SecurityContext: &securityContext, + Resources: resources, EnvFrom: []corev1.EnvFromSource{ { ConfigMapRef: &corev1.ConfigMapEnvSource{ diff --git a/pkg/tobiko/job.go b/pkg/tobiko/job.go index 21c2f97..cbbfcd0 100644 --- a/pkg/tobiko/job.go +++ b/pkg/tobiko/job.go @@ -7,6 +7,7 @@ import ( util "github.com/openstack-k8s-operators/test-operator/pkg/util" batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) @@ -23,6 +24,7 @@ func Job( envVars map[string]env.Setter, containerImage string, privileged bool, + resources corev1.ResourceRequirements, ) *batchv1.Job { runAsUser := int64(42495) @@ -33,6 +35,18 @@ func Job( capabilities := []corev1.Capability{"NET_ADMIN", "NET_RAW"} securityContext := util.GetSecurityContext(runAsUser, capabilities, privileged) + defaultResources := corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("4000Mi"), + corev1.ResourceMemory: resource.MustParse("10Gi"), + }, + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("2000Mi"), + corev1.ResourceMemory: resource.MustParse("5Gi"), + }, + } + resources = util.GetResourceRequirements(resources, defaultResources) + // Note(lpiwowar): Once the webhook is implemented move all the logic of merging // the workflows there. job := &batchv1.Job{ @@ -68,6 +82,7 @@ func Job( Env: env.MergeEnvs([]corev1.EnvVar{}, envVars), VolumeMounts: GetVolumeMounts(mountCerts, mountKeys, mountKubeconfig, instance), SecurityContext: &securityContext, + Resources: resources, }, }, Volumes: GetVolumes( diff --git a/pkg/util/common.go b/pkg/util/common.go index 02c294a..014e5c7 100644 --- a/pkg/util/common.go +++ b/pkg/util/common.go @@ -36,3 +36,22 @@ func GetSecurityContext( return securityContext } + +func GetResourceRequirements( + resourceRequirements corev1.ResourceRequirements, + defaultResourceRequirements corev1.ResourceRequirements, +) corev1.ResourceRequirements { + if resourceRequirements.Limits == nil { + resourceRequirements.Limits = defaultResourceRequirements.Limits + } + + if resourceRequirements.Requests == nil { + resourceRequirements.Requests = defaultResourceRequirements.Requests + } + + if resourceRequirements.Claims == nil { + resourceRequirements.Claims = defaultResourceRequirements.Claims + } + + return resourceRequirements +}