From 51408e5e66bda5ddf528431b94f41be22e954ed6 Mon Sep 17 00:00:00 2001 From: Akshay Chitneni Date: Mon, 28 Oct 2024 10:18:21 -0700 Subject: [PATCH] Adding cel validation on trainingRuntime CRD --- .../kubeflow.org_clustertrainingruntimes.yaml | 13 ++ .../crds/kubeflow.org_trainingruntimes.yaml | 13 ++ .../v2alpha1/trainingruntime_types.go | 7 + .../trainingruntime_controller_test.go | 132 ++++++++++++++++++ .../controller.v2/trainjob_controller_test.go | 2 + 5 files changed, 167 insertions(+) create mode 100644 test/integration/controller.v2/trainingruntime_controller_test.go diff --git a/manifests/v2/base/crds/kubeflow.org_clustertrainingruntimes.yaml b/manifests/v2/base/crds/kubeflow.org_clustertrainingruntimes.yaml index f28b08b6c5..ba76ca7c6c 100644 --- a/manifests/v2/base/crds/kubeflow.org_clustertrainingruntimes.yaml +++ b/manifests/v2/base/crds/kubeflow.org_clustertrainingruntimes.yaml @@ -53,6 +53,7 @@ spec: description: Directory where SSH keys are mounted. type: string mpiImplementation: + default: OpenMPI description: |- Implementation name for the MPI to create the appropriate hostfile. Defaults to OpenMPI. @@ -64,6 +65,7 @@ spec: format: int32 type: integer runLauncherAsNode: + default: false description: |- Whether to run training process on the launcher Job. Defaults to false. @@ -576,6 +578,7 @@ spec: type: integer type: object numProcPerNode: + default: auto description: |- Number of processes per node. This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI. @@ -583,8 +586,17 @@ spec: TODO (andreyvelich): Add kubebuilder validation. Defaults to `auto`. type: string + x-kubernetes-validations: + - message: NumProcPerNode must be auto,cpu,gpu strings or + int value + rule: self in ['auto', 'cpu', 'gpu'] || type(self) == int type: object type: object + x-kubernetes-validations: + - message: numNodes should not be set if torch.elasticPolicy is configured + rule: '!(has(self.numNodes) && (has(self.torch) && has(self.torch.elasticPolicy)))' + - message: Only one of the policy can be configured + rule: '!(has(self.torch) && has(self.mpi))' podGroupPolicy: description: Configuration for the PodGroup to enable gang-scheduling via supported plugins. @@ -594,6 +606,7 @@ spec: for gang-scheduling. properties: scheduleTimeoutSeconds: + default: 60 description: |- Time threshold to schedule PodGroup for gang-scheduling. If the scheduling timeout is equal to 0, the default value is used. diff --git a/manifests/v2/base/crds/kubeflow.org_trainingruntimes.yaml b/manifests/v2/base/crds/kubeflow.org_trainingruntimes.yaml index d5769ed073..1f2b6beb50 100644 --- a/manifests/v2/base/crds/kubeflow.org_trainingruntimes.yaml +++ b/manifests/v2/base/crds/kubeflow.org_trainingruntimes.yaml @@ -53,6 +53,7 @@ spec: description: Directory where SSH keys are mounted. type: string mpiImplementation: + default: OpenMPI description: |- Implementation name for the MPI to create the appropriate hostfile. Defaults to OpenMPI. @@ -64,6 +65,7 @@ spec: format: int32 type: integer runLauncherAsNode: + default: false description: |- Whether to run training process on the launcher Job. Defaults to false. @@ -576,6 +578,7 @@ spec: type: integer type: object numProcPerNode: + default: auto description: |- Number of processes per node. This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI. @@ -583,8 +586,17 @@ spec: TODO (andreyvelich): Add kubebuilder validation. Defaults to `auto`. type: string + x-kubernetes-validations: + - message: NumProcPerNode must be auto,cpu,gpu strings or + int value + rule: self in ['auto', 'cpu', 'gpu'] || type(self) == int type: object type: object + x-kubernetes-validations: + - message: numNodes should not be set if torch.elasticPolicy is configured + rule: '!(has(self.numNodes) && (has(self.torch) && has(self.torch.elasticPolicy)))' + - message: Only one of the policy can be configured + rule: '!(has(self.torch) && has(self.mpi))' podGroupPolicy: description: Configuration for the PodGroup to enable gang-scheduling via supported plugins. @@ -594,6 +606,7 @@ spec: for gang-scheduling. properties: scheduleTimeoutSeconds: + default: 60 description: |- Time threshold to schedule PodGroup for gang-scheduling. If the scheduling timeout is equal to 0, the default value is used. diff --git a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go index c25e2c49b3..c042074497 100644 --- a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go +++ b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go @@ -142,10 +142,13 @@ type CoschedulingPodGroupPolicySource struct { // Time threshold to schedule PodGroup for gang-scheduling. // If the scheduling timeout is equal to 0, the default value is used. // Defaults to 60 seconds. + // +kubebuilder:default=60 ScheduleTimeoutSeconds *int32 `json:"scheduleTimeoutSeconds,omitempty"` } // MLPolicy represents configuration for the model trining with ML-specific parameters. +// +kubebuilder:validation:XValidation:rule="!(has(self.numNodes) && (has(self.torch) && has(self.torch.elasticPolicy)))", message="numNodes should not be set if torch.elasticPolicy is configured" +// +kubebuilder:validation:XValidation:rule="!(has(self.torch) && has(self.mpi))", message="Only one of the policy can be configured" type MLPolicy struct { // Number of training nodes. // Defaults to 1. @@ -173,6 +176,8 @@ type TorchMLPolicySource struct { // Supported values: `auto`, `cpu`, `gpu`, or int value. // TODO (andreyvelich): Add kubebuilder validation. // Defaults to `auto`. + // +kubebuilder:default="auto" + // +kubebuilder:validation:XValidation:rule="self in ['auto', 'cpu', 'gpu'] || type(self) == int", message="NumProcPerNode must be auto,cpu,gpu strings or int value" NumProcPerNode *string `json:"numProcPerNode,omitempty"` // Elastic policy for the PyTorch training. @@ -209,6 +214,7 @@ type MPIMLPolicySource struct { // Implementation name for the MPI to create the appropriate hostfile. // Defaults to OpenMPI. + // +kubebuilder:default="OpenMPI" MPIImplementation *MPIImplementation `json:"mpiImplementation,omitempty"` // Directory where SSH keys are mounted. @@ -216,6 +222,7 @@ type MPIMLPolicySource struct { // Whether to run training process on the launcher Job. // Defaults to false. + // +kubebuilder:default=false RunLauncherAsNode *bool `json:"runLauncherAsNode,omitempty"` } diff --git a/test/integration/controller.v2/trainingruntime_controller_test.go b/test/integration/controller.v2/trainingruntime_controller_test.go new file mode 100644 index 0000000000..a656ceaf09 --- /dev/null +++ b/test/integration/controller.v2/trainingruntime_controller_test.go @@ -0,0 +1,132 @@ +/* +Copyright 2024 The Kubeflow Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controllerv2 + +import ( + kubeflowv2 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1" + testingutil "github.com/kubeflow/training-operator/pkg/util.v2/testing" + "github.com/kubeflow/training-operator/test/integration/framework" + "github.com/kubeflow/training-operator/test/util" + "github.com/onsi/ginkgo/v2" + "github.com/onsi/gomega" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/utils/ptr" + "sigs.k8s.io/controller-runtime/pkg/client" +) + +var _ = ginkgo.Describe("TrainingRuntime marker validations and defaulting", ginkgo.Ordered, func() { + var ns *corev1.Namespace + + ginkgo.BeforeAll(func() { + fwk = &framework.Framework{} + cfg = fwk.Init() + ctx, k8sClient = fwk.RunManager(cfg) + }) + ginkgo.AfterAll(func() { + fwk.Teardown() + }) + + ginkgo.BeforeEach(func() { + ns = &corev1.Namespace{ + TypeMeta: metav1.TypeMeta{ + APIVersion: corev1.SchemeGroupVersion.String(), + Kind: "Namespace", + }, + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "training-runtime-marker-", + }, + } + gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed()) + }) + ginkgo.AfterEach(func() { + gomega.Expect(k8sClient.DeleteAllOf(ctx, &kubeflowv2.TrainingRuntime{}, client.InNamespace(ns.Name))).Should(gomega.Succeed()) + gomega.Expect(k8sClient.DeleteAllOf(ctx, &kubeflowv2.ClusterTrainingRuntime{})).Should(gomega.Succeed()) + }) + + ginkgo.When("Creating TrainingRuntime", func() { + ginkgo.DescribeTable("Validate TrainingRuntime on creation", func(trainingRuntime func() *kubeflowv2.TrainingRuntime, errorMatcher gomega.OmegaMatcher) { + gomega.Expect(k8sClient.Create(ctx, trainingRuntime())).Should(errorMatcher) + }, + ginkgo.Entry("Should succeed to create trainingRuntime", + func() *kubeflowv2.TrainingRuntime { + return testingutil.MakeTrainingRuntimeWrapper(ns.Name, "runtime"). + Obj() + }, + gomega.Succeed()), + ginkgo.Entry("Should succeed to create clusterTrainingRuntime", + func() *kubeflowv2.TrainingRuntime { + return testingutil.MakeTrainingRuntimeWrapper(ns.Name, "runtime"). + Obj() + }, + gomega.Succeed()), + ginkgo.Entry("Should fail to create trainingRuntime with both MPI and Torch runtimes", + func() *kubeflowv2.TrainingRuntime { + runtime := testingutil.MakeTrainingRuntimeWrapper(ns.Name, "runtime").Obj() + runtime.Spec.MLPolicy = &kubeflowv2.MLPolicy{ + MLPolicySource: kubeflowv2.MLPolicySource{ + Torch: &kubeflowv2.TorchMLPolicySource{}, + MPI: &kubeflowv2.MPIMLPolicySource{}, + }, + } + return runtime + }, + testingutil.BeInvalidError()), + ginkgo.Entry("Should fail to create trainingRuntime with minNodes and torch.elasticPolicy", + func() *kubeflowv2.TrainingRuntime { + runtime := testingutil.MakeTrainingRuntimeWrapper(ns.Name, "runtime").Obj() + runtime.Spec.MLPolicy = &kubeflowv2.MLPolicy{ + NumNodes: ptr.To(int32(2)), + MLPolicySource: kubeflowv2.MLPolicySource{ + Torch: &kubeflowv2.TorchMLPolicySource{ + ElasticPolicy: &kubeflowv2.TorchElasticPolicy{}, + }, + }, + } + return runtime + }, + testingutil.BeInvalidError()), + ) + ginkgo.DescribeTable("Defaulting TrainingRuntime on creation", func(trainingRuntime func() *kubeflowv2.TrainingRuntime, wantTrainingRuntime func() *kubeflowv2.TrainingRuntime) { + created := trainingRuntime() + gomega.Expect(k8sClient.Create(ctx, created)).Should(gomega.Succeed()) + gomega.Expect(created).Should(gomega.BeComparableTo(wantTrainingRuntime(), util.IgnoreObjectMetadata)) + }, + ginkgo.Entry("Should succeed to default TorchMLPolicySource.NumProcPerNode=auto", + func() *kubeflowv2.TrainingRuntime { + runtime := testingutil.MakeTrainingRuntimeWrapper(ns.Name, "runtime").Obj() + runtime.Spec.MLPolicy = &kubeflowv2.MLPolicy{ + MLPolicySource: kubeflowv2.MLPolicySource{ + Torch: &kubeflowv2.TorchMLPolicySource{}, + }, + } + return runtime + }, + func() *kubeflowv2.TrainingRuntime { + runtime := testingutil.MakeTrainingRuntimeWrapper(ns.Name, "runtime").Obj() + runtime.Spec.MLPolicy = &kubeflowv2.MLPolicy{ + MLPolicySource: kubeflowv2.MLPolicySource{ + Torch: &kubeflowv2.TorchMLPolicySource{ + NumProcPerNode: ptr.To("auto"), + }, + }, + } + return runtime + }), + ) + }) +}) diff --git a/test/integration/controller.v2/trainjob_controller_test.go b/test/integration/controller.v2/trainjob_controller_test.go index 098ae39c39..cc7f85c158 100644 --- a/test/integration/controller.v2/trainjob_controller_test.go +++ b/test/integration/controller.v2/trainjob_controller_test.go @@ -134,6 +134,7 @@ var _ = ginkgo.Describe("TrainJob controller", ginkgo.Ordered, func() { MinResources(corev1.ResourceList{ corev1.ResourceCPU: resource.MustParse("1500"), }). + SchedulingTimeout(60). ControllerReference(kubeflowv2.SchemeGroupVersion.WithKind(kubeflowv2.TrainJobKind), trainJobKey.Name, string(trainJob.UID)). Obj(), util.IgnoreObjectMetadata)) @@ -189,6 +190,7 @@ var _ = ginkgo.Describe("TrainJob controller", ginkgo.Ordered, func() { MinResources(corev1.ResourceList{ corev1.ResourceCPU: resource.MustParse("1500"), }). + SchedulingTimeout(60). ControllerReference(kubeflowv2.SchemeGroupVersion.WithKind(kubeflowv2.TrainJobKind), trainJobKey.Name, string(trainJob.UID)). Obj(), util.IgnoreObjectMetadata))