From 51408e5e66bda5ddf528431b94f41be22e954ed6 Mon Sep 17 00:00:00 2001
From: Akshay Chitneni <achitneni@apple.com>
Date: Mon, 28 Oct 2024 10:18:21 -0700
Subject: [PATCH] Adding cel validation on trainingRuntime CRD

---
 .../kubeflow.org_clustertrainingruntimes.yaml |  13 ++
 .../crds/kubeflow.org_trainingruntimes.yaml   |  13 ++
 .../v2alpha1/trainingruntime_types.go         |   7 +
 .../trainingruntime_controller_test.go        | 132 ++++++++++++++++++
 .../controller.v2/trainjob_controller_test.go |   2 +
 5 files changed, 167 insertions(+)
 create mode 100644 test/integration/controller.v2/trainingruntime_controller_test.go

diff --git a/manifests/v2/base/crds/kubeflow.org_clustertrainingruntimes.yaml b/manifests/v2/base/crds/kubeflow.org_clustertrainingruntimes.yaml
index f28b08b6c5..ba76ca7c6c 100644
--- a/manifests/v2/base/crds/kubeflow.org_clustertrainingruntimes.yaml
+++ b/manifests/v2/base/crds/kubeflow.org_clustertrainingruntimes.yaml
@@ -53,6 +53,7 @@ spec:
                         description: Directory where SSH keys are mounted.
                         type: string
                       mpiImplementation:
+                        default: OpenMPI
                         description: |-
                           Implementation name for the MPI to create the appropriate hostfile.
                           Defaults to OpenMPI.
@@ -64,6 +65,7 @@ spec:
                         format: int32
                         type: integer
                       runLauncherAsNode:
+                        default: false
                         description: |-
                           Whether to run training process on the launcher Job.
                           Defaults to false.
@@ -576,6 +578,7 @@ spec:
                             type: integer
                         type: object
                       numProcPerNode:
+                        default: auto
                         description: |-
                           Number of processes per node.
                           This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI.
@@ -583,8 +586,17 @@ spec:
                           TODO (andreyvelich): Add kubebuilder validation.
                           Defaults to `auto`.
                         type: string
+                        x-kubernetes-validations:
+                        - message: NumProcPerNode must be auto,cpu,gpu strings or
+                            int value
+                          rule: self in ['auto', 'cpu', 'gpu'] || type(self) == int
                     type: object
                 type: object
+                x-kubernetes-validations:
+                - message: numNodes should not be set if torch.elasticPolicy is configured
+                  rule: '!(has(self.numNodes) && (has(self.torch) && has(self.torch.elasticPolicy)))'
+                - message: Only one of the policy can be configured
+                  rule: '!(has(self.torch) && has(self.mpi))'
               podGroupPolicy:
                 description: Configuration for the PodGroup to enable gang-scheduling
                   via supported plugins.
@@ -594,6 +606,7 @@ spec:
                       for gang-scheduling.
                     properties:
                       scheduleTimeoutSeconds:
+                        default: 60
                         description: |-
                           Time threshold to schedule PodGroup for gang-scheduling.
                           If the scheduling timeout is equal to 0, the default value is used.
diff --git a/manifests/v2/base/crds/kubeflow.org_trainingruntimes.yaml b/manifests/v2/base/crds/kubeflow.org_trainingruntimes.yaml
index d5769ed073..1f2b6beb50 100644
--- a/manifests/v2/base/crds/kubeflow.org_trainingruntimes.yaml
+++ b/manifests/v2/base/crds/kubeflow.org_trainingruntimes.yaml
@@ -53,6 +53,7 @@ spec:
                         description: Directory where SSH keys are mounted.
                         type: string
                       mpiImplementation:
+                        default: OpenMPI
                         description: |-
                           Implementation name for the MPI to create the appropriate hostfile.
                           Defaults to OpenMPI.
@@ -64,6 +65,7 @@ spec:
                         format: int32
                         type: integer
                       runLauncherAsNode:
+                        default: false
                         description: |-
                           Whether to run training process on the launcher Job.
                           Defaults to false.
@@ -576,6 +578,7 @@ spec:
                             type: integer
                         type: object
                       numProcPerNode:
+                        default: auto
                         description: |-
                           Number of processes per node.
                           This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI.
@@ -583,8 +586,17 @@ spec:
                           TODO (andreyvelich): Add kubebuilder validation.
                           Defaults to `auto`.
                         type: string
+                        x-kubernetes-validations:
+                        - message: NumProcPerNode must be auto,cpu,gpu strings or
+                            int value
+                          rule: self in ['auto', 'cpu', 'gpu'] || type(self) == int
                     type: object
                 type: object
+                x-kubernetes-validations:
+                - message: numNodes should not be set if torch.elasticPolicy is configured
+                  rule: '!(has(self.numNodes) && (has(self.torch) && has(self.torch.elasticPolicy)))'
+                - message: Only one of the policy can be configured
+                  rule: '!(has(self.torch) && has(self.mpi))'
               podGroupPolicy:
                 description: Configuration for the PodGroup to enable gang-scheduling
                   via supported plugins.
@@ -594,6 +606,7 @@ spec:
                       for gang-scheduling.
                     properties:
                       scheduleTimeoutSeconds:
+                        default: 60
                         description: |-
                           Time threshold to schedule PodGroup for gang-scheduling.
                           If the scheduling timeout is equal to 0, the default value is used.
diff --git a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go
index c25e2c49b3..c042074497 100644
--- a/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go
+++ b/pkg/apis/kubeflow.org/v2alpha1/trainingruntime_types.go
@@ -142,10 +142,13 @@ type CoschedulingPodGroupPolicySource struct {
 	// Time threshold to schedule PodGroup for gang-scheduling.
 	// If the scheduling timeout is equal to 0, the default value is used.
 	// Defaults to 60 seconds.
+	// +kubebuilder:default=60
 	ScheduleTimeoutSeconds *int32 `json:"scheduleTimeoutSeconds,omitempty"`
 }
 
 // MLPolicy represents configuration for the model trining with ML-specific parameters.
+// +kubebuilder:validation:XValidation:rule="!(has(self.numNodes) && (has(self.torch) && has(self.torch.elasticPolicy)))", message="numNodes should not be set if torch.elasticPolicy is configured"
+// +kubebuilder:validation:XValidation:rule="!(has(self.torch) && has(self.mpi))", message="Only one of the policy can be configured"
 type MLPolicy struct {
 	// Number of training nodes.
 	// Defaults to 1.
@@ -173,6 +176,8 @@ type TorchMLPolicySource struct {
 	// Supported values: `auto`, `cpu`, `gpu`, or int value.
 	// TODO (andreyvelich): Add kubebuilder validation.
 	// Defaults to `auto`.
+	// +kubebuilder:default="auto"
+	// +kubebuilder:validation:XValidation:rule="self in ['auto', 'cpu', 'gpu'] || type(self) == int", message="NumProcPerNode must be auto,cpu,gpu strings or int value"
 	NumProcPerNode *string `json:"numProcPerNode,omitempty"`
 
 	// Elastic policy for the PyTorch training.
@@ -209,6 +214,7 @@ type MPIMLPolicySource struct {
 
 	// Implementation name for the MPI to create the appropriate hostfile.
 	// Defaults to OpenMPI.
+	// +kubebuilder:default="OpenMPI"
 	MPIImplementation *MPIImplementation `json:"mpiImplementation,omitempty"`
 
 	// Directory where SSH keys are mounted.
@@ -216,6 +222,7 @@ type MPIMLPolicySource struct {
 
 	// Whether to run training process on the launcher Job.
 	// Defaults to false.
+	// +kubebuilder:default=false
 	RunLauncherAsNode *bool `json:"runLauncherAsNode,omitempty"`
 }
 
diff --git a/test/integration/controller.v2/trainingruntime_controller_test.go b/test/integration/controller.v2/trainingruntime_controller_test.go
new file mode 100644
index 0000000000..a656ceaf09
--- /dev/null
+++ b/test/integration/controller.v2/trainingruntime_controller_test.go
@@ -0,0 +1,132 @@
+/*
+Copyright 2024 The Kubeflow Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package controllerv2
+
+import (
+	kubeflowv2 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v2alpha1"
+	testingutil "github.com/kubeflow/training-operator/pkg/util.v2/testing"
+	"github.com/kubeflow/training-operator/test/integration/framework"
+	"github.com/kubeflow/training-operator/test/util"
+	"github.com/onsi/ginkgo/v2"
+	"github.com/onsi/gomega"
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/utils/ptr"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+)
+
+var _ = ginkgo.Describe("TrainingRuntime marker validations and defaulting", ginkgo.Ordered, func() {
+	var ns *corev1.Namespace
+
+	ginkgo.BeforeAll(func() {
+		fwk = &framework.Framework{}
+		cfg = fwk.Init()
+		ctx, k8sClient = fwk.RunManager(cfg)
+	})
+	ginkgo.AfterAll(func() {
+		fwk.Teardown()
+	})
+
+	ginkgo.BeforeEach(func() {
+		ns = &corev1.Namespace{
+			TypeMeta: metav1.TypeMeta{
+				APIVersion: corev1.SchemeGroupVersion.String(),
+				Kind:       "Namespace",
+			},
+			ObjectMeta: metav1.ObjectMeta{
+				GenerateName: "training-runtime-marker-",
+			},
+		}
+		gomega.Expect(k8sClient.Create(ctx, ns)).To(gomega.Succeed())
+	})
+	ginkgo.AfterEach(func() {
+		gomega.Expect(k8sClient.DeleteAllOf(ctx, &kubeflowv2.TrainingRuntime{}, client.InNamespace(ns.Name))).Should(gomega.Succeed())
+		gomega.Expect(k8sClient.DeleteAllOf(ctx, &kubeflowv2.ClusterTrainingRuntime{})).Should(gomega.Succeed())
+	})
+
+	ginkgo.When("Creating TrainingRuntime", func() {
+		ginkgo.DescribeTable("Validate TrainingRuntime on creation", func(trainingRuntime func() *kubeflowv2.TrainingRuntime, errorMatcher gomega.OmegaMatcher) {
+			gomega.Expect(k8sClient.Create(ctx, trainingRuntime())).Should(errorMatcher)
+		},
+			ginkgo.Entry("Should succeed to create trainingRuntime",
+				func() *kubeflowv2.TrainingRuntime {
+					return testingutil.MakeTrainingRuntimeWrapper(ns.Name, "runtime").
+						Obj()
+				},
+				gomega.Succeed()),
+			ginkgo.Entry("Should succeed to create clusterTrainingRuntime",
+				func() *kubeflowv2.TrainingRuntime {
+					return testingutil.MakeTrainingRuntimeWrapper(ns.Name, "runtime").
+						Obj()
+				},
+				gomega.Succeed()),
+			ginkgo.Entry("Should fail to create trainingRuntime with both MPI and Torch runtimes",
+				func() *kubeflowv2.TrainingRuntime {
+					runtime := testingutil.MakeTrainingRuntimeWrapper(ns.Name, "runtime").Obj()
+					runtime.Spec.MLPolicy = &kubeflowv2.MLPolicy{
+						MLPolicySource: kubeflowv2.MLPolicySource{
+							Torch: &kubeflowv2.TorchMLPolicySource{},
+							MPI:   &kubeflowv2.MPIMLPolicySource{},
+						},
+					}
+					return runtime
+				},
+				testingutil.BeInvalidError()),
+			ginkgo.Entry("Should fail to create trainingRuntime with minNodes and torch.elasticPolicy",
+				func() *kubeflowv2.TrainingRuntime {
+					runtime := testingutil.MakeTrainingRuntimeWrapper(ns.Name, "runtime").Obj()
+					runtime.Spec.MLPolicy = &kubeflowv2.MLPolicy{
+						NumNodes: ptr.To(int32(2)),
+						MLPolicySource: kubeflowv2.MLPolicySource{
+							Torch: &kubeflowv2.TorchMLPolicySource{
+								ElasticPolicy: &kubeflowv2.TorchElasticPolicy{},
+							},
+						},
+					}
+					return runtime
+				},
+				testingutil.BeInvalidError()),
+		)
+		ginkgo.DescribeTable("Defaulting TrainingRuntime on creation", func(trainingRuntime func() *kubeflowv2.TrainingRuntime, wantTrainingRuntime func() *kubeflowv2.TrainingRuntime) {
+			created := trainingRuntime()
+			gomega.Expect(k8sClient.Create(ctx, created)).Should(gomega.Succeed())
+			gomega.Expect(created).Should(gomega.BeComparableTo(wantTrainingRuntime(), util.IgnoreObjectMetadata))
+		},
+			ginkgo.Entry("Should succeed to default TorchMLPolicySource.NumProcPerNode=auto",
+				func() *kubeflowv2.TrainingRuntime {
+					runtime := testingutil.MakeTrainingRuntimeWrapper(ns.Name, "runtime").Obj()
+					runtime.Spec.MLPolicy = &kubeflowv2.MLPolicy{
+						MLPolicySource: kubeflowv2.MLPolicySource{
+							Torch: &kubeflowv2.TorchMLPolicySource{},
+						},
+					}
+					return runtime
+				},
+				func() *kubeflowv2.TrainingRuntime {
+					runtime := testingutil.MakeTrainingRuntimeWrapper(ns.Name, "runtime").Obj()
+					runtime.Spec.MLPolicy = &kubeflowv2.MLPolicy{
+						MLPolicySource: kubeflowv2.MLPolicySource{
+							Torch: &kubeflowv2.TorchMLPolicySource{
+								NumProcPerNode: ptr.To("auto"),
+							},
+						},
+					}
+					return runtime
+				}),
+		)
+	})
+})
diff --git a/test/integration/controller.v2/trainjob_controller_test.go b/test/integration/controller.v2/trainjob_controller_test.go
index 098ae39c39..cc7f85c158 100644
--- a/test/integration/controller.v2/trainjob_controller_test.go
+++ b/test/integration/controller.v2/trainjob_controller_test.go
@@ -134,6 +134,7 @@ var _ = ginkgo.Describe("TrainJob controller", ginkgo.Ordered, func() {
 						MinResources(corev1.ResourceList{
 							corev1.ResourceCPU: resource.MustParse("1500"),
 						}).
+						SchedulingTimeout(60).
 						ControllerReference(kubeflowv2.SchemeGroupVersion.WithKind(kubeflowv2.TrainJobKind), trainJobKey.Name, string(trainJob.UID)).
 						Obj(),
 					util.IgnoreObjectMetadata))
@@ -189,6 +190,7 @@ var _ = ginkgo.Describe("TrainJob controller", ginkgo.Ordered, func() {
 						MinResources(corev1.ResourceList{
 							corev1.ResourceCPU: resource.MustParse("1500"),
 						}).
+						SchedulingTimeout(60).
 						ControllerReference(kubeflowv2.SchemeGroupVersion.WithKind(kubeflowv2.TrainJobKind), trainJobKey.Name, string(trainJob.UID)).
 						Obj(),
 					util.IgnoreObjectMetadata))