diff --git a/cmd/scheduler/main.go b/cmd/scheduler/main.go
index 9085bf02a..32a0dcd67 100644
--- a/cmd/scheduler/main.go
+++ b/cmd/scheduler/main.go
@@ -61,6 +61,7 @@ func init() {
 	rootCmd.Flags().StringVar(&config.NodeSchedulerPolicy, "node-scheduler-policy", policy.NodeSchedulerPolicyBinpack.String(), "node scheduler policy")
 	rootCmd.Flags().StringVar(&config.GPUSchedulerPolicy, "gpu-scheduler-policy", policy.GPUSchedulerPolicySpread.String(), "GPU scheduler policy")
 	rootCmd.Flags().StringVar(&config.MetricsBindAddress, "metrics-bind-address", ":9395", "The TCP address that the scheduler should bind to for serving prometheus metrics(e.g. 127.0.0.1:9395, :9395)")
+	rootCmd.Flags().BoolVar(&config.MemoryUserGBUnit, "memory-gb-unit", false, "set memory resource unit is Gi")
 	rootCmd.PersistentFlags().AddGoFlagSet(device.GlobalFlagSet())
 	rootCmd.AddCommand(version.VersionCmd)
 	rootCmd.Flags().AddGoFlagSet(util.InitKlogFlags())
diff --git a/pkg/device/ascend/ascend310p.go b/pkg/device/ascend/ascend310p.go
index 2c4a0253f..6323d2db2 100644
--- a/pkg/device/ascend/ascend310p.go
+++ b/pkg/device/ascend/ascend310p.go
@@ -246,3 +246,18 @@ func (dev *Ascend310P) GenerateResourceRequests(ctr *corev1.Container) util.Cont
 	}
 	return util.ContainerDeviceRequest{}
 }
+
+// ResourceMemoryUnitConversion multiplies the Ascend310PResourceMemory entry of
+// resources.Limits and resources.Requests by 1000 when it is present, and
+// returns resources. The Limits and Requests maps are modified in place.
+// Quantity.MilliValue is used so decimal inputs scale without float64
+// rounding; values with more than three decimal places round up.
+func (dev *Ascend310P) ResourceMemoryUnitConversion(resources corev1.ResourceRequirements) corev1.ResourceRequirements {
+	if v, ok := resources.Limits[corev1.ResourceName(Ascend310PResourceMemory)]; ok {
+		resources.Limits[corev1.ResourceName(Ascend310PResourceMemory)] = *resource.NewQuantity(v.MilliValue(), resource.BinarySI)
+	}
+	if v, ok := resources.Requests[corev1.ResourceName(Ascend310PResourceMemory)]; ok {
+		resources.Requests[corev1.ResourceName(Ascend310PResourceMemory)] = *resource.NewQuantity(v.MilliValue(), resource.BinarySI)
+	}
+	return resources
+}
diff --git a/pkg/device/ascend/device.go b/pkg/device/ascend/device.go
index 9671b2a5c..4cfc15098 100644
--- a/pkg/device/ascend/device.go
+++ b/pkg/device/ascend/device.go
@@ -231,3 +231,18 @@ func (dev *AscendDevices) GenerateResourceRequests(ctr *corev1.Container) util.C
 	}
 	return util.ContainerDeviceRequest{}
 }
+
+// ResourceMemoryUnitConversion multiplies the AscendResourceMemory entry of
+// resources.Limits and resources.Requests by 1000 when it is present, and
+// returns resources. The Limits and Requests maps are modified in place.
+// Quantity.MilliValue is used so decimal inputs scale without float64
+// rounding; values with more than three decimal places round up.
+func (dev *AscendDevices) ResourceMemoryUnitConversion(resources corev1.ResourceRequirements) corev1.ResourceRequirements {
+	if v, ok := resources.Limits[corev1.ResourceName(AscendResourceMemory)]; ok {
+		resources.Limits[corev1.ResourceName(AscendResourceMemory)] = *resource.NewQuantity(v.MilliValue(), resource.BinarySI)
+	}
+	if v, ok := resources.Requests[corev1.ResourceName(AscendResourceMemory)]; ok {
+		resources.Requests[corev1.ResourceName(AscendResourceMemory)] = *resource.NewQuantity(v.MilliValue(), resource.BinarySI)
+	}
+	return resources
+}
diff --git a/pkg/device/cambricon/device.go b/pkg/device/cambricon/device.go
index 14e9e157f..0f0760f7d 100644
--- a/pkg/device/cambricon/device.go
+++ b/pkg/device/cambricon/device.go
@@ -299,3 +299,9 @@ func (dev *CambriconDevices) PatchAnnotations(annoinput *map[string]string, pd u
 	}
 	return *annoinput
 }
+
+// ResourceMemoryUnitConversion returns resources unchanged; no memory unit
+// conversion is applied for Cambricon devices.
+func (dev *CambriconDevices) ResourceMemoryUnitConversion(resources corev1.ResourceRequirements) corev1.ResourceRequirements {
+	return resources
+}
diff --git a/pkg/device/devices.go b/pkg/device/devices.go
index 7d2359ee8..e452162c3 100644
--- a/pkg/device/devices.go
+++ b/pkg/device/devices.go
@@ -39,6 +39,9 @@ import (
 
 type Devices interface {
 	MutateAdmission(ctr *corev1.Container) (bool, error)
+	// ResourceMemoryUnitConversion returns resources with this device's memory
+	// resource entries rescaled; implementations may return it unchanged.
+	ResourceMemoryUnitConversion(resources corev1.ResourceRequirements) corev1.ResourceRequirements
 	CheckHealth(devType string, n *corev1.Node) (bool, bool)
 	NodeCleanUp(nn string) error
 	GetNodeDevices(n corev1.Node) ([]*api.DeviceInfo, error)
diff --git a/pkg/device/hygon/device.go b/pkg/device/hygon/device.go
index 9b85e4948..4f73a92b0 100644
--- a/pkg/device/hygon/device.go
+++ b/pkg/device/hygon/device.go
@@ -25,6 +25,7 @@ import (
 	"github.com/Project-HAMi/HAMi/pkg/util"
 
 	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
 	"k8s.io/klog/v2"
 )
 
@@ -233,3 +234,18 @@ func (dev *DCUDevices) PatchAnnotations(annoinput *map[string]string, pd util.Po
 	}
 	return *annoinput
 }
+
+// ResourceMemoryUnitConversion multiplies the HygonResourceMemory entry of
+// resources.Limits and resources.Requests by 1000 when it is present, and
+// returns resources. The Limits and Requests maps are modified in place.
+// Quantity.MilliValue is used so decimal inputs scale without float64
+// rounding; values with more than three decimal places round up.
+func (dev *DCUDevices) ResourceMemoryUnitConversion(resources corev1.ResourceRequirements) corev1.ResourceRequirements {
+	if v, ok := resources.Limits[corev1.ResourceName(HygonResourceMemory)]; ok {
+		resources.Limits[corev1.ResourceName(HygonResourceMemory)] = *resource.NewQuantity(v.MilliValue(), resource.BinarySI)
+	}
+	if v, ok := resources.Requests[corev1.ResourceName(HygonResourceMemory)]; ok {
+		resources.Requests[corev1.ResourceName(HygonResourceMemory)] = *resource.NewQuantity(v.MilliValue(), resource.BinarySI)
+	}
+	return resources
+}
diff --git a/pkg/device/iluvatar/device.go b/pkg/device/iluvatar/device.go
index 194ccfda9..5d361772e 100644
--- a/pkg/device/iluvatar/device.go
+++ b/pkg/device/iluvatar/device.go
@@ -213,3 +213,9 @@ func (dev *IluvatarDevices) GenerateResourceRequests(ctr *corev1.Container) util
 	}
 	return util.ContainerDeviceRequest{}
 }
+
+// ResourceMemoryUnitConversion returns resources unchanged; no memory unit
+// conversion is applied for Iluvatar devices.
+func (dev *IluvatarDevices) ResourceMemoryUnitConversion(resources corev1.ResourceRequirements) corev1.ResourceRequirements {
+	return resources
+}
diff --git a/pkg/device/nvidia/device.go b/pkg/device/nvidia/device.go
index 86db3dd1f..62ebd3f80 100644
--- a/pkg/device/nvidia/device.go
+++ b/pkg/device/nvidia/device.go
@@ -319,3 +319,18 @@ func (dev *NvidiaGPUDevices) GenerateResourceRequests(ctr *corev1.Container) uti
 	}
 	return util.ContainerDeviceRequest{}
 }
+
+// ResourceMemoryUnitConversion multiplies the ResourceMem entry of
+// resources.Limits and resources.Requests by 1000 when it is present, and
+// returns resources. The Limits and Requests maps are modified in place.
+// Quantity.MilliValue is used so decimal inputs scale without float64
+// rounding; values with more than three decimal places round up.
+func (dev *NvidiaGPUDevices) ResourceMemoryUnitConversion(resources corev1.ResourceRequirements) corev1.ResourceRequirements {
+	if v, ok := resources.Limits[corev1.ResourceName(ResourceMem)]; ok {
+		resources.Limits[corev1.ResourceName(ResourceMem)] = *resource.NewQuantity(v.MilliValue(), resource.BinarySI)
+	}
+	if v, ok := resources.Requests[corev1.ResourceName(ResourceMem)]; ok {
+		resources.Requests[corev1.ResourceName(ResourceMem)] = *resource.NewQuantity(v.MilliValue(), resource.BinarySI)
+	}
+	return resources
+}
diff --git a/pkg/device/nvidia/device_test.go b/pkg/device/nvidia/device_test.go
index b48fe2668..c800b1906 100644
--- a/pkg/device/nvidia/device_test.go
+++ b/pkg/device/nvidia/device_test.go
@@ -208,6 +208,7 @@ func Test_CheckUUID(t *testing.T) {
 }
 
 func Test_CheckType(t *testing.T) {
+
 	gpuDevices := &NvidiaGPUDevices{}
 	tests := []struct {
 		name string
@@ -275,3 +276,109 @@ func Test_CheckType(t *testing.T) {
 		})
 	}
 }
+
+// Test_ResourceMemoryUnitConversion checks that the gpumem entries of Limits
+// and Requests are multiplied by 1000 and that other resources are untouched.
+func Test_ResourceMemoryUnitConversion(t *testing.T) {
+	tests := []struct {
+		name string
+		args corev1.ResourceRequirements
+		want corev1.ResourceRequirements
+	}{
+		{
+			name: "not set memory field",
+			args: corev1.ResourceRequirements{
+				Limits: corev1.ResourceList{
+					"cpu":            resource.MustParse("1"),
+					"nvidia.com/gpu": resource.MustParse("2"),
+				},
+			},
+			want: corev1.ResourceRequirements{
+				Limits: corev1.ResourceList{
+					"cpu":            resource.MustParse("1"),
+					"nvidia.com/gpu": resource.MustParse("2"),
+				},
+			},
+		},
+		{
+			name: "memory limits set 1Gi",
+			args: corev1.ResourceRequirements{
+				Limits: corev1.ResourceList{
+					"nvidia.com/gpu":    resource.MustParse("2"),
+					"nvidia.com/gpumem": resource.MustParse("1"),
+				},
+			},
+			want: corev1.ResourceRequirements{
+				Limits: corev1.ResourceList{
+					"nvidia.com/gpu":    resource.MustParse("2"),
+					"nvidia.com/gpumem": resource.MustParse("1000"),
+				},
+			},
+		},
+		{
+			name: "memory limits set 0.001Gi",
+			args: corev1.ResourceRequirements{
+				Limits: corev1.ResourceList{
+					"nvidia.com/gpu":    resource.MustParse("2"),
+					"nvidia.com/gpumem": resource.MustParse("0.001"),
+				},
+			},
+			want: corev1.ResourceRequirements{
+				Limits: corev1.ResourceList{
+					"nvidia.com/gpu":    resource.MustParse("2"),
+					"nvidia.com/gpumem": resource.MustParse("1"),
+				},
+			},
+		},
+		{
+			name: "memory limits set 1.005Gi",
+			args: corev1.ResourceRequirements{
+				Limits: corev1.ResourceList{
+					"nvidia.com/gpu":    resource.MustParse("2"),
+					"nvidia.com/gpumem": resource.MustParse("1.005"),
+				},
+			},
+			want: corev1.ResourceRequirements{
+				Limits: corev1.ResourceList{
+					"nvidia.com/gpu":    resource.MustParse("2"),
+					"nvidia.com/gpumem": resource.MustParse("1005"),
+				},
+			},
+		},
+
+		{
+			name: "memory limits and request set 0.001Gi",
+			args: corev1.ResourceRequirements{
+				Limits: corev1.ResourceList{
+					"nvidia.com/gpu":    resource.MustParse("2"),
+					"nvidia.com/gpumem": resource.MustParse("1"),
+				},
+				Requests: corev1.ResourceList{
+					"nvidia.com/gpu":    resource.MustParse("2"),
+					"nvidia.com/gpumem": resource.MustParse("0.001"),
+				},
+			},
+			want: corev1.ResourceRequirements{
+				Limits: corev1.ResourceList{
+					"nvidia.com/gpu":    resource.MustParse("2"),
+					"nvidia.com/gpumem": resource.MustParse("1000"),
+				},
+				Requests: corev1.ResourceList{
+					"nvidia.com/gpu":    resource.MustParse("2"),
+					"nvidia.com/gpumem": resource.MustParse("1"),
+				},
+			},
+		},
+	}
+	gpuDevices := &NvidiaGPUDevices{}
+	// ResourceMem is package-level state; restore it so other tests are unaffected.
+	origResourceMem := ResourceMem
+	ResourceMem = "nvidia.com/gpumem"
+	t.Cleanup(func() { ResourceMem = origResourceMem })
+	for _, test := range tests {
+		t.Run(test.name, func(t *testing.T) {
+			got := gpuDevices.ResourceMemoryUnitConversion(test.args)
+			assert.DeepEqual(t, got, test.want)
+		})
+	}
+}
diff --git a/pkg/scheduler/config/config.go b/pkg/scheduler/config/config.go
index 620a007d6..d030850fd 100644
--- a/pkg/scheduler/config/config.go
+++ b/pkg/scheduler/config/config.go
@@ -30,4 +30,7 @@ var (
 	NodeSchedulerPolicy = policy.NodeSchedulerPolicyBinpack.String()
 	// GPUSchedulerPolicy is config this scheduler GPU to use `binpack` or `spread`. default value is spread.
 	GPUSchedulerPolicy = policy.GPUSchedulerPolicySpread.String()
+	// MemoryUserGBUnit makes the admission webhook run each device's
+	// ResourceMemoryUnitConversion on container resources. Default is false.
+	MemoryUserGBUnit bool
 )
diff --git a/pkg/scheduler/webhook.go b/pkg/scheduler/webhook.go
index dd72727b9..be9084cd0 100644
--- a/pkg/scheduler/webhook.go
+++ b/pkg/scheduler/webhook.go
@@ -71,6 +71,10 @@ func (h *webhook) Handle(_ context.Context, req admission.Request) admission.Res
 			}
 		}
 		for _, val := range device.GetDevices() {
+			// Rescale device memory quantities before MutateAdmission inspects the container.
+			if config.MemoryUserGBUnit {
+				c.Resources = val.ResourceMemoryUnitConversion(c.Resources)
+			}
 			found, err := val.MutateAdmission(c)
 			if err != nil {
 				klog.Errorf("validating pod failed:%s", err.Error())